1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/errno.h> 31 #include <sys/debug.h> 32 #include <sys/time.h> 33 #include <sys/sysmacros.h> 34 #include <sys/systm.h> 35 #include <sys/user.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strlog.h> 39 #include <sys/strsubr.h> 40 #include <sys/cmn_err.h> 41 #include <sys/cpu.h> 42 #include <sys/kmem.h> 43 #include <sys/conf.h> 44 #include <sys/ddi.h> 45 #include <sys/sunddi.h> 46 #include <sys/ksynch.h> 47 #include <sys/stat.h> 48 #include <sys/kstat.h> 49 #include <sys/vtrace.h> 50 #include <sys/strsun.h> 51 #include <sys/dlpi.h> 52 #include <sys/ethernet.h> 53 #include <net/if.h> 54 #include <sys/varargs.h> 55 #include <sys/machsystm.h> 56 #include <sys/modctl.h> 57 #include <sys/modhash.h> 58 #include <sys/mac.h> 59 #include <sys/mac_ether.h> 60 #include <sys/taskq.h> 61 #include <sys/note.h> 62 #include <sys/mach_descrip.h> 63 #include <sys/mac.h> 64 #include <sys/mdeg.h> 65 #include <sys/ldc.h> 66 #include <sys/vsw_fdb.h> 67 #include <sys/vsw.h> 68 #include <sys/vio_mailbox.h> 69 #include <sys/vnet_mailbox.h> 70 #include <sys/vnet_common.h> 71 #include <sys/vio_util.h> 72 #include <sys/sdt.h> 73 74 /* 75 * Function prototypes. 76 */ 77 static int vsw_attach(dev_info_t *, ddi_attach_cmd_t); 78 static int vsw_detach(dev_info_t *, ddi_detach_cmd_t); 79 static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 80 static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *); 81 static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *); 82 static int vsw_get_physaddr(vsw_t *); 83 static int vsw_setup_switching(vsw_t *); 84 static int vsw_setup_layer2(vsw_t *); 85 static int vsw_setup_layer3(vsw_t *); 86 87 /* MAC Ring table functions. */ 88 static void vsw_mac_ring_tbl_init(vsw_t *vswp); 89 static void vsw_mac_ring_tbl_destroy(vsw_t *vswp); 90 static void vsw_queue_worker(vsw_mac_ring_t *rrp); 91 static void vsw_queue_stop(vsw_queue_t *vqp); 92 static vsw_queue_t *vsw_queue_create(); 93 static void vsw_queue_destroy(vsw_queue_t *vqp); 94 95 /* MAC layer routines */ 96 static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg, 97 mac_resource_t *mrp); 98 static int vsw_get_hw_maddr(vsw_t *); 99 static int vsw_set_hw(vsw_t *, vsw_port_t *, int); 100 static int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *); 101 static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int); 102 static int vsw_unset_hw(vsw_t *, vsw_port_t *, int); 103 static int vsw_unset_hw_addr(vsw_t *, int); 104 static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int); 105 static void vsw_reconfig_hw(vsw_t *); 106 static int vsw_prog_if(vsw_t *); 107 static int vsw_prog_ports(vsw_t *); 108 static int vsw_mac_attach(vsw_t *vswp); 109 static void vsw_mac_detach(vsw_t *vswp); 110 111 static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *); 112 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *); 113 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); 114 static int vsw_mac_register(vsw_t *); 115 static int vsw_mac_unregister(vsw_t *); 116 static int vsw_m_stat(void *, uint_t, uint64_t *); 117 static void vsw_m_stop(void *arg); 118 static int vsw_m_start(void *arg); 119 static int vsw_m_unicst(void *arg, const uint8_t *); 120 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *); 121 static int vsw_m_promisc(void *arg, boolean_t); 122 static mblk_t *vsw_m_tx(void *arg, mblk_t *); 123 124 /* MDEG routines */ 125 static int vsw_mdeg_register(vsw_t *vswp); 126 static void vsw_mdeg_unregister(vsw_t *vswp); 127 static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *); 128 static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *); 129 static void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t); 130 static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t); 131 132 /* Port add/deletion routines */ 133 static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node); 134 static int vsw_port_attach(vsw_t *vswp, int p_instance, 135 uint64_t *ldcids, int nids, struct ether_addr *macaddr); 136 static int vsw_detach_ports(vsw_t *vswp); 137 static int vsw_port_detach(vsw_t *vswp, int p_instance); 138 static int vsw_port_delete(vsw_port_t *port); 139 static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id); 140 static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id); 141 static int vsw_init_ldcs(vsw_port_t *port); 142 static int vsw_uninit_ldcs(vsw_port_t *port); 143 static int vsw_ldc_init(vsw_ldc_t *ldcp); 144 static int vsw_ldc_uninit(vsw_ldc_t *ldcp); 145 static int vsw_drain_ldcs(vsw_port_t *port); 146 static int vsw_drain_port_taskq(vsw_port_t *port); 147 static void vsw_marker_task(void *); 148 static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); 149 static int vsw_plist_del_node(vsw_t *, vsw_port_t *port); 150 151 /* Interrupt routines */ 152 static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg); 153 154 /* Handshake routines */ 155 static void vsw_ldc_reinit(vsw_ldc_t *); 156 static void vsw_process_conn_evt(vsw_ldc_t *, uint16_t); 157 static void vsw_conn_task(void *); 158 static int vsw_check_flag(vsw_ldc_t *, int, uint64_t); 159 static void vsw_next_milestone(vsw_ldc_t *); 160 static int vsw_supported_version(vio_ver_msg_t *); 161 162 /* Data processing routines */ 163 static void vsw_process_pkt(void *); 164 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t); 165 static void vsw_process_ctrl_pkt(void *); 166 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *); 167 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *); 168 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *); 169 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *); 170 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *); 171 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *); 172 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t); 173 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *); 174 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *); 175 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *); 176 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t); 177 178 /* Switching/data transmit routines */ 179 static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 180 vsw_port_t *port, mac_resource_handle_t); 181 static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 182 vsw_port_t *port, mac_resource_handle_t); 183 static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, 184 vsw_port_t *port); 185 static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, 186 vsw_port_t *port); 187 static int vsw_portsend(vsw_port_t *, mblk_t *); 188 static int vsw_dringsend(vsw_ldc_t *, mblk_t *); 189 static int vsw_descrsend(vsw_ldc_t *, mblk_t *); 190 191 /* Packet creation routines */ 192 static void vsw_send_ver(void *); 193 static void vsw_send_attr(vsw_ldc_t *); 194 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *); 195 static void vsw_send_dring_info(vsw_ldc_t *); 196 static void vsw_send_rdx(vsw_ldc_t *); 197 198 static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t); 199 200 /* Forwarding database (FDB) routines */ 201 static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port); 202 static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port); 203 static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *); 204 static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *); 205 static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *); 206 static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *); 207 static void vsw_del_addr(uint8_t, void *, uint64_t); 208 static void vsw_del_mcst_port(vsw_port_t *); 209 static void vsw_del_mcst_vsw(vsw_t *); 210 211 /* Dring routines */ 212 static dring_info_t *vsw_create_dring(vsw_ldc_t *); 213 static void vsw_create_privring(vsw_ldc_t *); 214 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp); 215 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **, 216 int *); 217 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t); 218 219 static void vsw_set_lane_attr(vsw_t *, lane_t *); 220 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *); 221 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg); 222 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *); 223 static int vsw_check_dring_info(vio_dring_reg_msg_t *); 224 225 /* Misc support routines */ 226 static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf); 227 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); 228 static int vsw_free_ring(dring_info_t *); 229 230 /* Debugging routines */ 231 static void dump_flags(uint64_t); 232 static void display_state(void); 233 static void display_lane(lane_t *); 234 static void display_ring(dring_info_t *); 235 236 int vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */ 237 int vsw_wretries = 100; /* # of write attempts */ 238 int vsw_chain_len = 150; /* max # of mblks in msg chain */ 239 int vsw_desc_delay = 0; /* delay in us */ 240 int vsw_read_attempts = 5; /* # of reads of descriptor */ 241 242 uint32_t vsw_mblk_size = VSW_MBLK_SIZE; 243 uint32_t vsw_num_mblks = VSW_NUM_MBLKS; 244 245 static mac_callbacks_t vsw_m_callbacks = { 246 0, 247 vsw_m_stat, 248 vsw_m_start, 249 vsw_m_stop, 250 vsw_m_promisc, 251 vsw_m_multicst, 252 vsw_m_unicst, 253 vsw_m_tx, 254 NULL, 255 NULL, 256 NULL 257 }; 258 259 static struct cb_ops vsw_cb_ops = { 260 nulldev, /* cb_open */ 261 nulldev, /* cb_close */ 262 nodev, /* cb_strategy */ 263 nodev, /* cb_print */ 264 nodev, /* cb_dump */ 265 nodev, /* cb_read */ 266 nodev, /* cb_write */ 267 nodev, /* cb_ioctl */ 268 nodev, /* cb_devmap */ 269 nodev, /* cb_mmap */ 270 nodev, /* cb_segmap */ 271 nochpoll, /* cb_chpoll */ 272 ddi_prop_op, /* cb_prop_op */ 273 NULL, /* cb_stream */ 274 D_MP, /* cb_flag */ 275 CB_REV, /* rev */ 276 nodev, /* int (*cb_aread)() */ 277 nodev /* int (*cb_awrite)() */ 278 }; 279 280 static struct dev_ops vsw_ops = { 281 DEVO_REV, /* devo_rev */ 282 0, /* devo_refcnt */ 283 vsw_getinfo, /* devo_getinfo */ 284 nulldev, /* devo_identify */ 285 nulldev, /* devo_probe */ 286 vsw_attach, /* devo_attach */ 287 vsw_detach, /* devo_detach */ 288 nodev, /* devo_reset */ 289 &vsw_cb_ops, /* devo_cb_ops */ 290 (struct bus_ops *)NULL, /* devo_bus_ops */ 291 ddi_power /* devo_power */ 292 }; 293 294 extern struct mod_ops mod_driverops; 295 static struct modldrv vswmodldrv = { 296 &mod_driverops, 297 "sun4v Virtual Switch %I%", 298 &vsw_ops, 299 }; 300 301 #define LDC_ENTER_LOCK(ldcp) \ 302 mutex_enter(&((ldcp)->ldc_cblock));\ 303 mutex_enter(&((ldcp)->ldc_txlock)); 304 #define LDC_EXIT_LOCK(ldcp) \ 305 mutex_exit(&((ldcp)->ldc_txlock));\ 306 mutex_exit(&((ldcp)->ldc_cblock)); 307 308 /* Driver soft state ptr */ 309 static void *vsw_state; 310 311 /* 312 * Linked list of "vsw_t" structures - one per instance. 313 */ 314 vsw_t *vsw_head = NULL; 315 krwlock_t vsw_rw; 316 317 /* 318 * Property names 319 */ 320 static char vdev_propname[] = "virtual-device"; 321 static char vsw_propname[] = "virtual-network-switch"; 322 static char physdev_propname[] = "vsw-phys-dev"; 323 static char smode_propname[] = "vsw-switch-mode"; 324 static char macaddr_propname[] = "local-mac-address"; 325 static char remaddr_propname[] = "remote-mac-address"; 326 static char ldcids_propname[] = "ldc-ids"; 327 static char chan_propname[] = "channel-endpoint"; 328 static char id_propname[] = "id"; 329 static char reg_propname[] = "reg"; 330 331 /* supported versions */ 332 static ver_sup_t vsw_versions[] = { {1, 0} }; 333 334 /* 335 * Matching criteria passed to the MDEG to register interest 336 * in changes to 'virtual-device-port' nodes identified by their 337 * 'id' property. 338 */ 339 static md_prop_match_t vport_prop_match[] = { 340 { MDET_PROP_VAL, "id" }, 341 { MDET_LIST_END, NULL } 342 }; 343 344 static mdeg_node_match_t vport_match = { "virtual-device-port", 345 vport_prop_match }; 346 347 /* 348 * Matching criteria passed to the MDEG to register interest 349 * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified 350 * by their 'name' and 'cfg-handle' properties. 351 */ 352 static md_prop_match_t vdev_prop_match[] = { 353 { MDET_PROP_STR, "name" }, 354 { MDET_PROP_VAL, "cfg-handle" }, 355 { MDET_LIST_END, NULL } 356 }; 357 358 static mdeg_node_match_t vdev_match = { "virtual-device", 359 vdev_prop_match }; 360 361 362 /* 363 * Specification of an MD node passed to the MDEG to filter any 364 * 'vport' nodes that do not belong to the specified node. This 365 * template is copied for each vsw instance and filled in with 366 * the appropriate 'cfg-handle' value before being passed to the MDEG. 367 */ 368 static mdeg_prop_spec_t vsw_prop_template[] = { 369 { MDET_PROP_STR, "name", vsw_propname }, 370 { MDET_PROP_VAL, "cfg-handle", NULL }, 371 { MDET_LIST_END, NULL, NULL } 372 }; 373 374 #define VSW_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val); 375 376 /* 377 * From /etc/system enable/disable thread per ring. This is a mode 378 * selection that is done a vsw driver attach time. 379 */ 380 boolean_t vsw_multi_ring_enable = B_FALSE; 381 int vsw_mac_rx_rings = VSW_MAC_RX_RINGS; 382 383 /* 384 * Print debug messages - set to 0x1f to enable all msgs 385 * or 0x0 to turn all off. 386 */ 387 int vswdbg = 0x0; 388 389 /* 390 * debug levels: 391 * 0x01: Function entry/exit tracing 392 * 0x02: Internal function messages 393 * 0x04: Verbose internal messages 394 * 0x08: Warning messages 395 * 0x10: Error messages 396 */ 397 398 static void 399 vswdebug(vsw_t *vswp, const char *fmt, ...) 400 { 401 char buf[512]; 402 va_list ap; 403 404 va_start(ap, fmt); 405 (void) vsprintf(buf, fmt, ap); 406 va_end(ap); 407 408 if (vswp == NULL) 409 cmn_err(CE_CONT, "%s\n", buf); 410 else 411 cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf); 412 } 413 414 /* 415 * For the moment the state dump routines have their own 416 * private flag. 417 */ 418 #define DUMP_STATE 0 419 420 #if DUMP_STATE 421 422 #define DUMP_TAG(tag) \ 423 { \ 424 D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \ 425 D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \ 426 D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \ 427 } 428 429 #define DUMP_TAG_PTR(tag) \ 430 { \ 431 D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \ 432 D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \ 433 D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \ 434 } 435 436 #define DUMP_FLAGS(flags) dump_flags(flags); 437 #define DISPLAY_STATE() display_state() 438 439 #else 440 441 #define DUMP_TAG(tag) 442 #define DUMP_TAG_PTR(tag) 443 #define DUMP_FLAGS(state) 444 #define DISPLAY_STATE() 445 446 #endif /* DUMP_STATE */ 447 448 #ifdef DEBUG 449 450 #define D1 \ 451 if (vswdbg & 0x01) \ 452 vswdebug 453 454 #define D2 \ 455 if (vswdbg & 0x02) \ 456 vswdebug 457 458 #define D3 \ 459 if (vswdbg & 0x04) \ 460 vswdebug 461 462 #define DWARN \ 463 if (vswdbg & 0x08) \ 464 vswdebug 465 466 #define DERR \ 467 if (vswdbg & 0x10) \ 468 vswdebug 469 470 #else 471 472 #define DERR if (0) vswdebug 473 #define DWARN if (0) vswdebug 474 #define D1 if (0) vswdebug 475 #define D2 if (0) vswdebug 476 #define D3 if (0) vswdebug 477 478 #endif /* DEBUG */ 479 480 static struct modlinkage modlinkage = { 481 MODREV_1, 482 &vswmodldrv, 483 NULL 484 }; 485 486 int 487 _init(void) 488 { 489 int status; 490 491 rw_init(&vsw_rw, NULL, RW_DRIVER, NULL); 492 493 status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1); 494 if (status != 0) { 495 return (status); 496 } 497 498 mac_init_ops(&vsw_ops, "vsw"); 499 status = mod_install(&modlinkage); 500 if (status != 0) { 501 ddi_soft_state_fini(&vsw_state); 502 } 503 return (status); 504 } 505 506 int 507 _fini(void) 508 { 509 int status; 510 511 status = mod_remove(&modlinkage); 512 if (status != 0) 513 return (status); 514 mac_fini_ops(&vsw_ops); 515 ddi_soft_state_fini(&vsw_state); 516 517 rw_destroy(&vsw_rw); 518 519 return (status); 520 } 521 522 int 523 _info(struct modinfo *modinfop) 524 { 525 return (mod_info(&modlinkage, modinfop)); 526 } 527 528 static int 529 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 530 { 531 vsw_t *vswp; 532 int instance; 533 char hashname[MAXNAMELEN]; 534 char qname[TASKQ_NAMELEN]; 535 enum { PROG_init = 0x00, 536 PROG_if_lock = 0x01, 537 PROG_fdb = 0x02, 538 PROG_mfdb = 0x04, 539 PROG_report_dev = 0x08, 540 PROG_plist = 0x10, 541 PROG_taskq = 0x20} 542 progress; 543 544 progress = PROG_init; 545 546 switch (cmd) { 547 case DDI_ATTACH: 548 break; 549 case DDI_RESUME: 550 /* nothing to do for this non-device */ 551 return (DDI_SUCCESS); 552 case DDI_PM_RESUME: 553 default: 554 return (DDI_FAILURE); 555 } 556 557 instance = ddi_get_instance(dip); 558 if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) { 559 DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance); 560 return (DDI_FAILURE); 561 } 562 vswp = ddi_get_soft_state(vsw_state, instance); 563 564 if (vswp == NULL) { 565 DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance); 566 goto vsw_attach_fail; 567 } 568 569 vswp->dip = dip; 570 vswp->instance = instance; 571 ddi_set_driver_private(dip, (caddr_t)vswp); 572 573 mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL); 574 mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL); 575 rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL); 576 progress |= PROG_if_lock; 577 578 /* setup the unicast forwarding database */ 579 (void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d", 580 vswp->instance); 581 D2(vswp, "creating unicast hash table (%s)...", hashname); 582 vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, 583 mod_hash_null_valdtor, sizeof (void *)); 584 585 progress |= PROG_fdb; 586 587 /* setup the multicast fowarding database */ 588 (void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d", 589 vswp->instance); 590 D2(vswp, "creating multicast hash table %s)...", hashname); 591 rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL); 592 vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, 593 mod_hash_null_valdtor, sizeof (void *)); 594 595 progress |= PROG_mfdb; 596 597 /* 598 * create lock protecting list of multicast addresses 599 * which could come via m_multicst() entry point when plumbed. 600 */ 601 mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL); 602 vswp->mcap = NULL; 603 604 ddi_report_dev(vswp->dip); 605 606 progress |= PROG_report_dev; 607 608 WRITE_ENTER(&vsw_rw); 609 vswp->next = vsw_head; 610 vsw_head = vswp; 611 RW_EXIT(&vsw_rw); 612 613 /* setup the port list */ 614 rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL); 615 vswp->plist.head = NULL; 616 617 progress |= PROG_plist; 618 619 /* 620 * Create the taskq which will process all the VIO 621 * control messages. 622 */ 623 (void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance); 624 if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1, 625 TASKQ_DEFAULTPRI, 0)) == NULL) { 626 cmn_err(CE_WARN, "!vsw%d: Unable to create task queue", 627 vswp->instance); 628 goto vsw_attach_fail; 629 } 630 631 progress |= PROG_taskq; 632 633 /* prevent auto-detaching */ 634 if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip, 635 DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) { 636 cmn_err(CE_NOTE, "!Unable to set \"%s\" property for " 637 "instance %u", DDI_NO_AUTODETACH, instance); 638 } 639 640 /* 641 * Now we have everything setup, register an interest in 642 * specific MD nodes. 643 * 644 * The callback is invoked in 2 cases, firstly if upon mdeg 645 * registration there are existing nodes which match our specified 646 * criteria, and secondly if the MD is changed (and again, there 647 * are nodes which we are interested in present within it. Note 648 * that our callback will be invoked even if our specified nodes 649 * have not actually changed). 650 * 651 * Until the callback is invoked we cannot switch any pkts as 652 * we don't know basic information such as what mode we are 653 * operating in. However we expect the callback to be invoked 654 * immediately upon registration as this driver should only 655 * be attaching if there are vsw nodes in the MD. 656 */ 657 if (vsw_mdeg_register(vswp)) 658 goto vsw_attach_fail; 659 660 return (DDI_SUCCESS); 661 662 vsw_attach_fail: 663 DERR(NULL, "vsw_attach: failed"); 664 665 if (progress & PROG_taskq) 666 ddi_taskq_destroy(vswp->taskq_p); 667 668 if (progress & PROG_plist) 669 rw_destroy(&vswp->plist.lockrw); 670 671 if (progress & PROG_report_dev) { 672 ddi_remove_minor_node(dip, NULL); 673 mutex_destroy(&vswp->mca_lock); 674 } 675 676 if (progress & PROG_mfdb) { 677 mod_hash_destroy_hash(vswp->mfdb); 678 vswp->mfdb = NULL; 679 rw_destroy(&vswp->mfdbrw); 680 } 681 682 if (progress & PROG_fdb) { 683 mod_hash_destroy_hash(vswp->fdb); 684 vswp->fdb = NULL; 685 } 686 687 if (progress & PROG_if_lock) { 688 rw_destroy(&vswp->if_lockrw); 689 mutex_destroy(&vswp->mac_lock); 690 mutex_destroy(&vswp->hw_lock); 691 } 692 693 ddi_soft_state_free(vsw_state, instance); 694 return (DDI_FAILURE); 695 } 696 697 static int 698 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 699 { 700 vio_mblk_pool_t *poolp, *npoolp; 701 vsw_t **vswpp, *vswp; 702 int instance; 703 704 instance = ddi_get_instance(dip); 705 vswp = ddi_get_soft_state(vsw_state, instance); 706 707 if (vswp == NULL) { 708 return (DDI_FAILURE); 709 } 710 711 switch (cmd) { 712 case DDI_DETACH: 713 break; 714 case DDI_SUSPEND: 715 case DDI_PM_SUSPEND: 716 default: 717 return (DDI_FAILURE); 718 } 719 720 D2(vswp, "detaching instance %d", instance); 721 722 if (vswp->if_state & VSW_IF_REG) { 723 if (vsw_mac_unregister(vswp) != 0) { 724 cmn_err(CE_WARN, "!vsw%d: Unable to detach from " 725 "MAC layer", vswp->instance); 726 return (DDI_FAILURE); 727 } 728 } 729 730 vsw_mdeg_unregister(vswp); 731 732 /* remove mac layer callback */ 733 mutex_enter(&vswp->mac_lock); 734 if ((vswp->mh != NULL) && (vswp->mrh != NULL)) { 735 mac_rx_remove(vswp->mh, vswp->mrh); 736 vswp->mrh = NULL; 737 } 738 mutex_exit(&vswp->mac_lock); 739 740 if (vsw_detach_ports(vswp) != 0) { 741 cmn_err(CE_WARN, "!vsw%d: Unable to detach ports", 742 vswp->instance); 743 return (DDI_FAILURE); 744 } 745 746 rw_destroy(&vswp->if_lockrw); 747 748 mutex_destroy(&vswp->hw_lock); 749 750 /* 751 * Now that the ports have been deleted, stop and close 752 * the physical device. 753 */ 754 mutex_enter(&vswp->mac_lock); 755 if (vswp->mh != NULL) { 756 if (vswp->mstarted) 757 mac_stop(vswp->mh); 758 if (vswp->mresources) 759 mac_resource_set(vswp->mh, NULL, NULL); 760 mac_close(vswp->mh); 761 762 vswp->mh = NULL; 763 vswp->txinfo = NULL; 764 } 765 mutex_exit(&vswp->mac_lock); 766 mutex_destroy(&vswp->mac_lock); 767 768 /* 769 * Destroy any free pools that may still exist. 770 */ 771 poolp = vswp->rxh; 772 while (poolp != NULL) { 773 npoolp = vswp->rxh = poolp->nextp; 774 if (vio_destroy_mblks(poolp) != 0) { 775 vswp->rxh = poolp; 776 return (DDI_FAILURE); 777 } 778 poolp = npoolp; 779 } 780 781 /* 782 * Remove this instance from any entries it may be on in 783 * the hash table by using the list of addresses maintained 784 * in the vsw_t structure. 785 */ 786 vsw_del_mcst_vsw(vswp); 787 788 vswp->mcap = NULL; 789 mutex_destroy(&vswp->mca_lock); 790 791 /* 792 * By now any pending tasks have finished and the underlying 793 * ldc's have been destroyed, so its safe to delete the control 794 * message taskq. 795 */ 796 if (vswp->taskq_p != NULL) 797 ddi_taskq_destroy(vswp->taskq_p); 798 799 /* 800 * At this stage all the data pointers in the hash table 801 * should be NULL, as all the ports have been removed and will 802 * have deleted themselves from the port lists which the data 803 * pointers point to. Hence we can destroy the table using the 804 * default destructors. 805 */ 806 D2(vswp, "vsw_detach: destroying hash tables.."); 807 mod_hash_destroy_hash(vswp->fdb); 808 vswp->fdb = NULL; 809 810 WRITE_ENTER(&vswp->mfdbrw); 811 mod_hash_destroy_hash(vswp->mfdb); 812 vswp->mfdb = NULL; 813 RW_EXIT(&vswp->mfdbrw); 814 rw_destroy(&vswp->mfdbrw); 815 816 ddi_remove_minor_node(dip, NULL); 817 818 rw_destroy(&vswp->plist.lockrw); 819 WRITE_ENTER(&vsw_rw); 820 for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) { 821 if (*vswpp == vswp) { 822 *vswpp = vswp->next; 823 break; 824 } 825 } 826 RW_EXIT(&vsw_rw); 827 ddi_soft_state_free(vsw_state, instance); 828 829 return (DDI_SUCCESS); 830 } 831 832 static int 833 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 834 { 835 _NOTE(ARGUNUSED(dip)) 836 837 vsw_t *vswp = NULL; 838 dev_t dev = (dev_t)arg; 839 int instance; 840 841 instance = getminor(dev); 842 843 switch (infocmd) { 844 case DDI_INFO_DEVT2DEVINFO: 845 if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) { 846 *result = NULL; 847 return (DDI_FAILURE); 848 } 849 *result = vswp->dip; 850 return (DDI_SUCCESS); 851 852 case DDI_INFO_DEVT2INSTANCE: 853 *result = (void *)(uintptr_t)instance; 854 return (DDI_SUCCESS); 855 856 default: 857 *result = NULL; 858 return (DDI_FAILURE); 859 } 860 } 861 862 /* 863 * Get the value of the "vsw-phys-dev" property in the specified 864 * node. This property is the name of the physical device that 865 * the virtual switch will use to talk to the outside world. 866 * 867 * Note it is valid for this property to be NULL (but the property 868 * itself must exist). Callers of this routine should verify that 869 * the value returned is what they expected (i.e. either NULL or non NULL). 870 * 871 * On success returns value of the property in region pointed to by 872 * the 'name' argument, and with return value of 0. Otherwise returns 1. 873 */ 874 static int 875 vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name) 876 { 877 int len = 0; 878 char *physname = NULL; 879 char *dev; 880 881 if (md_get_prop_data(mdp, node, physdev_propname, 882 (uint8_t **)(&physname), &len) != 0) { 883 cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical " 884 "device(s) from MD", vswp->instance); 885 return (1); 886 } else if ((strlen(physname) + 1) > LIFNAMSIZ) { 887 cmn_err(CE_WARN, "!vsw%d: %s is too long a device name", 888 vswp->instance, physname); 889 return (1); 890 } else { 891 (void) strncpy(name, physname, strlen(physname) + 1); 892 D2(vswp, "%s: using first device specified (%s)", 893 __func__, physname); 894 } 895 896 #ifdef DEBUG 897 /* 898 * As a temporary measure to aid testing we check to see if there 899 * is a vsw.conf file present. If there is we use the value of the 900 * vsw_physname property in the file as the name of the physical 901 * device, overriding the value from the MD. 902 * 903 * There may be multiple devices listed, but for the moment 904 * we just use the first one. 905 */ 906 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0, 907 "vsw_physname", &dev) == DDI_PROP_SUCCESS) { 908 if ((strlen(dev) + 1) > LIFNAMSIZ) { 909 cmn_err(CE_WARN, "vsw%d: %s is too long a device name", 910 vswp->instance, dev); 911 ddi_prop_free(dev); 912 return (1); 913 } else { 914 cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from " 915 "config file", vswp->instance, dev); 916 917 (void) strncpy(name, dev, strlen(dev) + 1); 918 } 919 920 ddi_prop_free(dev); 921 } 922 #endif 923 924 return (0); 925 } 926 927 /* 928 * Read the 'vsw-switch-mode' property from the specified MD node. 929 * 930 * Returns 0 on success and the number of modes found in 'found', 931 * otherwise returns 1. 932 */ 933 static int 934 vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, 935 uint8_t *modes, int *found) 936 { 937 int len = 0; 938 int smode_num = 0; 939 char *smode = NULL; 940 char *curr_mode = NULL; 941 942 D1(vswp, "%s: enter", __func__); 943 944 /* 945 * Get the switch-mode property. The modes are listed in 946 * decreasing order of preference, i.e. prefered mode is 947 * first item in list. 948 */ 949 len = 0; 950 smode_num = 0; 951 if (md_get_prop_data(mdp, node, smode_propname, 952 (uint8_t **)(&smode), &len) != 0) { 953 /* 954 * Unable to get switch-mode property from MD, nothing 955 * more we can do. 956 */ 957 cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property" 958 " from the MD", vswp->instance); 959 *found = 0; 960 return (1); 961 } 962 963 curr_mode = smode; 964 /* 965 * Modes of operation: 966 * 'switched' - layer 2 switching, underlying HW in 967 * programmed mode. 968 * 'promiscuous' - layer 2 switching, underlying HW in 969 * promiscuous mode. 970 * 'routed' - layer 3 (i.e. IP) routing, underlying HW 971 * in non-promiscuous mode. 972 */ 973 while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) { 974 D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode); 975 if (strcmp(curr_mode, "switched") == 0) { 976 modes[smode_num++] = VSW_LAYER2; 977 } else if (strcmp(curr_mode, "promiscuous") == 0) { 978 modes[smode_num++] = VSW_LAYER2_PROMISC; 979 } else if (strcmp(curr_mode, "routed") == 0) { 980 modes[smode_num++] = VSW_LAYER3; 981 } else { 982 cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, " 983 "setting to default switched mode", 984 vswp->instance, curr_mode); 985 modes[smode_num++] = VSW_LAYER2; 986 } 987 curr_mode += strlen(curr_mode) + 1; 988 } 989 *found = smode_num; 990 991 D2(vswp, "%s: %d modes found", __func__, smode_num); 992 993 D1(vswp, "%s: exit", __func__); 994 995 return (0); 996 } 997 998 /* 999 * Get the mac address of the physical device. 1000 * 1001 * Returns 0 on success, 1 on failure. 1002 */ 1003 static int 1004 vsw_get_physaddr(vsw_t *vswp) 1005 { 1006 mac_handle_t mh; 1007 char drv[LIFNAMSIZ]; 1008 uint_t ddi_instance; 1009 1010 D1(vswp, "%s: enter", __func__); 1011 1012 if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) 1013 return (1); 1014 1015 if (mac_open(vswp->physname, ddi_instance, &mh) != 0) { 1016 cmn_err(CE_WARN, "!vsw%d: mac_open %s failed", 1017 vswp->instance, vswp->physname); 1018 return (1); 1019 } 1020 1021 READ_ENTER(&vswp->if_lockrw); 1022 mac_unicst_get(mh, vswp->if_addr.ether_addr_octet); 1023 RW_EXIT(&vswp->if_lockrw); 1024 1025 mac_close(mh); 1026 1027 vswp->mdprops |= VSW_DEV_MACADDR; 1028 1029 D1(vswp, "%s: exit", __func__); 1030 1031 return (0); 1032 } 1033 1034 /* 1035 * Check to see if the card supports the setting of multiple unicst 1036 * addresses. 1037 * 1038 * Returns 0 if card supports the programming of multiple unicast addresses, 1039 * otherwise returns 1. 1040 */ 1041 static int 1042 vsw_get_hw_maddr(vsw_t *vswp) 1043 { 1044 D1(vswp, "%s: enter", __func__); 1045 1046 mutex_enter(&vswp->mac_lock); 1047 if (vswp->mh == NULL) { 1048 mutex_exit(&vswp->mac_lock); 1049 return (1); 1050 } 1051 1052 if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) { 1053 cmn_err(CE_WARN, "!vsw%d: device (%s) does not support " 1054 "setting multiple unicast addresses", vswp->instance, 1055 vswp->physname); 1056 mutex_exit(&vswp->mac_lock); 1057 return (1); 1058 } 1059 mutex_exit(&vswp->mac_lock); 1060 1061 D2(vswp, "%s: %d addrs : %d free", __func__, 1062 vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree); 1063 1064 D1(vswp, "%s: exit", __func__); 1065 1066 return (0); 1067 } 1068 1069 /* 1070 * Setup the required switching mode. 1071 * 1072 * Returns 0 on success, 1 on failure. 1073 */ 1074 static int 1075 vsw_setup_switching(vsw_t *vswp) 1076 { 1077 int i, rv = 1; 1078 1079 D1(vswp, "%s: enter", __func__); 1080 1081 /* select best switching mode */ 1082 for (i = 0; i < vswp->smode_num; i++) { 1083 vswp->smode_idx = i; 1084 switch (vswp->smode[i]) { 1085 case VSW_LAYER2: 1086 case VSW_LAYER2_PROMISC: 1087 rv = vsw_setup_layer2(vswp); 1088 break; 1089 1090 case VSW_LAYER3: 1091 rv = vsw_setup_layer3(vswp); 1092 break; 1093 1094 default: 1095 DERR(vswp, "unknown switch mode"); 1096 rv = 1; 1097 break; 1098 } 1099 1100 if (rv == 0) 1101 break; 1102 } 1103 1104 if (rv == 1) { 1105 cmn_err(CE_WARN, "!vsw%d: Unable to setup specified " 1106 "switching mode", vswp->instance); 1107 return (rv); 1108 } 1109 1110 D2(vswp, "%s: Operating in mode %d", __func__, 1111 vswp->smode[vswp->smode_idx]); 1112 1113 D1(vswp, "%s: exit", __func__); 1114 1115 return (0); 1116 } 1117 1118 /* 1119 * Setup for layer 2 switching. 1120 * 1121 * Returns 0 on success, 1 on failure. 1122 */ 1123 static int 1124 vsw_setup_layer2(vsw_t *vswp) 1125 { 1126 D1(vswp, "%s: enter", __func__); 1127 1128 vswp->vsw_switch_frame = vsw_switch_l2_frame; 1129 1130 /* 1131 * Attempt to link into the MAC layer so we can get 1132 * and send packets out over the physical adapter. 1133 */ 1134 if (vswp->mdprops & VSW_MD_PHYSNAME) { 1135 if (vsw_mac_attach(vswp) != 0) { 1136 /* 1137 * Registration with the MAC layer has failed, 1138 * so return 1 so that can fall back to next 1139 * prefered switching method. 1140 */ 1141 cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer " 1142 "client", vswp->instance); 1143 return (1); 1144 } 1145 1146 if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) { 1147 /* 1148 * Verify that underlying device can support multiple 1149 * unicast mac addresses. 1150 */ 1151 if (vsw_get_hw_maddr(vswp) != 0) { 1152 cmn_err(CE_WARN, "!vsw%d: Unable to setup " 1153 "layer2 switching", vswp->instance); 1154 vsw_mac_detach(vswp); 1155 return (1); 1156 } 1157 } 1158 1159 } else { 1160 /* 1161 * No physical device name found in MD which is 1162 * required for layer 2. 1163 */ 1164 cmn_err(CE_WARN, "!vsw%d: no physical device name specified", 1165 vswp->instance); 1166 return (1); 1167 } 1168 1169 D1(vswp, "%s: exit", __func__); 1170 1171 return (0); 1172 } 1173 1174 static int 1175 vsw_setup_layer3(vsw_t *vswp) 1176 { 1177 D1(vswp, "%s: enter", __func__); 1178 1179 D2(vswp, "%s: operating in layer 3 mode", __func__); 1180 vswp->vsw_switch_frame = vsw_switch_l3_frame; 1181 1182 D1(vswp, "%s: exit", __func__); 1183 1184 return (0); 1185 } 1186 1187 /* 1188 * Link into the MAC layer to gain access to the services provided by 1189 * the underlying physical device driver (which should also have 1190 * registered with the MAC layer). 1191 * 1192 * Only when in layer 2 mode. 1193 */ 1194 static int 1195 vsw_mac_attach(vsw_t *vswp) 1196 { 1197 char drv[LIFNAMSIZ]; 1198 uint_t ddi_instance; 1199 1200 D1(vswp, "%s: enter", __func__); 1201 1202 ASSERT(vswp->mh == NULL); 1203 ASSERT(vswp->mrh == NULL); 1204 ASSERT(vswp->mstarted == B_FALSE); 1205 ASSERT(vswp->mresources == B_FALSE); 1206 1207 ASSERT(vswp->mdprops & VSW_MD_PHYSNAME); 1208 1209 mutex_enter(&vswp->mac_lock); 1210 if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) { 1211 cmn_err(CE_WARN, "!vsw%d: invalid device name: %s", 1212 vswp->instance, vswp->physname); 1213 goto mac_fail_exit; 1214 } 1215 1216 if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) { 1217 cmn_err(CE_WARN, "!vsw%d: mac_open %s failed", 1218 vswp->instance, vswp->physname); 1219 goto mac_fail_exit; 1220 } 1221 1222 ASSERT(vswp->mh != NULL); 1223 1224 D2(vswp, "vsw_mac_attach: using device %s", vswp->physname); 1225 1226 if (vsw_multi_ring_enable) { 1227 /* 1228 * Initialize the ring table. 1229 */ 1230 vsw_mac_ring_tbl_init(vswp); 1231 1232 /* 1233 * Register our rx callback function. 1234 */ 1235 vswp->mrh = mac_rx_add(vswp->mh, 1236 vsw_rx_queue_cb, (void *)vswp); 1237 ASSERT(vswp->mrh != NULL); 1238 1239 /* 1240 * Register our mac resource callback. 1241 */ 1242 mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp); 1243 vswp->mresources = B_TRUE; 1244 1245 /* 1246 * Get the ring resources available to us from 1247 * the mac below us. 1248 */ 1249 mac_resources(vswp->mh); 1250 } else { 1251 /* 1252 * Just register our rx callback function 1253 */ 1254 vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp); 1255 ASSERT(vswp->mrh != NULL); 1256 } 1257 1258 /* Get the MAC tx fn */ 1259 vswp->txinfo = mac_tx_get(vswp->mh); 1260 1261 /* start the interface */ 1262 if (mac_start(vswp->mh) != 0) { 1263 cmn_err(CE_WARN, "!vsw%d: Could not start mac interface", 1264 vswp->instance); 1265 goto mac_fail_exit; 1266 } 1267 1268 mutex_exit(&vswp->mac_lock); 1269 1270 vswp->mstarted = B_TRUE; 1271 1272 D1(vswp, "%s: exit", __func__); 1273 return (0); 1274 1275 mac_fail_exit: 1276 mutex_exit(&vswp->mac_lock); 1277 vsw_mac_detach(vswp); 1278 1279 D1(vswp, "%s: exit", __func__); 1280 return (1); 1281 } 1282 1283 static void 1284 vsw_mac_detach(vsw_t *vswp) 1285 { 1286 D1(vswp, "vsw_mac_detach: enter"); 1287 1288 ASSERT(vswp != NULL); 1289 1290 if (vsw_multi_ring_enable) { 1291 vsw_mac_ring_tbl_destroy(vswp); 1292 } 1293 1294 mutex_enter(&vswp->mac_lock); 1295 1296 if (vswp->mh != NULL) { 1297 if (vswp->mstarted) 1298 mac_stop(vswp->mh); 1299 if (vswp->mrh != NULL) 1300 mac_rx_remove(vswp->mh, vswp->mrh); 1301 if (vswp->mresources) 1302 mac_resource_set(vswp->mh, NULL, NULL); 1303 mac_close(vswp->mh); 1304 } 1305 1306 vswp->mrh = NULL; 1307 vswp->mh = NULL; 1308 vswp->txinfo = NULL; 1309 vswp->mstarted = B_FALSE; 1310 1311 mutex_exit(&vswp->mac_lock); 1312 1313 D1(vswp, "vsw_mac_detach: exit"); 1314 } 1315 1316 /* 1317 * Depending on the mode specified, the capabilites and capacity 1318 * of the underlying device setup the physical device. 1319 * 1320 * If in layer 3 mode, then do nothing. 1321 * 1322 * If in layer 2 programmed mode attempt to program the unicast address 1323 * associated with the port into the physical device. If this is not 1324 * possible due to resource exhaustion or simply because the device does 1325 * not support multiple unicast addresses then if required fallback onto 1326 * putting the card into promisc mode. 1327 * 1328 * If in promisc mode then simply set the card into promisc mode. 1329 * 1330 * Returns 0 success, 1 on failure. 1331 */ 1332 static int 1333 vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type) 1334 { 1335 mac_multi_addr_t mac_addr; 1336 int err; 1337 1338 D1(vswp, "%s: enter", __func__); 1339 1340 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1341 ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); 1342 1343 if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) 1344 return (0); 1345 1346 if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) { 1347 return (vsw_set_hw_promisc(vswp, port, type)); 1348 } 1349 1350 /* 1351 * Attempt to program the unicast address into the HW. 1352 */ 1353 mac_addr.mma_addrlen = ETHERADDRL; 1354 if (type == VSW_VNETPORT) { 1355 ASSERT(port != NULL); 1356 ether_copy(&port->p_macaddr, &mac_addr.mma_addr); 1357 } else { 1358 READ_ENTER(&vswp->if_lockrw); 1359 /* 1360 * Don't program if the interface is not UP. This 1361 * is possible if the address has just been changed 1362 * in the MD node, but the interface has not yet been 1363 * plumbed. 1364 */ 1365 if (!(vswp->if_state & VSW_IF_UP)) { 1366 RW_EXIT(&vswp->if_lockrw); 1367 return (0); 1368 } 1369 ether_copy(&vswp->if_addr, &mac_addr.mma_addr); 1370 RW_EXIT(&vswp->if_lockrw); 1371 } 1372 1373 err = vsw_set_hw_addr(vswp, &mac_addr); 1374 if (err != 0) { 1375 /* 1376 * Mark that attempt should be made to re-config sometime 1377 * in future if a port is deleted. 1378 */ 1379 vswp->recfg_reqd = B_TRUE; 1380 1381 /* 1382 * Only 1 mode specified, nothing more to do. 1383 */ 1384 if (vswp->smode_num == 1) 1385 return (err); 1386 1387 /* 1388 * If promiscuous was next mode specified try to 1389 * set the card into that mode. 1390 */ 1391 if ((vswp->smode_idx <= (vswp->smode_num - 2)) && 1392 (vswp->smode[vswp->smode_idx + 1] 1393 == VSW_LAYER2_PROMISC)) { 1394 vswp->smode_idx += 1; 1395 return (vsw_set_hw_promisc(vswp, port, type)); 1396 } 1397 return (err); 1398 } 1399 1400 if (type == VSW_VNETPORT) { 1401 port->addr_slot = mac_addr.mma_slot; 1402 port->addr_set = VSW_ADDR_HW; 1403 } else { 1404 vswp->addr_slot = mac_addr.mma_slot; 1405 vswp->addr_set = VSW_ADDR_HW; 1406 } 1407 1408 D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x into slot %d " 1409 "of device %s", 1410 mac_addr.mma_addr[0], mac_addr.mma_addr[1], 1411 mac_addr.mma_addr[2], mac_addr.mma_addr[3], 1412 mac_addr.mma_addr[4], mac_addr.mma_addr[5], 1413 mac_addr.mma_slot, vswp->physname); 1414 1415 D1(vswp, "%s: exit", __func__); 1416 1417 return (0); 1418 } 1419 1420 /* 1421 * If in layer 3 mode do nothing. 1422 * 1423 * If in layer 2 switched mode remove the address from the physical 1424 * device. 1425 * 1426 * If in layer 2 promiscuous mode disable promisc mode. 1427 * 1428 * Returns 0 on success. 1429 */ 1430 static int 1431 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type) 1432 { 1433 mac_addr_slot_t slot; 1434 int rv; 1435 1436 D1(vswp, "%s: enter", __func__); 1437 1438 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1439 1440 if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) 1441 return (0); 1442 1443 switch (type) { 1444 case VSW_VNETPORT: 1445 ASSERT(port != NULL); 1446 1447 if (port->addr_set == VSW_ADDR_PROMISC) { 1448 return (vsw_unset_hw_promisc(vswp, port, type)); 1449 1450 } else if (port->addr_set == VSW_ADDR_HW) { 1451 slot = port->addr_slot; 1452 if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0) 1453 port->addr_set = VSW_ADDR_UNSET; 1454 } 1455 1456 break; 1457 1458 case VSW_LOCALDEV: 1459 if (vswp->addr_set == VSW_ADDR_PROMISC) { 1460 return (vsw_unset_hw_promisc(vswp, NULL, type)); 1461 1462 } else if (vswp->addr_set == VSW_ADDR_HW) { 1463 slot = vswp->addr_slot; 1464 if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0) 1465 vswp->addr_set = VSW_ADDR_UNSET; 1466 } 1467 1468 break; 1469 1470 default: 1471 /* should never happen */ 1472 DERR(vswp, "%s: unknown type %d", __func__, type); 1473 ASSERT(0); 1474 return (1); 1475 } 1476 1477 D1(vswp, "%s: exit", __func__); 1478 return (rv); 1479 } 1480 1481 /* 1482 * Attempt to program a unicast address into HW. 1483 * 1484 * Returns 0 on sucess, 1 on failure. 1485 */ 1486 static int 1487 vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac) 1488 { 1489 void *mah; 1490 int rv; 1491 1492 D1(vswp, "%s: enter", __func__); 1493 1494 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1495 1496 if (vswp->maddr.maddr_handle == NULL) 1497 return (1); 1498 1499 mah = vswp->maddr.maddr_handle; 1500 1501 rv = vswp->maddr.maddr_add(mah, mac); 1502 1503 if (rv == 0) 1504 return (0); 1505 1506 /* 1507 * Its okay for the add to fail because we have exhausted 1508 * all the resouces in the hardware device. Any other error 1509 * we want to flag. 1510 */ 1511 if (rv != ENOSPC) { 1512 cmn_err(CE_WARN, "!vsw%d: error programming " 1513 "address %x:%x:%x:%x:%x:%x into HW " 1514 "err (%d)", vswp->instance, 1515 mac->mma_addr[0], mac->mma_addr[1], 1516 mac->mma_addr[2], mac->mma_addr[3], 1517 mac->mma_addr[4], mac->mma_addr[5], rv); 1518 } 1519 D1(vswp, "%s: exit", __func__); 1520 return (1); 1521 } 1522 1523 /* 1524 * Remove a unicast mac address which has previously been programmed 1525 * into HW. 1526 * 1527 * Returns 0 on sucess, 1 on failure. 1528 */ 1529 static int 1530 vsw_unset_hw_addr(vsw_t *vswp, int slot) 1531 { 1532 void *mah; 1533 int rv; 1534 1535 D1(vswp, "%s: enter", __func__); 1536 1537 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1538 ASSERT(slot >= 0); 1539 1540 if (vswp->maddr.maddr_handle == NULL) 1541 return (1); 1542 1543 mah = vswp->maddr.maddr_handle; 1544 1545 rv = vswp->maddr.maddr_remove(mah, slot); 1546 if (rv != 0) { 1547 cmn_err(CE_WARN, "!vsw%d: unable to remove address " 1548 "from slot %d in device %s (err %d)", 1549 vswp->instance, slot, vswp->physname, rv); 1550 return (1); 1551 } 1552 1553 D2(vswp, "removed addr from slot %d in device %s", 1554 slot, vswp->physname); 1555 1556 D1(vswp, "%s: exit", __func__); 1557 return (0); 1558 } 1559 1560 /* 1561 * Set network card into promisc mode. 1562 * 1563 * Returns 0 on success, 1 on failure. 1564 */ 1565 static int 1566 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type) 1567 { 1568 D1(vswp, "%s: enter", __func__); 1569 1570 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1571 ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); 1572 1573 mutex_enter(&vswp->mac_lock); 1574 if (vswp->mh == NULL) { 1575 mutex_exit(&vswp->mac_lock); 1576 return (1); 1577 } 1578 1579 if (vswp->promisc_cnt++ == 0) { 1580 if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) { 1581 vswp->promisc_cnt--; 1582 mutex_exit(&vswp->mac_lock); 1583 return (1); 1584 } 1585 cmn_err(CE_NOTE, "!vsw%d: switching device %s into " 1586 "promiscuous mode", vswp->instance, vswp->physname); 1587 } 1588 mutex_exit(&vswp->mac_lock); 1589 1590 if (type == VSW_VNETPORT) { 1591 ASSERT(port != NULL); 1592 port->addr_set = VSW_ADDR_PROMISC; 1593 } else { 1594 vswp->addr_set = VSW_ADDR_PROMISC; 1595 } 1596 1597 D1(vswp, "%s: exit", __func__); 1598 1599 return (0); 1600 } 1601 1602 /* 1603 * Turn off promiscuous mode on network card. 1604 * 1605 * Returns 0 on success, 1 on failure. 1606 */ 1607 static int 1608 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type) 1609 { 1610 vsw_port_list_t *plist = &vswp->plist; 1611 1612 D2(vswp, "%s: enter", __func__); 1613 1614 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1615 ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); 1616 1617 mutex_enter(&vswp->mac_lock); 1618 if (vswp->mh == NULL) { 1619 mutex_exit(&vswp->mac_lock); 1620 return (1); 1621 } 1622 1623 if (--vswp->promisc_cnt == 0) { 1624 if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) { 1625 vswp->promisc_cnt++; 1626 mutex_exit(&vswp->mac_lock); 1627 return (1); 1628 } 1629 1630 /* 1631 * We are exiting promisc mode either because we were 1632 * only in promisc mode because we had failed over from 1633 * switched mode due to HW resource issues, or the user 1634 * wanted the card in promisc mode for all the ports and 1635 * the last port is now being deleted. Tweak the message 1636 * accordingly. 1637 */ 1638 if (plist->num_ports != 0) { 1639 cmn_err(CE_NOTE, "!vsw%d: switching device %s back to " 1640 "programmed mode", vswp->instance, 1641 vswp->physname); 1642 } else { 1643 cmn_err(CE_NOTE, "!vsw%d: switching device %s out of " 1644 "promiscuous mode", vswp->instance, 1645 vswp->physname); 1646 } 1647 } 1648 mutex_exit(&vswp->mac_lock); 1649 1650 if (type == VSW_VNETPORT) { 1651 ASSERT(port != NULL); 1652 ASSERT(port->addr_set == VSW_ADDR_PROMISC); 1653 port->addr_set = VSW_ADDR_UNSET; 1654 } else { 1655 ASSERT(vswp->addr_set == VSW_ADDR_PROMISC); 1656 vswp->addr_set = VSW_ADDR_UNSET; 1657 } 1658 1659 D1(vswp, "%s: exit", __func__); 1660 return (0); 1661 } 1662 1663 /* 1664 * Determine whether or not we are operating in our prefered 1665 * mode and if not whether the physical resources now allow us 1666 * to operate in it. 1667 * 1668 * If a port is being removed should only be invoked after port has been 1669 * removed from the port list. 1670 */ 1671 static void 1672 vsw_reconfig_hw(vsw_t *vswp) 1673 { 1674 int s_idx; 1675 1676 D1(vswp, "%s: enter", __func__); 1677 1678 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1679 1680 if (vswp->maddr.maddr_handle == NULL) { 1681 return; 1682 } 1683 1684 /* 1685 * If we are in layer 2 (i.e. switched) or would like to be 1686 * in layer 2 then check if any ports or the vswitch itself 1687 * need to be programmed into the HW. 1688 * 1689 * This can happen in two cases - switched was specified as 1690 * the prefered mode of operation but we exhausted the HW 1691 * resources and so failed over to the next specifed mode, 1692 * or switched was the only mode specified so after HW 1693 * resources were exhausted there was nothing more we 1694 * could do. 1695 */ 1696 if (vswp->smode_idx > 0) 1697 s_idx = vswp->smode_idx - 1; 1698 else 1699 s_idx = vswp->smode_idx; 1700 1701 if (vswp->smode[s_idx] != VSW_LAYER2) { 1702 return; 1703 } 1704 1705 D2(vswp, "%s: attempting reconfig..", __func__); 1706 1707 /* 1708 * First, attempt to set the vswitch mac address into HW, 1709 * if required. 1710 */ 1711 if (vsw_prog_if(vswp)) { 1712 return; 1713 } 1714 1715 /* 1716 * Next, attempt to set any ports which have not yet been 1717 * programmed into HW. 1718 */ 1719 if (vsw_prog_ports(vswp)) { 1720 return; 1721 } 1722 1723 /* 1724 * By now we know that have programmed all desired ports etc 1725 * into HW, so safe to mark reconfiguration as complete. 1726 */ 1727 vswp->recfg_reqd = B_FALSE; 1728 1729 vswp->smode_idx = s_idx; 1730 1731 D1(vswp, "%s: exit", __func__); 1732 } 1733 1734 /* 1735 * Check to see if vsw itself is plumbed, and if so whether or not 1736 * its mac address should be written into HW. 1737 * 1738 * Returns 0 if could set address, or didn't have to set it. 1739 * Returns 1 if failed to set address. 1740 */ 1741 static int 1742 vsw_prog_if(vsw_t *vswp) 1743 { 1744 mac_multi_addr_t addr; 1745 1746 D1(vswp, "%s: enter", __func__); 1747 1748 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1749 1750 READ_ENTER(&vswp->if_lockrw); 1751 if ((vswp->if_state & VSW_IF_UP) && 1752 (vswp->addr_set != VSW_ADDR_HW)) { 1753 1754 addr.mma_addrlen = ETHERADDRL; 1755 ether_copy(&vswp->if_addr, &addr.mma_addr); 1756 1757 if (vsw_set_hw_addr(vswp, &addr) != 0) { 1758 RW_EXIT(&vswp->if_lockrw); 1759 return (1); 1760 } 1761 1762 vswp->addr_slot = addr.mma_slot; 1763 1764 /* 1765 * If previously when plumbed had had to place 1766 * interface into promisc mode, now reverse that. 1767 * 1768 * Note that interface will only actually be set into 1769 * non-promisc mode when last port/interface has been 1770 * programmed into HW. 1771 */ 1772 if (vswp->addr_set == VSW_ADDR_PROMISC) 1773 (void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV); 1774 1775 vswp->addr_set = VSW_ADDR_HW; 1776 } 1777 RW_EXIT(&vswp->if_lockrw); 1778 1779 D1(vswp, "%s: exit", __func__); 1780 return (0); 1781 } 1782 1783 /* 1784 * Scan the port list for any ports which have not yet been set 1785 * into HW. For those found attempt to program their mac addresses 1786 * into the physical device. 1787 * 1788 * Returns 0 if able to program all required ports (can be 0) into HW. 1789 * Returns 1 if failed to set at least one mac address. 1790 */ 1791 static int 1792 vsw_prog_ports(vsw_t *vswp) 1793 { 1794 mac_multi_addr_t addr; 1795 vsw_port_list_t *plist = &vswp->plist; 1796 vsw_port_t *tp; 1797 int rv = 0; 1798 1799 D1(vswp, "%s: enter", __func__); 1800 1801 ASSERT(MUTEX_HELD(&vswp->hw_lock)); 1802 1803 READ_ENTER(&plist->lockrw); 1804 for (tp = plist->head; tp != NULL; tp = tp->p_next) { 1805 if (tp->addr_set != VSW_ADDR_HW) { 1806 addr.mma_addrlen = ETHERADDRL; 1807 ether_copy(&tp->p_macaddr, &addr.mma_addr); 1808 1809 if (vsw_set_hw_addr(vswp, &addr) != 0) { 1810 rv = 1; 1811 break; 1812 } 1813 1814 tp->addr_slot = addr.mma_slot; 1815 1816 /* 1817 * If when this port had first attached we had 1818 * had to place the interface into promisc mode, 1819 * then now reverse that. 1820 * 1821 * Note that the interface will not actually 1822 * change to non-promisc mode until all ports 1823 * have been programmed. 1824 */ 1825 if (tp->addr_set == VSW_ADDR_PROMISC) 1826 (void) vsw_unset_hw_promisc(vswp, 1827 tp, VSW_VNETPORT); 1828 1829 tp->addr_set = VSW_ADDR_HW; 1830 } 1831 } 1832 RW_EXIT(&plist->lockrw); 1833 1834 D1(vswp, "%s: exit", __func__); 1835 return (rv); 1836 } 1837 1838 static void 1839 vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp) 1840 { 1841 ringp->ring_state = VSW_MAC_RING_FREE; 1842 ringp->ring_arg = NULL; 1843 ringp->ring_blank = NULL; 1844 ringp->ring_vqp = NULL; 1845 ringp->ring_vswp = vswp; 1846 } 1847 1848 static void 1849 vsw_mac_ring_tbl_init(vsw_t *vswp) 1850 { 1851 int i; 1852 1853 mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL); 1854 1855 vswp->mac_ring_tbl_sz = vsw_mac_rx_rings; 1856 vswp->mac_ring_tbl = 1857 kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), 1858 KM_SLEEP); 1859 1860 for (i = 0; i < vswp->mac_ring_tbl_sz; i++) 1861 vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]); 1862 } 1863 1864 static void 1865 vsw_mac_ring_tbl_destroy(vsw_t *vswp) 1866 { 1867 int i; 1868 vsw_mac_ring_t *ringp; 1869 1870 mutex_enter(&vswp->mac_ring_lock); 1871 for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { 1872 ringp = &vswp->mac_ring_tbl[i]; 1873 1874 if (ringp->ring_state != VSW_MAC_RING_FREE) { 1875 /* 1876 * Destroy the queue. 1877 */ 1878 vsw_queue_stop(ringp->ring_vqp); 1879 vsw_queue_destroy(ringp->ring_vqp); 1880 1881 /* 1882 * Re-initialize the structure. 1883 */ 1884 vsw_mac_ring_tbl_entry_init(vswp, ringp); 1885 } 1886 } 1887 mutex_exit(&vswp->mac_ring_lock); 1888 1889 mutex_destroy(&vswp->mac_ring_lock); 1890 kmem_free(vswp->mac_ring_tbl, 1891 vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t)); 1892 vswp->mac_ring_tbl_sz = 0; 1893 } 1894 1895 /* 1896 * Handle resource add callbacks from the driver below. 1897 */ 1898 static mac_resource_handle_t 1899 vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp) 1900 { 1901 vsw_t *vswp = (vsw_t *)arg; 1902 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 1903 vsw_mac_ring_t *ringp; 1904 vsw_queue_t *vqp; 1905 int i; 1906 1907 ASSERT(vswp != NULL); 1908 ASSERT(mrp != NULL); 1909 ASSERT(vswp->mac_ring_tbl != NULL); 1910 1911 D1(vswp, "%s: enter", __func__); 1912 1913 /* 1914 * Check to make sure we have the correct resource type. 1915 */ 1916 if (mrp->mr_type != MAC_RX_FIFO) 1917 return (NULL); 1918 1919 /* 1920 * Find a open entry in the ring table. 1921 */ 1922 mutex_enter(&vswp->mac_ring_lock); 1923 for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { 1924 ringp = &vswp->mac_ring_tbl[i]; 1925 1926 /* 1927 * Check for an empty slot, if found, then setup queue 1928 * and thread. 1929 */ 1930 if (ringp->ring_state == VSW_MAC_RING_FREE) { 1931 /* 1932 * Create the queue for this ring. 1933 */ 1934 vqp = vsw_queue_create(); 1935 1936 /* 1937 * Initialize the ring data structure. 1938 */ 1939 ringp->ring_vqp = vqp; 1940 ringp->ring_arg = mrfp->mrf_arg; 1941 ringp->ring_blank = mrfp->mrf_blank; 1942 ringp->ring_state = VSW_MAC_RING_INUSE; 1943 1944 /* 1945 * Create the worker thread. 1946 */ 1947 vqp->vq_worker = thread_create(NULL, 0, 1948 vsw_queue_worker, ringp, 0, &p0, 1949 TS_RUN, minclsyspri); 1950 if (vqp->vq_worker == NULL) { 1951 vsw_queue_destroy(vqp); 1952 vsw_mac_ring_tbl_entry_init(vswp, ringp); 1953 ringp = NULL; 1954 } 1955 1956 if (ringp != NULL) { 1957 /* 1958 * Make sure thread get's running state for 1959 * this ring. 1960 */ 1961 mutex_enter(&vqp->vq_lock); 1962 while ((vqp->vq_state != VSW_QUEUE_RUNNING) && 1963 (vqp->vq_state != VSW_QUEUE_DRAINED)) { 1964 cv_wait(&vqp->vq_cv, &vqp->vq_lock); 1965 } 1966 1967 /* 1968 * If the thread is not running, cleanup. 1969 */ 1970 if (vqp->vq_state == VSW_QUEUE_DRAINED) { 1971 vsw_queue_destroy(vqp); 1972 vsw_mac_ring_tbl_entry_init(vswp, 1973 ringp); 1974 ringp = NULL; 1975 } 1976 mutex_exit(&vqp->vq_lock); 1977 } 1978 1979 mutex_exit(&vswp->mac_ring_lock); 1980 D1(vswp, "%s: exit", __func__); 1981 return ((mac_resource_handle_t)ringp); 1982 } 1983 } 1984 mutex_exit(&vswp->mac_ring_lock); 1985 1986 /* 1987 * No slots in the ring table available. 1988 */ 1989 D1(vswp, "%s: exit", __func__); 1990 return (NULL); 1991 } 1992 1993 static void 1994 vsw_queue_stop(vsw_queue_t *vqp) 1995 { 1996 mutex_enter(&vqp->vq_lock); 1997 1998 if (vqp->vq_state == VSW_QUEUE_RUNNING) { 1999 vqp->vq_state = VSW_QUEUE_STOP; 2000 cv_signal(&vqp->vq_cv); 2001 2002 while (vqp->vq_state != VSW_QUEUE_DRAINED) 2003 cv_wait(&vqp->vq_cv, &vqp->vq_lock); 2004 } 2005 2006 vqp->vq_state = VSW_QUEUE_STOPPED; 2007 2008 mutex_exit(&vqp->vq_lock); 2009 } 2010 2011 static vsw_queue_t * 2012 vsw_queue_create() 2013 { 2014 vsw_queue_t *vqp; 2015 2016 vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP); 2017 2018 mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL); 2019 cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL); 2020 vqp->vq_first = NULL; 2021 vqp->vq_last = NULL; 2022 vqp->vq_state = VSW_QUEUE_STOPPED; 2023 2024 return (vqp); 2025 } 2026 2027 static void 2028 vsw_queue_destroy(vsw_queue_t *vqp) 2029 { 2030 cv_destroy(&vqp->vq_cv); 2031 mutex_destroy(&vqp->vq_lock); 2032 kmem_free(vqp, sizeof (vsw_queue_t)); 2033 } 2034 2035 static void 2036 vsw_queue_worker(vsw_mac_ring_t *rrp) 2037 { 2038 mblk_t *mp; 2039 vsw_queue_t *vqp = rrp->ring_vqp; 2040 vsw_t *vswp = rrp->ring_vswp; 2041 2042 mutex_enter(&vqp->vq_lock); 2043 2044 ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED); 2045 2046 /* 2047 * Set the state to running, since the thread is now active. 2048 */ 2049 vqp->vq_state = VSW_QUEUE_RUNNING; 2050 cv_signal(&vqp->vq_cv); 2051 2052 while (vqp->vq_state == VSW_QUEUE_RUNNING) { 2053 /* 2054 * Wait for work to do or the state has changed 2055 * to not running. 2056 */ 2057 while ((vqp->vq_state == VSW_QUEUE_RUNNING) && 2058 (vqp->vq_first == NULL)) { 2059 cv_wait(&vqp->vq_cv, &vqp->vq_lock); 2060 } 2061 2062 /* 2063 * Process packets that we received from the interface. 2064 */ 2065 if (vqp->vq_first != NULL) { 2066 mp = vqp->vq_first; 2067 2068 vqp->vq_first = NULL; 2069 vqp->vq_last = NULL; 2070 2071 mutex_exit(&vqp->vq_lock); 2072 2073 /* switch the chain of packets received */ 2074 vswp->vsw_switch_frame(vswp, mp, 2075 VSW_PHYSDEV, NULL, NULL); 2076 2077 mutex_enter(&vqp->vq_lock); 2078 } 2079 } 2080 2081 /* 2082 * We are drained and signal we are done. 2083 */ 2084 vqp->vq_state = VSW_QUEUE_DRAINED; 2085 cv_signal(&vqp->vq_cv); 2086 2087 /* 2088 * Exit lock and drain the remaining packets. 2089 */ 2090 mutex_exit(&vqp->vq_lock); 2091 2092 /* 2093 * Exit the thread 2094 */ 2095 thread_exit(); 2096 } 2097 2098 /* 2099 * static void 2100 * vsw_rx_queue_cb() - Receive callback routine when 2101 * vsw_multi_ring_enable is non-zero. Queue the packets 2102 * to a packet queue for a worker thread to process. 2103 */ 2104 static void 2105 vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) 2106 { 2107 vsw_mac_ring_t *ringp = (vsw_mac_ring_t *)mrh; 2108 vsw_t *vswp = (vsw_t *)arg; 2109 vsw_queue_t *vqp; 2110 mblk_t *bp, *last; 2111 2112 ASSERT(mrh != NULL); 2113 ASSERT(vswp != NULL); 2114 ASSERT(mp != NULL); 2115 2116 D1(vswp, "%s: enter", __func__); 2117 2118 /* 2119 * Find the last element in the mblk chain. 2120 */ 2121 bp = mp; 2122 do { 2123 last = bp; 2124 bp = bp->b_next; 2125 } while (bp != NULL); 2126 2127 /* Get the queue for the packets */ 2128 vqp = ringp->ring_vqp; 2129 2130 /* 2131 * Grab the lock such we can queue the packets. 2132 */ 2133 mutex_enter(&vqp->vq_lock); 2134 2135 if (vqp->vq_state != VSW_QUEUE_RUNNING) { 2136 freemsg(mp); 2137 mutex_exit(&vqp->vq_lock); 2138 goto vsw_rx_queue_cb_exit; 2139 } 2140 2141 /* 2142 * Add the mblk chain to the queue. If there 2143 * is some mblks in the queue, then add the new 2144 * chain to the end. 2145 */ 2146 if (vqp->vq_first == NULL) 2147 vqp->vq_first = mp; 2148 else 2149 vqp->vq_last->b_next = mp; 2150 2151 vqp->vq_last = last; 2152 2153 /* 2154 * Signal the worker thread that there is work to 2155 * do. 2156 */ 2157 cv_signal(&vqp->vq_cv); 2158 2159 /* 2160 * Let go of the lock and exit. 2161 */ 2162 mutex_exit(&vqp->vq_lock); 2163 2164 vsw_rx_queue_cb_exit: 2165 D1(vswp, "%s: exit", __func__); 2166 } 2167 2168 /* 2169 * receive callback routine. Invoked by MAC layer when there 2170 * are pkts being passed up from physical device. 2171 * 2172 * PERF: It may be more efficient when the card is in promisc 2173 * mode to check the dest address of the pkts here (against 2174 * the FDB) rather than checking later. Needs to be investigated. 2175 */ 2176 static void 2177 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) 2178 { 2179 _NOTE(ARGUNUSED(mrh)) 2180 2181 vsw_t *vswp = (vsw_t *)arg; 2182 2183 ASSERT(vswp != NULL); 2184 2185 D1(vswp, "vsw_rx_cb: enter"); 2186 2187 /* switch the chain of packets received */ 2188 vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); 2189 2190 D1(vswp, "vsw_rx_cb: exit"); 2191 } 2192 2193 /* 2194 * Send a message out over the physical device via the MAC layer. 2195 * 2196 * Returns any mblks that it was unable to transmit. 2197 */ 2198 static mblk_t * 2199 vsw_tx_msg(vsw_t *vswp, mblk_t *mp) 2200 { 2201 const mac_txinfo_t *mtp; 2202 mblk_t *nextp; 2203 2204 mutex_enter(&vswp->mac_lock); 2205 if (vswp->mh == NULL) { 2206 DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); 2207 mutex_exit(&vswp->mac_lock); 2208 return (mp); 2209 } else { 2210 for (;;) { 2211 nextp = mp->b_next; 2212 mp->b_next = NULL; 2213 2214 mtp = vswp->txinfo; 2215 2216 if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { 2217 mp->b_next = nextp; 2218 break; 2219 } 2220 2221 if ((mp = nextp) == NULL) 2222 break; 2223 } 2224 } 2225 mutex_exit(&vswp->mac_lock); 2226 2227 return (mp); 2228 } 2229 2230 /* 2231 * Register with the MAC layer as a network device, so we 2232 * can be plumbed if necessary. 2233 */ 2234 static int 2235 vsw_mac_register(vsw_t *vswp) 2236 { 2237 mac_register_t *macp; 2238 int rv; 2239 2240 D1(vswp, "%s: enter", __func__); 2241 2242 if ((macp = mac_alloc(MAC_VERSION)) == NULL) 2243 return (EINVAL); 2244 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 2245 macp->m_driver = vswp; 2246 macp->m_dip = vswp->dip; 2247 macp->m_src_addr = (uint8_t *)&vswp->if_addr; 2248 macp->m_callbacks = &vsw_m_callbacks; 2249 macp->m_min_sdu = 0; 2250 macp->m_max_sdu = ETHERMTU; 2251 rv = mac_register(macp, &vswp->if_mh); 2252 mac_free(macp); 2253 if (rv == 0) 2254 vswp->if_state |= VSW_IF_REG; 2255 2256 D1(vswp, "%s: exit", __func__); 2257 2258 return (rv); 2259 } 2260 2261 static int 2262 vsw_mac_unregister(vsw_t *vswp) 2263 { 2264 int rv = 0; 2265 2266 D1(vswp, "%s: enter", __func__); 2267 2268 WRITE_ENTER(&vswp->if_lockrw); 2269 2270 if (vswp->if_state & VSW_IF_REG) { 2271 rv = mac_unregister(vswp->if_mh); 2272 if (rv != 0) { 2273 DWARN(vswp, "%s: unable to unregister from MAC " 2274 "framework", __func__); 2275 2276 RW_EXIT(&vswp->if_lockrw); 2277 D1(vswp, "%s: fail exit", __func__); 2278 return (rv); 2279 } 2280 2281 /* mark i/f as down and unregistered */ 2282 vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG); 2283 } 2284 RW_EXIT(&vswp->if_lockrw); 2285 2286 D1(vswp, "%s: exit", __func__); 2287 2288 return (rv); 2289 } 2290 2291 static int 2292 vsw_m_stat(void *arg, uint_t stat, uint64_t *val) 2293 { 2294 vsw_t *vswp = (vsw_t *)arg; 2295 2296 D1(vswp, "%s: enter", __func__); 2297 2298 mutex_enter(&vswp->mac_lock); 2299 if (vswp->mh == NULL) { 2300 mutex_exit(&vswp->mac_lock); 2301 return (EINVAL); 2302 } 2303 2304 /* return stats from underlying device */ 2305 *val = mac_stat_get(vswp->mh, stat); 2306 2307 mutex_exit(&vswp->mac_lock); 2308 2309 return (0); 2310 } 2311 2312 static void 2313 vsw_m_stop(void *arg) 2314 { 2315 vsw_t *vswp = (vsw_t *)arg; 2316 2317 D1(vswp, "%s: enter", __func__); 2318 2319 WRITE_ENTER(&vswp->if_lockrw); 2320 vswp->if_state &= ~VSW_IF_UP; 2321 RW_EXIT(&vswp->if_lockrw); 2322 2323 mutex_enter(&vswp->hw_lock); 2324 2325 (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); 2326 2327 if (vswp->recfg_reqd) 2328 vsw_reconfig_hw(vswp); 2329 2330 mutex_exit(&vswp->hw_lock); 2331 2332 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2333 } 2334 2335 static int 2336 vsw_m_start(void *arg) 2337 { 2338 vsw_t *vswp = (vsw_t *)arg; 2339 2340 D1(vswp, "%s: enter", __func__); 2341 2342 WRITE_ENTER(&vswp->if_lockrw); 2343 vswp->if_state |= VSW_IF_UP; 2344 RW_EXIT(&vswp->if_lockrw); 2345 2346 mutex_enter(&vswp->hw_lock); 2347 (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV); 2348 mutex_exit(&vswp->hw_lock); 2349 2350 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2351 return (0); 2352 } 2353 2354 /* 2355 * Change the local interface address. 2356 * 2357 * Note: we don't support this entry point. The local 2358 * mac address of the switch can only be changed via its 2359 * MD node properties. 2360 */ 2361 static int 2362 vsw_m_unicst(void *arg, const uint8_t *macaddr) 2363 { 2364 _NOTE(ARGUNUSED(arg, macaddr)) 2365 2366 return (DDI_FAILURE); 2367 } 2368 2369 static int 2370 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) 2371 { 2372 vsw_t *vswp = (vsw_t *)arg; 2373 mcst_addr_t *mcst_p = NULL; 2374 uint64_t addr = 0x0; 2375 int i, ret = 0; 2376 2377 D1(vswp, "%s: enter", __func__); 2378 2379 /* 2380 * Convert address into form that can be used 2381 * as hash table key. 2382 */ 2383 for (i = 0; i < ETHERADDRL; i++) { 2384 addr = (addr << 8) | mca[i]; 2385 } 2386 2387 D2(vswp, "%s: addr = 0x%llx", __func__, addr); 2388 2389 if (add) { 2390 D2(vswp, "%s: adding multicast", __func__); 2391 if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2392 /* 2393 * Update the list of multicast addresses 2394 * contained within the vsw_t structure to 2395 * include this new one. 2396 */ 2397 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP); 2398 if (mcst_p == NULL) { 2399 DERR(vswp, "%s unable to alloc mem", __func__); 2400 return (1); 2401 } 2402 mcst_p->addr = addr; 2403 2404 mutex_enter(&vswp->mca_lock); 2405 mcst_p->nextp = vswp->mcap; 2406 vswp->mcap = mcst_p; 2407 mutex_exit(&vswp->mca_lock); 2408 2409 /* 2410 * Call into the underlying driver to program the 2411 * address into HW. 2412 */ 2413 mutex_enter(&vswp->mac_lock); 2414 if (vswp->mh != NULL) { 2415 ret = mac_multicst_add(vswp->mh, mca); 2416 if (ret != 0) { 2417 cmn_err(CE_WARN, "!vsw%d: unable to " 2418 "add multicast address", 2419 vswp->instance); 2420 mutex_exit(&vswp->mac_lock); 2421 goto vsw_remove_addr; 2422 } 2423 } 2424 mutex_exit(&vswp->mac_lock); 2425 } else { 2426 cmn_err(CE_WARN, "!vsw%d: unable to add multicast " 2427 "address", vswp->instance); 2428 } 2429 return (ret); 2430 } 2431 2432 vsw_remove_addr: 2433 2434 D2(vswp, "%s: removing multicast", __func__); 2435 /* 2436 * Remove the address from the hash table.. 2437 */ 2438 if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2439 2440 /* 2441 * ..and then from the list maintained in the 2442 * vsw_t structure. 2443 */ 2444 vsw_del_addr(VSW_LOCALDEV, vswp, addr); 2445 2446 mutex_enter(&vswp->mac_lock); 2447 if (vswp->mh != NULL) 2448 (void) mac_multicst_remove(vswp->mh, mca); 2449 mutex_exit(&vswp->mac_lock); 2450 } 2451 2452 D1(vswp, "%s: exit", __func__); 2453 2454 return (0); 2455 } 2456 2457 static int 2458 vsw_m_promisc(void *arg, boolean_t on) 2459 { 2460 vsw_t *vswp = (vsw_t *)arg; 2461 2462 D1(vswp, "%s: enter", __func__); 2463 2464 WRITE_ENTER(&vswp->if_lockrw); 2465 if (on) 2466 vswp->if_state |= VSW_IF_PROMISC; 2467 else 2468 vswp->if_state &= ~VSW_IF_PROMISC; 2469 RW_EXIT(&vswp->if_lockrw); 2470 2471 D1(vswp, "%s: exit", __func__); 2472 2473 return (0); 2474 } 2475 2476 static mblk_t * 2477 vsw_m_tx(void *arg, mblk_t *mp) 2478 { 2479 vsw_t *vswp = (vsw_t *)arg; 2480 2481 D1(vswp, "%s: enter", __func__); 2482 2483 vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); 2484 2485 D1(vswp, "%s: exit", __func__); 2486 2487 return (NULL); 2488 } 2489 2490 /* 2491 * Register for machine description (MD) updates. 2492 * 2493 * Returns 0 on success, 1 on failure. 2494 */ 2495 static int 2496 vsw_mdeg_register(vsw_t *vswp) 2497 { 2498 mdeg_prop_spec_t *pspecp; 2499 mdeg_node_spec_t *inst_specp; 2500 mdeg_handle_t mdeg_hdl, mdeg_port_hdl; 2501 size_t templatesz; 2502 int inst, rv; 2503 2504 D1(vswp, "%s: enter", __func__); 2505 2506 /* 2507 * In each 'virtual-device' node in the MD there is a 2508 * 'cfg-handle' property which is the MD's concept of 2509 * an instance number (this may be completely different from 2510 * the device drivers instance #). OBP reads that value and 2511 * stores it in the 'reg' property of the appropriate node in 2512 * the device tree. So we use the 'reg' value when registering 2513 * with the mdeg framework, to ensure we get events for the 2514 * correct nodes. 2515 */ 2516 inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, 2517 DDI_PROP_DONTPASS, reg_propname, -1); 2518 if (inst == -1) { 2519 cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from " 2520 "OBP device tree", vswp->instance, reg_propname); 2521 return (1); 2522 } 2523 2524 D2(vswp, "%s: instance %d registering with mdeg", __func__, inst); 2525 2526 /* 2527 * Allocate and initialize a per-instance copy 2528 * of the global property spec array that will 2529 * uniquely identify this vsw instance. 2530 */ 2531 templatesz = sizeof (vsw_prop_template); 2532 pspecp = kmem_zalloc(templatesz, KM_SLEEP); 2533 2534 bcopy(vsw_prop_template, pspecp, templatesz); 2535 2536 VSW_SET_MDEG_PROP_INST(pspecp, inst); 2537 2538 /* initialize the complete prop spec structure */ 2539 inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 2540 inst_specp->namep = "virtual-device"; 2541 inst_specp->specp = pspecp; 2542 2543 /* 2544 * Register an interest in 'virtual-device' nodes with a 2545 * 'name' property of 'virtual-network-switch' 2546 */ 2547 rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb, 2548 (void *)vswp, &mdeg_hdl); 2549 if (rv != MDEG_SUCCESS) { 2550 DERR(vswp, "%s: mdeg_register failed (%d) for vsw node", 2551 __func__, rv); 2552 goto mdeg_reg_fail; 2553 } 2554 2555 /* 2556 * Register an interest in 'vsw-port' nodes. 2557 */ 2558 rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb, 2559 (void *)vswp, &mdeg_port_hdl); 2560 if (rv != MDEG_SUCCESS) { 2561 DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); 2562 (void) mdeg_unregister(mdeg_hdl); 2563 goto mdeg_reg_fail; 2564 } 2565 2566 /* save off data that will be needed later */ 2567 vswp->inst_spec = inst_specp; 2568 vswp->mdeg_hdl = mdeg_hdl; 2569 vswp->mdeg_port_hdl = mdeg_port_hdl; 2570 2571 D1(vswp, "%s: exit", __func__); 2572 return (0); 2573 2574 mdeg_reg_fail: 2575 cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks", 2576 vswp->instance); 2577 kmem_free(pspecp, templatesz); 2578 kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); 2579 2580 vswp->mdeg_hdl = NULL; 2581 vswp->mdeg_port_hdl = NULL; 2582 2583 return (1); 2584 } 2585 2586 static void 2587 vsw_mdeg_unregister(vsw_t *vswp) 2588 { 2589 D1(vswp, "vsw_mdeg_unregister: enter"); 2590 2591 if (vswp->mdeg_hdl != NULL) 2592 (void) mdeg_unregister(vswp->mdeg_hdl); 2593 2594 if (vswp->mdeg_port_hdl != NULL) 2595 (void) mdeg_unregister(vswp->mdeg_port_hdl); 2596 2597 if (vswp->inst_spec != NULL) { 2598 if (vswp->inst_spec->specp != NULL) { 2599 (void) kmem_free(vswp->inst_spec->specp, 2600 sizeof (vsw_prop_template)); 2601 vswp->inst_spec->specp = NULL; 2602 } 2603 2604 (void) kmem_free(vswp->inst_spec, 2605 sizeof (mdeg_node_spec_t)); 2606 vswp->inst_spec = NULL; 2607 } 2608 2609 D1(vswp, "vsw_mdeg_unregister: exit"); 2610 } 2611 2612 /* 2613 * Mdeg callback invoked for the vsw node itself. 2614 */ 2615 static int 2616 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2617 { 2618 vsw_t *vswp; 2619 int idx; 2620 md_t *mdp; 2621 mde_cookie_t node; 2622 uint64_t inst; 2623 char *node_name = NULL; 2624 2625 if (resp == NULL) 2626 return (MDEG_FAILURE); 2627 2628 vswp = (vsw_t *)cb_argp; 2629 2630 D1(vswp, "%s: added %d : removed %d : curr matched %d" 2631 " : prev matched %d", __func__, resp->added.nelem, 2632 resp->removed.nelem, resp->match_curr.nelem, 2633 resp->match_prev.nelem); 2634 2635 /* 2636 * Expect 'added' to be non-zero if virtual-network-switch 2637 * nodes exist in the MD when the driver attaches. 2638 */ 2639 for (idx = 0; idx < resp->added.nelem; idx++) { 2640 mdp = resp->added.mdp; 2641 node = resp->added.mdep[idx]; 2642 2643 if (md_get_prop_str(mdp, node, "name", &node_name) != 0) { 2644 DERR(vswp, "%s: unable to get node name for " 2645 "node(%d) 0x%lx", __func__, idx, node); 2646 continue; 2647 } 2648 2649 if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) { 2650 DERR(vswp, "%s: prop(cfg-handle) not found port(%d)", 2651 __func__, idx); 2652 continue; 2653 } 2654 2655 D2(vswp, "%s: added node(%d) 0x%lx with name %s " 2656 "and inst %d", __func__, idx, node, node_name, inst); 2657 2658 vsw_get_initial_md_properties(vswp, mdp, node); 2659 } 2660 2661 /* 2662 * A non-zero 'match' value indicates that the MD has been 2663 * updated and that a virtual-network-switch node is present 2664 * which may or may not have been updated. It is up to the clients 2665 * to examine their own nodes and determine if they have changed. 2666 */ 2667 for (idx = 0; idx < resp->match_curr.nelem; idx++) { 2668 mdp = resp->match_curr.mdp; 2669 node = resp->match_curr.mdep[idx]; 2670 2671 if (md_get_prop_str(mdp, node, "name", &node_name) != 0) { 2672 DERR(vswp, "%s: unable to get node name for " 2673 "node(%d) 0x%lx", __func__, idx, node); 2674 continue; 2675 } 2676 2677 if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) { 2678 DERR(vswp, "%s: prop(cfg-handle) not found port(%d)", 2679 __func__, idx); 2680 continue; 2681 } 2682 2683 D2(vswp, "%s: changed node(%d) 0x%lx with name %s " 2684 "and inst %d", __func__, idx, node, node_name, inst); 2685 2686 vsw_update_md_prop(vswp, mdp, node); 2687 } 2688 2689 return (MDEG_SUCCESS); 2690 } 2691 2692 /* 2693 * Mdeg callback invoked for changes to the vsw-port nodes 2694 * under the vsw node. 2695 */ 2696 static int 2697 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2698 { 2699 vsw_t *vswp; 2700 int idx; 2701 md_t *mdp; 2702 mde_cookie_t node; 2703 uint64_t inst; 2704 2705 if ((resp == NULL) || (cb_argp == NULL)) 2706 return (MDEG_FAILURE); 2707 2708 vswp = (vsw_t *)cb_argp; 2709 2710 D2(vswp, "%s: added %d : removed %d : curr matched %d" 2711 " : prev matched %d", __func__, resp->added.nelem, 2712 resp->removed.nelem, resp->match_curr.nelem, 2713 resp->match_prev.nelem); 2714 2715 /* process added ports */ 2716 for (idx = 0; idx < resp->added.nelem; idx++) { 2717 mdp = resp->added.mdp; 2718 node = resp->added.mdep[idx]; 2719 2720 D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node); 2721 2722 if (vsw_port_add(vswp, mdp, &node) != 0) { 2723 cmn_err(CE_WARN, "!vsw%d: Unable to add new port " 2724 "(0x%lx)", vswp->instance, node); 2725 } 2726 } 2727 2728 /* process removed ports */ 2729 for (idx = 0; idx < resp->removed.nelem; idx++) { 2730 mdp = resp->removed.mdp; 2731 node = resp->removed.mdep[idx]; 2732 2733 if (md_get_prop_val(mdp, node, id_propname, &inst)) { 2734 DERR(vswp, "%s: prop(%s) not found in port(%d)", 2735 __func__, id_propname, idx); 2736 continue; 2737 } 2738 2739 D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node); 2740 2741 if (vsw_port_detach(vswp, inst) != 0) { 2742 cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld", 2743 vswp->instance, inst); 2744 } 2745 } 2746 2747 /* 2748 * Currently no support for updating already active ports. 2749 * So, ignore the match_curr and match_priv arrays for now. 2750 */ 2751 2752 D1(vswp, "%s: exit", __func__); 2753 2754 return (MDEG_SUCCESS); 2755 } 2756 2757 /* 2758 * Read the initial start-of-day values from the specified MD node. 2759 */ 2760 static void 2761 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) 2762 { 2763 int i; 2764 uint64_t macaddr = 0; 2765 2766 D1(vswp, "%s: enter", __func__); 2767 2768 if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) { 2769 /* 2770 * Note it is valid for the physname property to 2771 * be NULL so check actual name length to determine 2772 * if we have a actual device name. 2773 */ 2774 if (strlen(vswp->physname) > 0) 2775 vswp->mdprops |= VSW_MD_PHYSNAME; 2776 } else { 2777 cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " 2778 "device from MD", vswp->instance); 2779 return; 2780 } 2781 2782 /* mac address for vswitch device itself */ 2783 if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { 2784 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", 2785 vswp->instance); 2786 2787 /* 2788 * Fallback to using the mac address of the physical 2789 * device. 2790 */ 2791 if (vsw_get_physaddr(vswp) == 0) { 2792 cmn_err(CE_NOTE, "!vsw%d: Using MAC address from " 2793 "physical device (%s)", vswp->instance, 2794 vswp->physname); 2795 } else { 2796 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address" 2797 "from device %s", vswp->instance, 2798 vswp->physname); 2799 } 2800 } else { 2801 WRITE_ENTER(&vswp->if_lockrw); 2802 for (i = ETHERADDRL - 1; i >= 0; i--) { 2803 vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; 2804 macaddr >>= 8; 2805 } 2806 RW_EXIT(&vswp->if_lockrw); 2807 vswp->mdprops |= VSW_MD_MACADDR; 2808 } 2809 2810 if (vsw_get_md_smodes(vswp, mdp, node, 2811 vswp->smode, &vswp->smode_num)) { 2812 cmn_err(CE_WARN, "vsw%d: Unable to read %s property from " 2813 "MD, defaulting to programmed mode", vswp->instance, 2814 smode_propname); 2815 2816 for (i = 0; i < NUM_SMODES; i++) 2817 vswp->smode[i] = VSW_LAYER2; 2818 2819 vswp->smode_num = NUM_SMODES; 2820 } else { 2821 ASSERT(vswp->smode_num != 0); 2822 vswp->mdprops |= VSW_MD_SMODE; 2823 } 2824 2825 /* 2826 * Unable to setup any switching mode, nothing more 2827 * we can do. 2828 */ 2829 if (vsw_setup_switching(vswp)) 2830 return; 2831 2832 WRITE_ENTER(&vswp->if_lockrw); 2833 vswp->if_state &= ~VSW_IF_UP; 2834 RW_EXIT(&vswp->if_lockrw); 2835 if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) { 2836 if (vsw_mac_register(vswp) != 0) { 2837 /* 2838 * Treat this as a non-fatal error as we may be 2839 * able to operate in some other mode. 2840 */ 2841 cmn_err(CE_WARN, "vsw%d: Unable to register as " 2842 "provider with MAC layer", vswp->instance); 2843 } 2844 } 2845 2846 D1(vswp, "%s: exit", __func__); 2847 } 2848 2849 /* 2850 * Check to see if the relevant properties in the specified node have 2851 * changed, and if so take the appropriate action. 2852 * 2853 * If any of the properties are missing or invalid we don't take 2854 * any action, as this function should only be invoked when modifications 2855 * have been made to what we assume is a working configuration, which 2856 * we leave active. 2857 * 2858 * Note it is legal for this routine to be invoked even if none of the 2859 * properties in the port node within the MD have actually changed. 2860 */ 2861 static void 2862 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) 2863 { 2864 char physname[LIFNAMSIZ]; 2865 char drv[LIFNAMSIZ]; 2866 uint_t ddi_instance; 2867 uint8_t new_smode[NUM_SMODES]; 2868 int i, smode_num = 0; 2869 uint64_t macaddr = 0; 2870 vsw_port_list_t *plist = &vswp->plist; 2871 vsw_port_t *port = NULL; 2872 enum {MD_init = 0x1, 2873 MD_physname = 0x2, 2874 MD_macaddr = 0x4, 2875 MD_smode = 0x8} updated; 2876 2877 updated = MD_init; 2878 2879 D1(vswp, "%s: enter", __func__); 2880 2881 /* 2882 * Check if name of physical device in MD has changed. 2883 */ 2884 if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) { 2885 /* 2886 * Do basic sanity check on new device name/instance, 2887 * if its non NULL. It is valid for the device name to 2888 * have changed from a non NULL to a NULL value, i.e. 2889 * the vsw is being changed to 'routed' mode. 2890 */ 2891 if ((strlen(physname) != 0) && 2892 (ddi_parse(physname, drv, 2893 &ddi_instance) != DDI_SUCCESS)) { 2894 cmn_err(CE_WARN, "!vsw%d: new device name %s is not" 2895 " a valid device name/instance", 2896 vswp->instance, physname); 2897 goto fail_reconf; 2898 } 2899 2900 if (strcmp(physname, vswp->physname)) { 2901 D2(vswp, "%s: device name changed from %s to %s", 2902 __func__, vswp->physname, physname); 2903 2904 updated |= MD_physname; 2905 } else { 2906 D2(vswp, "%s: device name unchanged at %s", 2907 __func__, vswp->physname); 2908 } 2909 } else { 2910 cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " 2911 "device from updated MD.", vswp->instance); 2912 goto fail_reconf; 2913 } 2914 2915 /* 2916 * Check if MAC address has changed. 2917 */ 2918 if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { 2919 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", 2920 vswp->instance); 2921 goto fail_reconf; 2922 } else { 2923 READ_ENTER(&vswp->if_lockrw); 2924 for (i = ETHERADDRL - 1; i >= 0; i--) { 2925 if (vswp->if_addr.ether_addr_octet[i] 2926 != (macaddr & 0xFF)) { 2927 D2(vswp, "%s: octet[%d] 0x%x != 0x%x", 2928 __func__, i, 2929 vswp->if_addr.ether_addr_octet[i], 2930 (macaddr & 0xFF)); 2931 updated |= MD_macaddr; 2932 break; 2933 } 2934 macaddr >>= 8; 2935 } 2936 RW_EXIT(&vswp->if_lockrw); 2937 } 2938 2939 /* 2940 * Check if switching modes have changed. 2941 */ 2942 if (vsw_get_md_smodes(vswp, mdp, node, 2943 new_smode, &smode_num)) { 2944 cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD", 2945 vswp->instance, smode_propname); 2946 goto fail_reconf; 2947 } else { 2948 ASSERT(smode_num != 0); 2949 if (smode_num != vswp->smode_num) { 2950 D2(vswp, "%s: number of modes changed from %d to %d", 2951 __func__, vswp->smode_num, smode_num); 2952 } 2953 2954 for (i = 0; i < smode_num; i++) { 2955 if (new_smode[i] != vswp->smode[i]) { 2956 D2(vswp, "%s: mode changed from %d to %d", 2957 __func__, vswp->smode[i], new_smode[i]); 2958 updated |= MD_smode; 2959 break; 2960 } 2961 } 2962 } 2963 2964 /* 2965 * Now make any changes which are needed... 2966 */ 2967 2968 if (updated & (MD_physname | MD_smode)) { 2969 /* 2970 * Disconnect all ports from the current card 2971 */ 2972 WRITE_ENTER(&plist->lockrw); 2973 for (port = plist->head; port != NULL; port = port->p_next) { 2974 /* Remove address if was programmed into HW. */ 2975 mutex_enter(&vswp->hw_lock); 2976 if (vsw_unset_hw(vswp, port, VSW_VNETPORT)) { 2977 mutex_exit(&vswp->hw_lock); 2978 RW_EXIT(&plist->lockrw); 2979 goto fail_update; 2980 } 2981 mutex_exit(&vswp->hw_lock); 2982 } 2983 RW_EXIT(&plist->lockrw); 2984 2985 /* 2986 * Stop, detach the old device.. 2987 */ 2988 vsw_mac_detach(vswp); 2989 2990 /* 2991 * Update phys name. 2992 */ 2993 if (updated & MD_physname) { 2994 cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s", 2995 vswp->instance, vswp->physname, physname); 2996 (void) strncpy(vswp->physname, 2997 physname, strlen(physname) + 1); 2998 2999 if (strlen(vswp->physname) > 0) 3000 vswp->mdprops |= VSW_MD_PHYSNAME; 3001 } 3002 3003 /* 3004 * Update array with the new switch mode values. 3005 */ 3006 if (updated & MD_smode) { 3007 for (i = 0; i < smode_num; i++) 3008 vswp->smode[i] = new_smode[i]; 3009 3010 vswp->smode_num = smode_num; 3011 vswp->smode_idx = 0; 3012 } 3013 3014 /* 3015 * ..and attach, start the new device. 3016 */ 3017 if (vsw_setup_switching(vswp)) 3018 goto fail_update; 3019 3020 /* 3021 * Connect ports to new card. 3022 */ 3023 WRITE_ENTER(&plist->lockrw); 3024 for (port = plist->head; port != NULL; port = port->p_next) { 3025 mutex_enter(&vswp->hw_lock); 3026 if (vsw_set_hw(vswp, port, VSW_VNETPORT)) { 3027 mutex_exit(&vswp->hw_lock); 3028 RW_EXIT(&plist->lockrw); 3029 goto fail_update; 3030 } 3031 mutex_exit(&vswp->hw_lock); 3032 } 3033 RW_EXIT(&plist->lockrw); 3034 } 3035 3036 if (updated & MD_macaddr) { 3037 cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx", 3038 vswp->instance, macaddr); 3039 3040 WRITE_ENTER(&vswp->if_lockrw); 3041 for (i = ETHERADDRL - 1; i >= 0; i--) { 3042 vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; 3043 macaddr >>= 8; 3044 } 3045 RW_EXIT(&vswp->if_lockrw); 3046 3047 /* 3048 * Remove old address from HW (if programmed) and set 3049 * new address. 3050 */ 3051 mutex_enter(&vswp->hw_lock); 3052 (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); 3053 (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV); 3054 mutex_exit(&vswp->hw_lock); 3055 3056 /* 3057 * Notify the MAC layer of the changed address. 3058 */ 3059 mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr); 3060 } 3061 3062 return; 3063 3064 fail_reconf: 3065 cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance); 3066 return; 3067 3068 fail_update: 3069 cmn_err(CE_WARN, "!vsw%d: update of configuration failed", 3070 vswp->instance); 3071 } 3072 3073 /* 3074 * Add a new port to the system. 3075 * 3076 * Returns 0 on success, 1 on failure. 3077 */ 3078 int 3079 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) 3080 { 3081 uint64_t ldc_id; 3082 uint8_t *addrp; 3083 int i, addrsz; 3084 int num_nodes = 0, nchan = 0; 3085 int listsz = 0; 3086 mde_cookie_t *listp = NULL; 3087 struct ether_addr ea; 3088 uint64_t macaddr; 3089 uint64_t inst = 0; 3090 vsw_port_t *port; 3091 3092 if (md_get_prop_val(mdp, *node, id_propname, &inst)) { 3093 DWARN(vswp, "%s: prop(%s) not found", __func__, 3094 id_propname); 3095 return (1); 3096 } 3097 3098 /* 3099 * Find the channel endpoint node(s) (which should be under this 3100 * port node) which contain the channel id(s). 3101 */ 3102 if ((num_nodes = md_node_count(mdp)) <= 0) { 3103 DERR(vswp, "%s: invalid number of nodes found (%d)", 3104 __func__, num_nodes); 3105 return (1); 3106 } 3107 3108 D2(vswp, "%s: %d nodes found", __func__, num_nodes); 3109 3110 /* allocate enough space for node list */ 3111 listsz = num_nodes * sizeof (mde_cookie_t); 3112 listp = kmem_zalloc(listsz, KM_SLEEP); 3113 3114 nchan = md_scan_dag(mdp, *node, 3115 md_find_name(mdp, chan_propname), 3116 md_find_name(mdp, "fwd"), listp); 3117 3118 if (nchan <= 0) { 3119 DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname); 3120 kmem_free(listp, listsz); 3121 return (1); 3122 } 3123 3124 D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname); 3125 3126 /* use property from first node found */ 3127 if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) { 3128 DWARN(vswp, "%s: prop(%s) not found\n", __func__, 3129 id_propname); 3130 kmem_free(listp, listsz); 3131 return (1); 3132 } 3133 3134 /* don't need list any more */ 3135 kmem_free(listp, listsz); 3136 3137 D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id); 3138 3139 /* read mac-address property */ 3140 if (md_get_prop_data(mdp, *node, remaddr_propname, 3141 &addrp, &addrsz)) { 3142 DWARN(vswp, "%s: prop(%s) not found", 3143 __func__, remaddr_propname); 3144 return (1); 3145 } 3146 3147 if (addrsz < ETHERADDRL) { 3148 DWARN(vswp, "%s: invalid address size", __func__); 3149 return (1); 3150 } 3151 3152 macaddr = *((uint64_t *)addrp); 3153 D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr); 3154 3155 for (i = ETHERADDRL - 1; i >= 0; i--) { 3156 ea.ether_addr_octet[i] = macaddr & 0xFF; 3157 macaddr >>= 8; 3158 } 3159 3160 if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) { 3161 DERR(vswp, "%s: failed to attach port", __func__); 3162 return (1); 3163 } 3164 3165 port = vsw_lookup_port(vswp, (int)inst); 3166 3167 /* just successfuly created the port, so it should exist */ 3168 ASSERT(port != NULL); 3169 3170 return (0); 3171 } 3172 3173 /* 3174 * Attach the specified port. 3175 * 3176 * Returns 0 on success, 1 on failure. 3177 */ 3178 static int 3179 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids, 3180 struct ether_addr *macaddr) 3181 { 3182 vsw_port_list_t *plist = &vswp->plist; 3183 vsw_port_t *port, **prev_port; 3184 int i; 3185 3186 D1(vswp, "%s: enter : port %d", __func__, p_instance); 3187 3188 /* port already exists? */ 3189 READ_ENTER(&plist->lockrw); 3190 for (port = plist->head; port != NULL; port = port->p_next) { 3191 if (port->p_instance == p_instance) { 3192 DWARN(vswp, "%s: port instance %d already attached", 3193 __func__, p_instance); 3194 RW_EXIT(&plist->lockrw); 3195 return (1); 3196 } 3197 } 3198 RW_EXIT(&plist->lockrw); 3199 3200 port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); 3201 port->p_vswp = vswp; 3202 port->p_instance = p_instance; 3203 port->p_ldclist.num_ldcs = 0; 3204 port->p_ldclist.head = NULL; 3205 port->addr_set = VSW_ADDR_UNSET; 3206 3207 rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); 3208 3209 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); 3210 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); 3211 3212 mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); 3213 cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); 3214 3215 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); 3216 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); 3217 port->state = VSW_PORT_INIT; 3218 3219 if (nids > VSW_PORT_MAX_LDCS) { 3220 D2(vswp, "%s: using first of %d ldc ids", 3221 __func__, nids); 3222 nids = VSW_PORT_MAX_LDCS; 3223 } 3224 3225 D2(vswp, "%s: %d nids", __func__, nids); 3226 for (i = 0; i < nids; i++) { 3227 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); 3228 if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { 3229 DERR(vswp, "%s: ldc_attach failed", __func__); 3230 3231 rw_destroy(&port->p_ldclist.lockrw); 3232 3233 cv_destroy(&port->ref_cv); 3234 mutex_destroy(&port->ref_lock); 3235 3236 cv_destroy(&port->state_cv); 3237 mutex_destroy(&port->state_lock); 3238 3239 mutex_destroy(&port->tx_lock); 3240 mutex_destroy(&port->mca_lock); 3241 kmem_free(port, sizeof (vsw_port_t)); 3242 return (1); 3243 } 3244 } 3245 3246 ether_copy(macaddr, &port->p_macaddr); 3247 3248 WRITE_ENTER(&plist->lockrw); 3249 3250 /* create the fdb entry for this port/mac address */ 3251 (void) vsw_add_fdb(vswp, port); 3252 3253 mutex_enter(&vswp->hw_lock); 3254 (void) vsw_set_hw(vswp, port, VSW_VNETPORT); 3255 mutex_exit(&vswp->hw_lock); 3256 3257 /* link it into the list of ports for this vsw instance */ 3258 prev_port = (vsw_port_t **)(&plist->head); 3259 port->p_next = *prev_port; 3260 *prev_port = port; 3261 plist->num_ports++; 3262 RW_EXIT(&plist->lockrw); 3263 3264 /* 3265 * Initialise the port and any ldc's under it. 3266 */ 3267 (void) vsw_init_ldcs(port); 3268 3269 D1(vswp, "%s: exit", __func__); 3270 return (0); 3271 } 3272 3273 /* 3274 * Detach the specified port. 3275 * 3276 * Returns 0 on success, 1 on failure. 3277 */ 3278 static int 3279 vsw_port_detach(vsw_t *vswp, int p_instance) 3280 { 3281 vsw_port_t *port = NULL; 3282 vsw_port_list_t *plist = &vswp->plist; 3283 3284 D1(vswp, "%s: enter: port id %d", __func__, p_instance); 3285 3286 WRITE_ENTER(&plist->lockrw); 3287 3288 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { 3289 RW_EXIT(&plist->lockrw); 3290 return (1); 3291 } 3292 3293 if (vsw_plist_del_node(vswp, port)) { 3294 RW_EXIT(&plist->lockrw); 3295 return (1); 3296 } 3297 3298 /* Remove the fdb entry for this port/mac address */ 3299 (void) vsw_del_fdb(vswp, port); 3300 3301 /* Remove any multicast addresses.. */ 3302 vsw_del_mcst_port(port); 3303 3304 /* 3305 * No longer need to hold writer lock on port list now 3306 * that we have unlinked the target port from the list. 3307 */ 3308 RW_EXIT(&plist->lockrw); 3309 3310 /* Remove address if was programmed into HW. */ 3311 mutex_enter(&vswp->hw_lock); 3312 (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); 3313 if (vswp->recfg_reqd) 3314 vsw_reconfig_hw(vswp); 3315 mutex_exit(&vswp->hw_lock); 3316 3317 if (vsw_port_delete(port)) { 3318 return (1); 3319 } 3320 3321 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance); 3322 return (0); 3323 } 3324 3325 /* 3326 * Detach all active ports. 3327 * 3328 * Returns 0 on success, 1 on failure. 3329 */ 3330 static int 3331 vsw_detach_ports(vsw_t *vswp) 3332 { 3333 vsw_port_list_t *plist = &vswp->plist; 3334 vsw_port_t *port = NULL; 3335 3336 D1(vswp, "%s: enter", __func__); 3337 3338 WRITE_ENTER(&plist->lockrw); 3339 3340 while ((port = plist->head) != NULL) { 3341 if (vsw_plist_del_node(vswp, port)) { 3342 DERR(vswp, "%s: Error deleting port %d" 3343 " from port list", __func__, 3344 port->p_instance); 3345 RW_EXIT(&plist->lockrw); 3346 return (1); 3347 } 3348 3349 /* Remove address if was programmed into HW. */ 3350 mutex_enter(&vswp->hw_lock); 3351 (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); 3352 mutex_exit(&vswp->hw_lock); 3353 3354 /* Remove the fdb entry for this port/mac address */ 3355 (void) vsw_del_fdb(vswp, port); 3356 3357 /* Remove any multicast addresses.. */ 3358 vsw_del_mcst_port(port); 3359 3360 /* 3361 * No longer need to hold the lock on the port list 3362 * now that we have unlinked the target port from the 3363 * list. 3364 */ 3365 RW_EXIT(&plist->lockrw); 3366 if (vsw_port_delete(port)) { 3367 DERR(vswp, "%s: Error deleting port %d", 3368 __func__, port->p_instance); 3369 return (1); 3370 } 3371 WRITE_ENTER(&plist->lockrw); 3372 } 3373 RW_EXIT(&plist->lockrw); 3374 3375 D1(vswp, "%s: exit", __func__); 3376 3377 return (0); 3378 } 3379 3380 /* 3381 * Delete the specified port. 3382 * 3383 * Returns 0 on success, 1 on failure. 3384 */ 3385 static int 3386 vsw_port_delete(vsw_port_t *port) 3387 { 3388 vsw_ldc_list_t *ldcl; 3389 vsw_t *vswp = port->p_vswp; 3390 3391 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance); 3392 3393 (void) vsw_uninit_ldcs(port); 3394 3395 /* 3396 * Wait for any pending ctrl msg tasks which reference this 3397 * port to finish. 3398 */ 3399 if (vsw_drain_port_taskq(port)) 3400 return (1); 3401 3402 /* 3403 * Wait for port reference count to hit zero. 3404 */ 3405 mutex_enter(&port->ref_lock); 3406 while (port->ref_cnt != 0) 3407 cv_wait(&port->ref_cv, &port->ref_lock); 3408 mutex_exit(&port->ref_lock); 3409 3410 /* 3411 * Wait for any active callbacks to finish 3412 */ 3413 if (vsw_drain_ldcs(port)) 3414 return (1); 3415 3416 ldcl = &port->p_ldclist; 3417 WRITE_ENTER(&ldcl->lockrw); 3418 while (ldcl->num_ldcs > 0) { 3419 if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {; 3420 cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld", 3421 vswp->instance, ldcl->head->ldc_id); 3422 RW_EXIT(&ldcl->lockrw); 3423 return (1); 3424 } 3425 } 3426 RW_EXIT(&ldcl->lockrw); 3427 3428 rw_destroy(&port->p_ldclist.lockrw); 3429 3430 mutex_destroy(&port->mca_lock); 3431 mutex_destroy(&port->tx_lock); 3432 cv_destroy(&port->ref_cv); 3433 mutex_destroy(&port->ref_lock); 3434 3435 cv_destroy(&port->state_cv); 3436 mutex_destroy(&port->state_lock); 3437 3438 kmem_free(port, sizeof (vsw_port_t)); 3439 3440 D1(vswp, "%s: exit", __func__); 3441 3442 return (0); 3443 } 3444 3445 /* 3446 * Attach a logical domain channel (ldc) under a specified port. 3447 * 3448 * Returns 0 on success, 1 on failure. 3449 */ 3450 static int 3451 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) 3452 { 3453 vsw_t *vswp = port->p_vswp; 3454 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3455 vsw_ldc_t *ldcp = NULL; 3456 ldc_attr_t attr; 3457 ldc_status_t istatus; 3458 int status = DDI_FAILURE; 3459 int rv; 3460 enum { PROG_init = 0x0, PROG_mblks = 0x1, 3461 PROG_callback = 0x2} 3462 progress; 3463 3464 progress = PROG_init; 3465 3466 D1(vswp, "%s: enter", __func__); 3467 3468 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); 3469 if (ldcp == NULL) { 3470 DERR(vswp, "%s: kmem_zalloc failed", __func__); 3471 return (1); 3472 } 3473 ldcp->ldc_id = ldc_id; 3474 3475 /* allocate pool of receive mblks */ 3476 rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); 3477 if (rv) { 3478 DWARN(vswp, "%s: unable to create free mblk pool for" 3479 " channel %ld (rv %d)", __func__, ldc_id, rv); 3480 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3481 return (1); 3482 } 3483 3484 progress |= PROG_mblks; 3485 3486 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); 3487 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); 3488 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); 3489 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); 3490 rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL); 3491 rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL); 3492 3493 /* required for handshake with peer */ 3494 ldcp->local_session = (uint64_t)ddi_get_lbolt(); 3495 ldcp->peer_session = 0; 3496 ldcp->session_status = 0; 3497 3498 mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); 3499 ldcp->hss_id = 1; /* Initial handshake session id */ 3500 3501 /* only set for outbound lane, inbound set by peer */ 3502 mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); 3503 mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); 3504 vsw_set_lane_attr(vswp, &ldcp->lane_out); 3505 3506 attr.devclass = LDC_DEV_NT_SVC; 3507 attr.instance = ddi_get_instance(vswp->dip); 3508 attr.mode = LDC_MODE_UNRELIABLE; 3509 attr.mtu = VSW_LDC_MTU; 3510 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); 3511 if (status != 0) { 3512 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", 3513 __func__, ldc_id, status); 3514 goto ldc_attach_fail; 3515 } 3516 3517 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); 3518 if (status != 0) { 3519 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", 3520 __func__, ldc_id, status); 3521 (void) ldc_fini(ldcp->ldc_handle); 3522 goto ldc_attach_fail; 3523 } 3524 3525 progress |= PROG_callback; 3526 3527 mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL); 3528 3529 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 3530 DERR(vswp, "%s: ldc_status failed", __func__); 3531 mutex_destroy(&ldcp->status_lock); 3532 goto ldc_attach_fail; 3533 } 3534 3535 ldcp->ldc_status = istatus; 3536 ldcp->ldc_port = port; 3537 ldcp->ldc_vswp = vswp; 3538 3539 /* link it into the list of channels for this port */ 3540 WRITE_ENTER(&ldcl->lockrw); 3541 ldcp->ldc_next = ldcl->head; 3542 ldcl->head = ldcp; 3543 ldcl->num_ldcs++; 3544 RW_EXIT(&ldcl->lockrw); 3545 3546 D1(vswp, "%s: exit", __func__); 3547 return (0); 3548 3549 ldc_attach_fail: 3550 mutex_destroy(&ldcp->ldc_txlock); 3551 mutex_destroy(&ldcp->ldc_cblock); 3552 3553 cv_destroy(&ldcp->drain_cv); 3554 3555 rw_destroy(&ldcp->lane_in.dlistrw); 3556 rw_destroy(&ldcp->lane_out.dlistrw); 3557 3558 if (progress & PROG_callback) { 3559 (void) ldc_unreg_callback(ldcp->ldc_handle); 3560 } 3561 3562 if ((progress & PROG_mblks) && (ldcp->rxh != NULL)) { 3563 if (vio_destroy_mblks(ldcp->rxh) != 0) { 3564 /* 3565 * Something odd has happened, as the destroy 3566 * will only fail if some mblks have been allocated 3567 * from the pool already (which shouldn't happen) 3568 * and have not been returned. 3569 * 3570 * Add the pool pointer to a list maintained in 3571 * the device instance. Another attempt will be made 3572 * to free the pool when the device itself detaches. 3573 */ 3574 cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld " 3575 "failed and cannot destroy associated mblk " 3576 "pool", vswp->instance, ldc_id); 3577 ldcp->rxh->nextp = vswp->rxh; 3578 vswp->rxh = ldcp->rxh; 3579 } 3580 } 3581 mutex_destroy(&ldcp->drain_cv_lock); 3582 mutex_destroy(&ldcp->hss_lock); 3583 3584 mutex_destroy(&ldcp->lane_in.seq_lock); 3585 mutex_destroy(&ldcp->lane_out.seq_lock); 3586 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3587 3588 return (1); 3589 } 3590 3591 /* 3592 * Detach a logical domain channel (ldc) belonging to a 3593 * particular port. 3594 * 3595 * Returns 0 on success, 1 on failure. 3596 */ 3597 static int 3598 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) 3599 { 3600 vsw_t *vswp = port->p_vswp; 3601 vsw_ldc_t *ldcp, *prev_ldcp; 3602 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3603 int rv; 3604 3605 prev_ldcp = ldcl->head; 3606 for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) { 3607 if (ldcp->ldc_id == ldc_id) { 3608 break; 3609 } 3610 } 3611 3612 /* specified ldc id not found */ 3613 if (ldcp == NULL) { 3614 DERR(vswp, "%s: ldcp = NULL", __func__); 3615 return (1); 3616 } 3617 3618 D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id); 3619 3620 /* 3621 * Before we can close the channel we must release any mapped 3622 * resources (e.g. drings). 3623 */ 3624 vsw_free_lane_resources(ldcp, INBOUND); 3625 vsw_free_lane_resources(ldcp, OUTBOUND); 3626 3627 /* 3628 * If the close fails we are in serious trouble, as won't 3629 * be able to delete the parent port. 3630 */ 3631 if ((rv = ldc_close(ldcp->ldc_handle)) != 0) { 3632 DERR(vswp, "%s: error %d closing channel %lld", 3633 __func__, rv, ldcp->ldc_id); 3634 return (1); 3635 } 3636 3637 (void) ldc_fini(ldcp->ldc_handle); 3638 3639 ldcp->ldc_status = LDC_INIT; 3640 ldcp->ldc_handle = NULL; 3641 ldcp->ldc_vswp = NULL; 3642 3643 if (ldcp->rxh != NULL) { 3644 if (vio_destroy_mblks(ldcp->rxh)) { 3645 /* 3646 * Mostly likely some mblks are still in use and 3647 * have not been returned to the pool. Add the pool 3648 * to the list maintained in the device instance. 3649 * Another attempt will be made to destroy the pool 3650 * when the device detaches. 3651 */ 3652 ldcp->rxh->nextp = vswp->rxh; 3653 vswp->rxh = ldcp->rxh; 3654 } 3655 } 3656 3657 /* unlink it from the list */ 3658 prev_ldcp = ldcp->ldc_next; 3659 ldcl->num_ldcs--; 3660 3661 mutex_destroy(&ldcp->ldc_txlock); 3662 mutex_destroy(&ldcp->ldc_cblock); 3663 cv_destroy(&ldcp->drain_cv); 3664 mutex_destroy(&ldcp->drain_cv_lock); 3665 mutex_destroy(&ldcp->hss_lock); 3666 mutex_destroy(&ldcp->lane_in.seq_lock); 3667 mutex_destroy(&ldcp->lane_out.seq_lock); 3668 mutex_destroy(&ldcp->status_lock); 3669 rw_destroy(&ldcp->lane_in.dlistrw); 3670 rw_destroy(&ldcp->lane_out.dlistrw); 3671 3672 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3673 3674 return (0); 3675 } 3676 3677 /* 3678 * Open and attempt to bring up the channel. Note that channel 3679 * can only be brought up if peer has also opened channel. 3680 * 3681 * Returns 0 if can open and bring up channel, otherwise 3682 * returns 1. 3683 */ 3684 static int 3685 vsw_ldc_init(vsw_ldc_t *ldcp) 3686 { 3687 vsw_t *vswp = ldcp->ldc_vswp; 3688 ldc_status_t istatus = 0; 3689 int rv; 3690 3691 D1(vswp, "%s: enter", __func__); 3692 3693 LDC_ENTER_LOCK(ldcp); 3694 3695 /* don't start at 0 in case clients don't like that */ 3696 ldcp->next_ident = 1; 3697 3698 rv = ldc_open(ldcp->ldc_handle); 3699 if (rv != 0) { 3700 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", 3701 __func__, ldcp->ldc_id, rv); 3702 LDC_EXIT_LOCK(ldcp); 3703 return (1); 3704 } 3705 3706 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 3707 DERR(vswp, "%s: unable to get status", __func__); 3708 LDC_EXIT_LOCK(ldcp); 3709 return (1); 3710 3711 } else if (istatus != LDC_OPEN && istatus != LDC_READY) { 3712 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", 3713 __func__, ldcp->ldc_id, istatus); 3714 LDC_EXIT_LOCK(ldcp); 3715 return (1); 3716 } 3717 3718 mutex_enter(&ldcp->status_lock); 3719 ldcp->ldc_status = istatus; 3720 mutex_exit(&ldcp->status_lock); 3721 3722 rv = ldc_up(ldcp->ldc_handle); 3723 if (rv != 0) { 3724 /* 3725 * Not a fatal error for ldc_up() to fail, as peer 3726 * end point may simply not be ready yet. 3727 */ 3728 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 3729 ldcp->ldc_id, rv); 3730 LDC_EXIT_LOCK(ldcp); 3731 return (1); 3732 } 3733 3734 /* 3735 * ldc_up() call is non-blocking so need to explicitly 3736 * check channel status to see if in fact the channel 3737 * is UP. 3738 */ 3739 mutex_enter(&ldcp->status_lock); 3740 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { 3741 DERR(vswp, "%s: unable to get status", __func__); 3742 mutex_exit(&ldcp->status_lock); 3743 LDC_EXIT_LOCK(ldcp); 3744 return (1); 3745 3746 } 3747 3748 if (ldcp->ldc_status == LDC_UP) { 3749 D2(vswp, "%s: channel %ld now UP (%ld)", __func__, 3750 ldcp->ldc_id, istatus); 3751 mutex_exit(&ldcp->status_lock); 3752 LDC_EXIT_LOCK(ldcp); 3753 3754 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 3755 return (0); 3756 } 3757 3758 mutex_exit(&ldcp->status_lock); 3759 LDC_EXIT_LOCK(ldcp); 3760 3761 D1(vswp, "%s: exit", __func__); 3762 return (0); 3763 } 3764 3765 /* disable callbacks on the channel */ 3766 static int 3767 vsw_ldc_uninit(vsw_ldc_t *ldcp) 3768 { 3769 vsw_t *vswp = ldcp->ldc_vswp; 3770 int rv; 3771 3772 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); 3773 3774 LDC_ENTER_LOCK(ldcp); 3775 3776 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); 3777 if (rv != 0) { 3778 DERR(vswp, "vsw_ldc_uninit(%lld): error disabling " 3779 "interrupts (rv = %d)\n", ldcp->ldc_id, rv); 3780 LDC_EXIT_LOCK(ldcp); 3781 return (1); 3782 } 3783 3784 mutex_enter(&ldcp->status_lock); 3785 ldcp->ldc_status = LDC_INIT; 3786 mutex_exit(&ldcp->status_lock); 3787 3788 LDC_EXIT_LOCK(ldcp); 3789 3790 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); 3791 3792 return (0); 3793 } 3794 3795 static int 3796 vsw_init_ldcs(vsw_port_t *port) 3797 { 3798 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3799 vsw_ldc_t *ldcp; 3800 3801 READ_ENTER(&ldcl->lockrw); 3802 ldcp = ldcl->head; 3803 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3804 (void) vsw_ldc_init(ldcp); 3805 } 3806 RW_EXIT(&ldcl->lockrw); 3807 3808 return (0); 3809 } 3810 3811 static int 3812 vsw_uninit_ldcs(vsw_port_t *port) 3813 { 3814 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3815 vsw_ldc_t *ldcp; 3816 3817 D1(NULL, "vsw_uninit_ldcs: enter\n"); 3818 3819 READ_ENTER(&ldcl->lockrw); 3820 ldcp = ldcl->head; 3821 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3822 (void) vsw_ldc_uninit(ldcp); 3823 } 3824 RW_EXIT(&ldcl->lockrw); 3825 3826 D1(NULL, "vsw_uninit_ldcs: exit\n"); 3827 3828 return (0); 3829 } 3830 3831 /* 3832 * Wait until the callback(s) associated with the ldcs under the specified 3833 * port have completed. 3834 * 3835 * Prior to this function being invoked each channel under this port 3836 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3837 * 3838 * A short explaination of what we are doing below.. 3839 * 3840 * The simplest approach would be to have a reference counter in 3841 * the ldc structure which is increment/decremented by the callbacks as 3842 * they use the channel. The drain function could then simply disable any 3843 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately 3844 * there is a tiny window here - before the callback is able to get the lock 3845 * on the channel it is interrupted and this function gets to execute. It 3846 * sees that the ref count is zero and believes its free to delete the 3847 * associated data structures. 3848 * 3849 * We get around this by taking advantage of the fact that before the ldc 3850 * framework invokes a callback it sets a flag to indicate that there is a 3851 * callback active (or about to become active). If when we attempt to 3852 * unregister a callback when this active flag is set then the unregister 3853 * will fail with EWOULDBLOCK. 3854 * 3855 * If the unregister fails we do a cv_timedwait. We will either be signaled 3856 * by the callback as it is exiting (note we have to wait a short period to 3857 * allow the callback to return fully to the ldc framework and it to clear 3858 * the active flag), or by the timer expiring. In either case we again attempt 3859 * the unregister. We repeat this until we can succesfully unregister the 3860 * callback. 3861 * 3862 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch 3863 * the case where the callback has finished but the ldc framework has not yet 3864 * cleared the active flag. In this case we would never get a cv_signal. 3865 */ 3866 static int 3867 vsw_drain_ldcs(vsw_port_t *port) 3868 { 3869 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3870 vsw_ldc_t *ldcp; 3871 vsw_t *vswp = port->p_vswp; 3872 3873 D1(vswp, "%s: enter", __func__); 3874 3875 READ_ENTER(&ldcl->lockrw); 3876 3877 ldcp = ldcl->head; 3878 3879 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3880 /* 3881 * If we can unregister the channel callback then we 3882 * know that there is no callback either running or 3883 * scheduled to run for this channel so move on to next 3884 * channel in the list. 3885 */ 3886 mutex_enter(&ldcp->drain_cv_lock); 3887 3888 /* prompt active callbacks to quit */ 3889 ldcp->drain_state = VSW_LDC_DRAINING; 3890 3891 if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { 3892 D2(vswp, "%s: unreg callback for chan %ld", __func__, 3893 ldcp->ldc_id); 3894 mutex_exit(&ldcp->drain_cv_lock); 3895 continue; 3896 } else { 3897 /* 3898 * If we end up here we know that either 1) a callback 3899 * is currently executing, 2) is about to start (i.e. 3900 * the ldc framework has set the active flag but 3901 * has not actually invoked the callback yet, or 3) 3902 * has finished and has returned to the ldc framework 3903 * but the ldc framework has not yet cleared the 3904 * active bit. 3905 * 3906 * Wait for it to finish. 3907 */ 3908 while (ldc_unreg_callback(ldcp->ldc_handle) 3909 == EWOULDBLOCK) 3910 (void) cv_timedwait(&ldcp->drain_cv, 3911 &ldcp->drain_cv_lock, lbolt + hz); 3912 3913 mutex_exit(&ldcp->drain_cv_lock); 3914 D2(vswp, "%s: unreg callback for chan %ld after " 3915 "timeout", __func__, ldcp->ldc_id); 3916 } 3917 } 3918 RW_EXIT(&ldcl->lockrw); 3919 3920 D1(vswp, "%s: exit", __func__); 3921 return (0); 3922 } 3923 3924 /* 3925 * Wait until all tasks which reference this port have completed. 3926 * 3927 * Prior to this function being invoked each channel under this port 3928 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3929 */ 3930 static int 3931 vsw_drain_port_taskq(vsw_port_t *port) 3932 { 3933 vsw_t *vswp = port->p_vswp; 3934 3935 D1(vswp, "%s: enter", __func__); 3936 3937 /* 3938 * Mark the port as in the process of being detached, and 3939 * dispatch a marker task to the queue so we know when all 3940 * relevant tasks have completed. 3941 */ 3942 mutex_enter(&port->state_lock); 3943 port->state = VSW_PORT_DETACHING; 3944 3945 if ((vswp->taskq_p == NULL) || 3946 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, 3947 port, DDI_NOSLEEP) != DDI_SUCCESS)) { 3948 DERR(vswp, "%s: unable to dispatch marker task", 3949 __func__); 3950 mutex_exit(&port->state_lock); 3951 return (1); 3952 } 3953 3954 /* 3955 * Wait for the marker task to finish. 3956 */ 3957 while (port->state != VSW_PORT_DETACHABLE) 3958 cv_wait(&port->state_cv, &port->state_lock); 3959 3960 mutex_exit(&port->state_lock); 3961 3962 D1(vswp, "%s: exit", __func__); 3963 3964 return (0); 3965 } 3966 3967 static void 3968 vsw_marker_task(void *arg) 3969 { 3970 vsw_port_t *port = arg; 3971 vsw_t *vswp = port->p_vswp; 3972 3973 D1(vswp, "%s: enter", __func__); 3974 3975 mutex_enter(&port->state_lock); 3976 3977 /* 3978 * No further tasks should be dispatched which reference 3979 * this port so ok to mark it as safe to detach. 3980 */ 3981 port->state = VSW_PORT_DETACHABLE; 3982 3983 cv_signal(&port->state_cv); 3984 3985 mutex_exit(&port->state_lock); 3986 3987 D1(vswp, "%s: exit", __func__); 3988 } 3989 3990 static vsw_port_t * 3991 vsw_lookup_port(vsw_t *vswp, int p_instance) 3992 { 3993 vsw_port_list_t *plist = &vswp->plist; 3994 vsw_port_t *port; 3995 3996 for (port = plist->head; port != NULL; port = port->p_next) { 3997 if (port->p_instance == p_instance) { 3998 D2(vswp, "vsw_lookup_port: found p_instance\n"); 3999 return (port); 4000 } 4001 } 4002 4003 return (NULL); 4004 } 4005 4006 /* 4007 * Search for and remove the specified port from the port 4008 * list. Returns 0 if able to locate and remove port, otherwise 4009 * returns 1. 4010 */ 4011 static int 4012 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) 4013 { 4014 vsw_port_list_t *plist = &vswp->plist; 4015 vsw_port_t *curr_p, *prev_p; 4016 4017 if (plist->head == NULL) 4018 return (1); 4019 4020 curr_p = prev_p = plist->head; 4021 4022 while (curr_p != NULL) { 4023 if (curr_p == port) { 4024 if (prev_p == curr_p) { 4025 plist->head = curr_p->p_next; 4026 } else { 4027 prev_p->p_next = curr_p->p_next; 4028 } 4029 plist->num_ports--; 4030 break; 4031 } else { 4032 prev_p = curr_p; 4033 curr_p = curr_p->p_next; 4034 } 4035 } 4036 return (0); 4037 } 4038 4039 /* 4040 * Interrupt handler for ldc messages. 4041 */ 4042 static uint_t 4043 vsw_ldc_cb(uint64_t event, caddr_t arg) 4044 { 4045 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 4046 vsw_t *vswp = ldcp->ldc_vswp; 4047 4048 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 4049 4050 mutex_enter(&ldcp->ldc_cblock); 4051 4052 mutex_enter(&ldcp->status_lock); 4053 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { 4054 mutex_exit(&ldcp->status_lock); 4055 mutex_exit(&ldcp->ldc_cblock); 4056 return (LDC_SUCCESS); 4057 } 4058 mutex_exit(&ldcp->status_lock); 4059 4060 if (event & LDC_EVT_UP) { 4061 /* 4062 * Channel has come up. 4063 */ 4064 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", 4065 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 4066 4067 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 4068 4069 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 4070 } 4071 4072 if (event & LDC_EVT_READ) { 4073 /* 4074 * Data available for reading. 4075 */ 4076 D2(vswp, "%s: id(ld) event(%llx) data READ", 4077 __func__, ldcp->ldc_id, event); 4078 4079 vsw_process_pkt(ldcp); 4080 4081 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 4082 4083 goto vsw_cb_exit; 4084 } 4085 4086 if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) { 4087 D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)", 4088 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 4089 4090 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 4091 } 4092 4093 /* 4094 * Catch either LDC_EVT_WRITE which we don't support or any 4095 * unknown event. 4096 */ 4097 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET 4098 | LDC_EVT_DOWN | LDC_EVT_READ)) { 4099 4100 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", 4101 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 4102 } 4103 4104 vsw_cb_exit: 4105 mutex_exit(&ldcp->ldc_cblock); 4106 4107 /* 4108 * Let the drain function know we are finishing if it 4109 * is waiting. 4110 */ 4111 mutex_enter(&ldcp->drain_cv_lock); 4112 if (ldcp->drain_state == VSW_LDC_DRAINING) 4113 cv_signal(&ldcp->drain_cv); 4114 mutex_exit(&ldcp->drain_cv_lock); 4115 4116 return (LDC_SUCCESS); 4117 } 4118 4119 /* 4120 * Reinitialise data structures associated with the channel. 4121 */ 4122 static void 4123 vsw_ldc_reinit(vsw_ldc_t *ldcp) 4124 { 4125 vsw_t *vswp = ldcp->ldc_vswp; 4126 vsw_port_t *port; 4127 vsw_ldc_list_t *ldcl; 4128 4129 D1(vswp, "%s: enter", __func__); 4130 4131 port = ldcp->ldc_port; 4132 ldcl = &port->p_ldclist; 4133 4134 READ_ENTER(&ldcl->lockrw); 4135 4136 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, 4137 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 4138 4139 vsw_free_lane_resources(ldcp, INBOUND); 4140 vsw_free_lane_resources(ldcp, OUTBOUND); 4141 RW_EXIT(&ldcl->lockrw); 4142 4143 ldcp->lane_in.lstate = 0; 4144 ldcp->lane_out.lstate = 0; 4145 4146 /* 4147 * Remove parent port from any multicast groups 4148 * it may have registered with. Client must resend 4149 * multicast add command after handshake completes. 4150 */ 4151 (void) vsw_del_fdb(vswp, port); 4152 4153 vsw_del_mcst_port(port); 4154 4155 ldcp->peer_session = 0; 4156 ldcp->session_status = 0; 4157 ldcp->hcnt = 0; 4158 ldcp->hphase = VSW_MILESTONE0; 4159 4160 D1(vswp, "%s: exit", __func__); 4161 } 4162 4163 /* 4164 * Process a connection event. 4165 * 4166 * Note - care must be taken to ensure that this function is 4167 * not called with the dlistrw lock held. 4168 */ 4169 static void 4170 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt) 4171 { 4172 vsw_t *vswp = ldcp->ldc_vswp; 4173 vsw_conn_evt_t *conn = NULL; 4174 4175 D1(vswp, "%s: enter", __func__); 4176 4177 /* 4178 * Check if either a reset or restart event is pending 4179 * or in progress. If so just return. 4180 * 4181 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT 4182 * being received by the callback handler, or a ECONNRESET error 4183 * code being returned from a ldc_read() or ldc_write() call. 4184 * 4185 * A VSW_CONN_RESTART event occurs when some error checking code 4186 * decides that there is a problem with data from the channel, 4187 * and that the handshake should be restarted. 4188 */ 4189 if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) && 4190 (ldstub((uint8_t *)&ldcp->reset_active))) 4191 return; 4192 4193 /* 4194 * If it is an LDC_UP event we first check the recorded 4195 * state of the channel. If this is UP then we know that 4196 * the channel moving to the UP state has already been dealt 4197 * with and don't need to dispatch a new task. 4198 * 4199 * The reason for this check is that when we do a ldc_up(), 4200 * depending on the state of the peer, we may or may not get 4201 * a LDC_UP event. As we can't depend on getting a LDC_UP evt 4202 * every time we do ldc_up() we explicitly check the channel 4203 * status to see has it come up (ldc_up() is asynch and will 4204 * complete at some undefined time), and take the appropriate 4205 * action. 4206 * 4207 * The flip side of this is that we may get a LDC_UP event 4208 * when we have already seen that the channel is up and have 4209 * dealt with that. 4210 */ 4211 mutex_enter(&ldcp->status_lock); 4212 if (evt == VSW_CONN_UP) { 4213 if ((ldcp->ldc_status == LDC_UP) || 4214 (ldcp->reset_active != 0)) { 4215 mutex_exit(&ldcp->status_lock); 4216 return; 4217 } 4218 } 4219 mutex_exit(&ldcp->status_lock); 4220 4221 /* 4222 * The transaction group id allows us to identify and discard 4223 * any tasks which are still pending on the taskq and refer 4224 * to the handshake session we are about to restart or reset. 4225 * These stale messages no longer have any real meaning. 4226 */ 4227 mutex_enter(&ldcp->hss_lock); 4228 ldcp->hss_id++; 4229 mutex_exit(&ldcp->hss_lock); 4230 4231 ASSERT(vswp->taskq_p != NULL); 4232 4233 if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) { 4234 cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for" 4235 " connection event", vswp->instance); 4236 goto err_exit; 4237 } 4238 4239 conn->evt = evt; 4240 conn->ldcp = ldcp; 4241 4242 if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn, 4243 DDI_NOSLEEP) != DDI_SUCCESS) { 4244 cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task", 4245 vswp->instance); 4246 4247 kmem_free(conn, sizeof (vsw_conn_evt_t)); 4248 goto err_exit; 4249 } 4250 4251 D1(vswp, "%s: exit", __func__); 4252 return; 4253 4254 err_exit: 4255 /* 4256 * Have mostly likely failed due to memory shortage. Clear the flag so 4257 * that future requests will at least be attempted and will hopefully 4258 * succeed. 4259 */ 4260 if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) 4261 ldcp->reset_active = 0; 4262 } 4263 4264 /* 4265 * Deal with events relating to a connection. Invoked from a taskq. 4266 */ 4267 static void 4268 vsw_conn_task(void *arg) 4269 { 4270 vsw_conn_evt_t *conn = (vsw_conn_evt_t *)arg; 4271 vsw_ldc_t *ldcp = NULL; 4272 vsw_t *vswp = NULL; 4273 uint16_t evt; 4274 ldc_status_t curr_status; 4275 4276 ldcp = conn->ldcp; 4277 evt = conn->evt; 4278 vswp = ldcp->ldc_vswp; 4279 4280 D1(vswp, "%s: enter", __func__); 4281 4282 /* can safely free now have copied out data */ 4283 kmem_free(conn, sizeof (vsw_conn_evt_t)); 4284 4285 mutex_enter(&ldcp->status_lock); 4286 if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { 4287 cmn_err(CE_WARN, "!vsw%d: Unable to read status of " 4288 "channel %ld", vswp->instance, ldcp->ldc_id); 4289 mutex_exit(&ldcp->status_lock); 4290 return; 4291 } 4292 4293 /* 4294 * If we wish to restart the handshake on this channel, then if 4295 * the channel is UP we bring it DOWN to flush the underlying 4296 * ldc queue. 4297 */ 4298 if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP)) 4299 (void) ldc_down(ldcp->ldc_handle); 4300 4301 /* 4302 * re-init all the associated data structures. 4303 */ 4304 vsw_ldc_reinit(ldcp); 4305 4306 /* 4307 * Bring the channel back up (note it does no harm to 4308 * do this even if the channel is already UP, Just 4309 * becomes effectively a no-op). 4310 */ 4311 (void) ldc_up(ldcp->ldc_handle); 4312 4313 /* 4314 * Check if channel is now UP. This will only happen if 4315 * peer has also done a ldc_up(). 4316 */ 4317 if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { 4318 cmn_err(CE_WARN, "!vsw%d: Unable to read status of " 4319 "channel %ld", vswp->instance, ldcp->ldc_id); 4320 mutex_exit(&ldcp->status_lock); 4321 return; 4322 } 4323 4324 ldcp->ldc_status = curr_status; 4325 4326 /* channel UP so restart handshake by sending version info */ 4327 if (curr_status == LDC_UP) { 4328 if (ldcp->hcnt++ > vsw_num_handshakes) { 4329 cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted" 4330 " handshake attempts (%d) on channel %ld", 4331 vswp->instance, ldcp->hcnt, ldcp->ldc_id); 4332 mutex_exit(&ldcp->status_lock); 4333 return; 4334 } 4335 4336 if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp, 4337 DDI_NOSLEEP) != DDI_SUCCESS) { 4338 cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task", 4339 vswp->instance); 4340 4341 /* 4342 * Don't count as valid restart attempt if couldn't 4343 * send version msg. 4344 */ 4345 if (ldcp->hcnt > 0) 4346 ldcp->hcnt--; 4347 } 4348 } 4349 4350 /* 4351 * Mark that the process is complete by clearing the flag. 4352 * 4353 * Note is it possible that the taskq dispatch above may have failed, 4354 * most likely due to memory shortage. We still clear the flag so 4355 * future attempts will at least be attempted and will hopefully 4356 * succeed. 4357 */ 4358 if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) 4359 ldcp->reset_active = 0; 4360 4361 mutex_exit(&ldcp->status_lock); 4362 4363 D1(vswp, "%s: exit", __func__); 4364 } 4365 4366 /* 4367 * returns 0 if legal for event signified by flag to have 4368 * occured at the time it did. Otherwise returns 1. 4369 */ 4370 int 4371 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) 4372 { 4373 vsw_t *vswp = ldcp->ldc_vswp; 4374 uint64_t state; 4375 uint64_t phase; 4376 4377 if (dir == INBOUND) 4378 state = ldcp->lane_in.lstate; 4379 else 4380 state = ldcp->lane_out.lstate; 4381 4382 phase = ldcp->hphase; 4383 4384 switch (flag) { 4385 case VSW_VER_INFO_RECV: 4386 if (phase > VSW_MILESTONE0) { 4387 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" 4388 " when in state %d\n", ldcp->ldc_id, phase); 4389 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4390 return (1); 4391 } 4392 break; 4393 4394 case VSW_VER_ACK_RECV: 4395 case VSW_VER_NACK_RECV: 4396 if (!(state & VSW_VER_INFO_SENT)) { 4397 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK" 4398 " or VER_NACK when in state %d\n", 4399 ldcp->ldc_id, phase); 4400 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4401 return (1); 4402 } else 4403 state &= ~VSW_VER_INFO_SENT; 4404 break; 4405 4406 case VSW_ATTR_INFO_RECV: 4407 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { 4408 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" 4409 " when in state %d\n", ldcp->ldc_id, phase); 4410 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4411 return (1); 4412 } 4413 break; 4414 4415 case VSW_ATTR_ACK_RECV: 4416 case VSW_ATTR_NACK_RECV: 4417 if (!(state & VSW_ATTR_INFO_SENT)) { 4418 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" 4419 " or ATTR_NACK when in state %d\n", 4420 ldcp->ldc_id, phase); 4421 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4422 return (1); 4423 } else 4424 state &= ~VSW_ATTR_INFO_SENT; 4425 break; 4426 4427 case VSW_DRING_INFO_RECV: 4428 if (phase < VSW_MILESTONE1) { 4429 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" 4430 " when in state %d\n", ldcp->ldc_id, phase); 4431 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4432 return (1); 4433 } 4434 break; 4435 4436 case VSW_DRING_ACK_RECV: 4437 case VSW_DRING_NACK_RECV: 4438 if (!(state & VSW_DRING_INFO_SENT)) { 4439 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" 4440 " or DRING_NACK when in state %d\n", 4441 ldcp->ldc_id, phase); 4442 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4443 return (1); 4444 } else 4445 state &= ~VSW_DRING_INFO_SENT; 4446 break; 4447 4448 case VSW_RDX_INFO_RECV: 4449 if (phase < VSW_MILESTONE3) { 4450 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" 4451 " when in state %d\n", ldcp->ldc_id, phase); 4452 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4453 return (1); 4454 } 4455 break; 4456 4457 case VSW_RDX_ACK_RECV: 4458 case VSW_RDX_NACK_RECV: 4459 if (!(state & VSW_RDX_INFO_SENT)) { 4460 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK" 4461 " or RDX_NACK when in state %d\n", 4462 ldcp->ldc_id, phase); 4463 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4464 return (1); 4465 } else 4466 state &= ~VSW_RDX_INFO_SENT; 4467 break; 4468 4469 case VSW_MCST_INFO_RECV: 4470 if (phase < VSW_MILESTONE3) { 4471 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" 4472 " when in state %d\n", ldcp->ldc_id, phase); 4473 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4474 return (1); 4475 } 4476 break; 4477 4478 default: 4479 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", 4480 ldcp->ldc_id, flag); 4481 return (1); 4482 } 4483 4484 if (dir == INBOUND) 4485 ldcp->lane_in.lstate = state; 4486 else 4487 ldcp->lane_out.lstate = state; 4488 4489 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); 4490 4491 return (0); 4492 } 4493 4494 void 4495 vsw_next_milestone(vsw_ldc_t *ldcp) 4496 { 4497 vsw_t *vswp = ldcp->ldc_vswp; 4498 4499 D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__, 4500 ldcp->ldc_id, ldcp->hphase); 4501 4502 DUMP_FLAGS(ldcp->lane_in.lstate); 4503 DUMP_FLAGS(ldcp->lane_out.lstate); 4504 4505 switch (ldcp->hphase) { 4506 4507 case VSW_MILESTONE0: 4508 /* 4509 * If we haven't started to handshake with our peer, 4510 * start to do so now. 4511 */ 4512 if (ldcp->lane_out.lstate == 0) { 4513 D2(vswp, "%s: (chan %lld) starting handshake " 4514 "with peer", __func__, ldcp->ldc_id); 4515 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 4516 } 4517 4518 /* 4519 * Only way to pass this milestone is to have successfully 4520 * negotiated version info. 4521 */ 4522 if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) && 4523 (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) { 4524 4525 D2(vswp, "%s: (chan %lld) leaving milestone 0", 4526 __func__, ldcp->ldc_id); 4527 4528 /* 4529 * Next milestone is passed when attribute 4530 * information has been successfully exchanged. 4531 */ 4532 ldcp->hphase = VSW_MILESTONE1; 4533 vsw_send_attr(ldcp); 4534 4535 } 4536 break; 4537 4538 case VSW_MILESTONE1: 4539 /* 4540 * Only way to pass this milestone is to have successfully 4541 * negotiated attribute information. 4542 */ 4543 if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) { 4544 4545 ldcp->hphase = VSW_MILESTONE2; 4546 4547 /* 4548 * If the peer device has said it wishes to 4549 * use descriptor rings then we send it our ring 4550 * info, otherwise we just set up a private ring 4551 * which we use an internal buffer 4552 */ 4553 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) 4554 vsw_send_dring_info(ldcp); 4555 } 4556 break; 4557 4558 case VSW_MILESTONE2: 4559 /* 4560 * If peer has indicated in its attribute message that 4561 * it wishes to use descriptor rings then the only way 4562 * to pass this milestone is for us to have received 4563 * valid dring info. 4564 * 4565 * If peer is not using descriptor rings then just fall 4566 * through. 4567 */ 4568 if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) && 4569 (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))) 4570 break; 4571 4572 D2(vswp, "%s: (chan %lld) leaving milestone 2", 4573 __func__, ldcp->ldc_id); 4574 4575 ldcp->hphase = VSW_MILESTONE3; 4576 vsw_send_rdx(ldcp); 4577 break; 4578 4579 case VSW_MILESTONE3: 4580 /* 4581 * Pass this milestone when all paramaters have been 4582 * successfully exchanged and RDX sent in both directions. 4583 * 4584 * Mark outbound lane as available to transmit data. 4585 */ 4586 if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) && 4587 (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) { 4588 4589 D2(vswp, "%s: (chan %lld) leaving milestone 3", 4590 __func__, ldcp->ldc_id); 4591 D2(vswp, "%s: ** handshake complete (0x%llx : " 4592 "0x%llx) **", __func__, ldcp->lane_in.lstate, 4593 ldcp->lane_out.lstate); 4594 ldcp->lane_out.lstate |= VSW_LANE_ACTIVE; 4595 ldcp->hphase = VSW_MILESTONE4; 4596 ldcp->hcnt = 0; 4597 DISPLAY_STATE(); 4598 } else { 4599 D2(vswp, "%s: still in milestone 3 (0x%llx :" 4600 " 0x%llx", __func__, ldcp->lane_in.lstate, 4601 ldcp->lane_out.lstate); 4602 } 4603 break; 4604 4605 case VSW_MILESTONE4: 4606 D2(vswp, "%s: (chan %lld) in milestone 4", __func__, 4607 ldcp->ldc_id); 4608 break; 4609 4610 default: 4611 DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__, 4612 ldcp->ldc_id, ldcp->hphase); 4613 } 4614 4615 D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id, 4616 ldcp->hphase); 4617 } 4618 4619 /* 4620 * Check if major version is supported. 4621 * 4622 * Returns 0 if finds supported major number, and if necessary 4623 * adjusts the minor field. 4624 * 4625 * Returns 1 if can't match major number exactly. Sets mjor/minor 4626 * to next lowest support values, or to zero if no other values possible. 4627 */ 4628 static int 4629 vsw_supported_version(vio_ver_msg_t *vp) 4630 { 4631 int i; 4632 4633 D1(NULL, "vsw_supported_version: enter"); 4634 4635 for (i = 0; i < VSW_NUM_VER; i++) { 4636 if (vsw_versions[i].ver_major == vp->ver_major) { 4637 /* 4638 * Matching or lower major version found. Update 4639 * minor number if necessary. 4640 */ 4641 if (vp->ver_minor > vsw_versions[i].ver_minor) { 4642 D2(NULL, "%s: adjusting minor value" 4643 " from %d to %d", __func__, 4644 vp->ver_minor, 4645 vsw_versions[i].ver_minor); 4646 vp->ver_minor = vsw_versions[i].ver_minor; 4647 } 4648 4649 return (0); 4650 } 4651 4652 if (vsw_versions[i].ver_major < vp->ver_major) { 4653 if (vp->ver_minor > vsw_versions[i].ver_minor) { 4654 D2(NULL, "%s: adjusting minor value" 4655 " from %d to %d", __func__, 4656 vp->ver_minor, 4657 vsw_versions[i].ver_minor); 4658 vp->ver_minor = vsw_versions[i].ver_minor; 4659 } 4660 return (1); 4661 } 4662 } 4663 4664 /* No match was possible, zero out fields */ 4665 vp->ver_major = 0; 4666 vp->ver_minor = 0; 4667 4668 D1(NULL, "vsw_supported_version: exit"); 4669 4670 return (1); 4671 } 4672 4673 /* 4674 * Main routine for processing messages received over LDC. 4675 */ 4676 static void 4677 vsw_process_pkt(void *arg) 4678 { 4679 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 4680 vsw_t *vswp = ldcp->ldc_vswp; 4681 size_t msglen; 4682 vio_msg_tag_t tag; 4683 def_msg_t dmsg; 4684 int rv = 0; 4685 4686 4687 D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 4688 4689 /* 4690 * If channel is up read messages until channel is empty. 4691 */ 4692 do { 4693 msglen = sizeof (dmsg); 4694 rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen); 4695 4696 if (rv != 0) { 4697 DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) " 4698 "len(%d)\n", __func__, ldcp->ldc_id, 4699 rv, msglen); 4700 } 4701 4702 /* channel has been reset */ 4703 if (rv == ECONNRESET) { 4704 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 4705 break; 4706 } 4707 4708 if (msglen == 0) { 4709 D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__, 4710 ldcp->ldc_id); 4711 break; 4712 } 4713 4714 D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__, 4715 ldcp->ldc_id, msglen); 4716 4717 /* 4718 * Figure out what sort of packet we have gotten by 4719 * examining the msg tag, and then switch it appropriately. 4720 */ 4721 bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t)); 4722 4723 switch (tag.vio_msgtype) { 4724 case VIO_TYPE_CTRL: 4725 vsw_dispatch_ctrl_task(ldcp, &dmsg, tag); 4726 break; 4727 case VIO_TYPE_DATA: 4728 vsw_process_data_pkt(ldcp, &dmsg, tag); 4729 break; 4730 case VIO_TYPE_ERR: 4731 vsw_process_err_pkt(ldcp, &dmsg, tag); 4732 break; 4733 default: 4734 DERR(vswp, "%s: Unknown tag(%lx) ", __func__, 4735 "id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id); 4736 break; 4737 } 4738 } while (msglen); 4739 4740 D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id); 4741 } 4742 4743 /* 4744 * Dispatch a task to process a VIO control message. 4745 */ 4746 static void 4747 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag) 4748 { 4749 vsw_ctrl_task_t *ctaskp = NULL; 4750 vsw_port_t *port = ldcp->ldc_port; 4751 vsw_t *vswp = port->p_vswp; 4752 4753 D1(vswp, "%s: enter", __func__); 4754 4755 /* 4756 * We need to handle RDX ACK messages in-band as once they 4757 * are exchanged it is possible that we will get an 4758 * immediate (legitimate) data packet. 4759 */ 4760 if ((tag.vio_subtype_env == VIO_RDX) && 4761 (tag.vio_subtype == VIO_SUBTYPE_ACK)) { 4762 4763 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV)) 4764 return; 4765 4766 ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV; 4767 D2(vswp, "%s (%ld) handling RDX_ACK in place " 4768 "(ostate 0x%llx : hphase %d)", __func__, 4769 ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase); 4770 vsw_next_milestone(ldcp); 4771 return; 4772 } 4773 4774 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); 4775 4776 if (ctaskp == NULL) { 4777 DERR(vswp, "%s: unable to alloc space for ctrl" 4778 " msg", __func__); 4779 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4780 return; 4781 } 4782 4783 ctaskp->ldcp = ldcp; 4784 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t)); 4785 mutex_enter(&ldcp->hss_lock); 4786 ctaskp->hss_id = ldcp->hss_id; 4787 mutex_exit(&ldcp->hss_lock); 4788 4789 /* 4790 * Dispatch task to processing taskq if port is not in 4791 * the process of being detached. 4792 */ 4793 mutex_enter(&port->state_lock); 4794 if (port->state == VSW_PORT_INIT) { 4795 if ((vswp->taskq_p == NULL) || 4796 (ddi_taskq_dispatch(vswp->taskq_p, 4797 vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP) 4798 != DDI_SUCCESS)) { 4799 DERR(vswp, "%s: unable to dispatch task to taskq", 4800 __func__); 4801 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4802 mutex_exit(&port->state_lock); 4803 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4804 return; 4805 } 4806 } else { 4807 DWARN(vswp, "%s: port %d detaching, not dispatching " 4808 "task", __func__, port->p_instance); 4809 } 4810 4811 mutex_exit(&port->state_lock); 4812 4813 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, 4814 ldcp->ldc_id); 4815 D1(vswp, "%s: exit", __func__); 4816 } 4817 4818 /* 4819 * Process a VIO ctrl message. Invoked from taskq. 4820 */ 4821 static void 4822 vsw_process_ctrl_pkt(void *arg) 4823 { 4824 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; 4825 vsw_ldc_t *ldcp = ctaskp->ldcp; 4826 vsw_t *vswp = ldcp->ldc_vswp; 4827 vio_msg_tag_t tag; 4828 uint16_t env; 4829 4830 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4831 4832 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); 4833 env = tag.vio_subtype_env; 4834 4835 /* stale pkt check */ 4836 mutex_enter(&ldcp->hss_lock); 4837 if (ctaskp->hss_id < ldcp->hss_id) { 4838 DWARN(vswp, "%s: discarding stale packet belonging to" 4839 " earlier (%ld) handshake session", __func__, 4840 ctaskp->hss_id); 4841 mutex_exit(&ldcp->hss_lock); 4842 return; 4843 } 4844 mutex_exit(&ldcp->hss_lock); 4845 4846 /* session id check */ 4847 if (ldcp->session_status & VSW_PEER_SESSION) { 4848 if (ldcp->peer_session != tag.vio_sid) { 4849 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 4850 __func__, ldcp->ldc_id, tag.vio_sid); 4851 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4852 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4853 return; 4854 } 4855 } 4856 4857 /* 4858 * Switch on vio_subtype envelope, then let lower routines 4859 * decide if its an INFO, ACK or NACK packet. 4860 */ 4861 switch (env) { 4862 case VIO_VER_INFO: 4863 vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp); 4864 break; 4865 case VIO_DRING_REG: 4866 vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp); 4867 break; 4868 case VIO_DRING_UNREG: 4869 vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp); 4870 break; 4871 case VIO_ATTR_INFO: 4872 vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp); 4873 break; 4874 case VNET_MCAST_INFO: 4875 vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp); 4876 break; 4877 case VIO_RDX: 4878 vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp); 4879 break; 4880 default: 4881 DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", 4882 __func__, env); 4883 } 4884 4885 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4886 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4887 } 4888 4889 /* 4890 * Version negotiation. We can end up here either because our peer 4891 * has responded to a handshake message we have sent it, or our peer 4892 * has initiated a handshake with us. If its the former then can only 4893 * be ACK or NACK, if its the later can only be INFO. 4894 * 4895 * If its an ACK we move to the next stage of the handshake, namely 4896 * attribute exchange. If its a NACK we see if we can specify another 4897 * version, if we can't we stop. 4898 * 4899 * If it is an INFO we reset all params associated with communication 4900 * in that direction over this channel (remember connection is 4901 * essentially 2 independent simplex channels). 4902 */ 4903 void 4904 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt) 4905 { 4906 vio_ver_msg_t *ver_pkt; 4907 vsw_t *vswp = ldcp->ldc_vswp; 4908 4909 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4910 4911 /* 4912 * We know this is a ctrl/version packet so 4913 * cast it into the correct structure. 4914 */ 4915 ver_pkt = (vio_ver_msg_t *)pkt; 4916 4917 switch (ver_pkt->tag.vio_subtype) { 4918 case VIO_SUBTYPE_INFO: 4919 D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n"); 4920 4921 /* 4922 * Record the session id, which we will use from now 4923 * until we see another VER_INFO msg. Even then the 4924 * session id in most cases will be unchanged, execpt 4925 * if channel was reset. 4926 */ 4927 if ((ldcp->session_status & VSW_PEER_SESSION) && 4928 (ldcp->peer_session != ver_pkt->tag.vio_sid)) { 4929 DERR(vswp, "%s: updating session id for chan %lld " 4930 "from %llx to %llx", __func__, ldcp->ldc_id, 4931 ldcp->peer_session, ver_pkt->tag.vio_sid); 4932 } 4933 4934 ldcp->peer_session = ver_pkt->tag.vio_sid; 4935 ldcp->session_status |= VSW_PEER_SESSION; 4936 4937 /* Legal message at this time ? */ 4938 if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV)) 4939 return; 4940 4941 /* 4942 * First check the device class. Currently only expect 4943 * to be talking to a network device. In the future may 4944 * also talk to another switch. 4945 */ 4946 if (ver_pkt->dev_class != VDEV_NETWORK) { 4947 DERR(vswp, "%s: illegal device class %d", __func__, 4948 ver_pkt->dev_class); 4949 4950 ver_pkt->tag.vio_sid = ldcp->local_session; 4951 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4952 4953 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 4954 4955 (void) vsw_send_msg(ldcp, (void *)ver_pkt, 4956 sizeof (vio_ver_msg_t), B_TRUE); 4957 4958 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 4959 vsw_next_milestone(ldcp); 4960 return; 4961 } else { 4962 ldcp->dev_class = ver_pkt->dev_class; 4963 } 4964 4965 /* 4966 * Now check the version. 4967 */ 4968 if (vsw_supported_version(ver_pkt) == 0) { 4969 /* 4970 * Support this major version and possibly 4971 * adjusted minor version. 4972 */ 4973 4974 D2(vswp, "%s: accepted ver %d:%d", __func__, 4975 ver_pkt->ver_major, ver_pkt->ver_minor); 4976 4977 /* Store accepted values */ 4978 ldcp->lane_in.ver_major = ver_pkt->ver_major; 4979 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 4980 4981 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4982 4983 ldcp->lane_in.lstate |= VSW_VER_ACK_SENT; 4984 } else { 4985 /* 4986 * NACK back with the next lower major/minor 4987 * pairing we support (if don't suuport any more 4988 * versions then they will be set to zero. 4989 */ 4990 4991 D2(vswp, "%s: replying with ver %d:%d", __func__, 4992 ver_pkt->ver_major, ver_pkt->ver_minor); 4993 4994 /* Store updated values */ 4995 ldcp->lane_in.ver_major = ver_pkt->ver_major; 4996 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 4997 4998 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4999 5000 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 5001 } 5002 5003 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 5004 ver_pkt->tag.vio_sid = ldcp->local_session; 5005 (void) vsw_send_msg(ldcp, (void *)ver_pkt, 5006 sizeof (vio_ver_msg_t), B_TRUE); 5007 5008 vsw_next_milestone(ldcp); 5009 break; 5010 5011 case VIO_SUBTYPE_ACK: 5012 D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__); 5013 5014 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV)) 5015 return; 5016 5017 /* Store updated values */ 5018 ldcp->lane_in.ver_major = ver_pkt->ver_major; 5019 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 5020 5021 5022 ldcp->lane_out.lstate |= VSW_VER_ACK_RECV; 5023 vsw_next_milestone(ldcp); 5024 5025 break; 5026 5027 case VIO_SUBTYPE_NACK: 5028 D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__); 5029 5030 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV)) 5031 return; 5032 5033 /* 5034 * If our peer sent us a NACK with the ver fields set to 5035 * zero then there is nothing more we can do. Otherwise see 5036 * if we support either the version suggested, or a lesser 5037 * one. 5038 */ 5039 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 5040 DERR(vswp, "%s: peer unable to negotiate any " 5041 "further.", __func__); 5042 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 5043 vsw_next_milestone(ldcp); 5044 return; 5045 } 5046 5047 /* 5048 * Check to see if we support this major version or 5049 * a lower one. If we don't then maj/min will be set 5050 * to zero. 5051 */ 5052 (void) vsw_supported_version(ver_pkt); 5053 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 5054 /* Nothing more we can do */ 5055 DERR(vswp, "%s: version negotiation failed.\n", 5056 __func__); 5057 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 5058 vsw_next_milestone(ldcp); 5059 } else { 5060 /* found a supported major version */ 5061 ldcp->lane_out.ver_major = ver_pkt->ver_major; 5062 ldcp->lane_out.ver_minor = ver_pkt->ver_minor; 5063 5064 D2(vswp, "%s: resending with updated values (%x, %x)", 5065 __func__, ver_pkt->ver_major, 5066 ver_pkt->ver_minor); 5067 5068 ldcp->lane_out.lstate |= VSW_VER_INFO_SENT; 5069 ver_pkt->tag.vio_sid = ldcp->local_session; 5070 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 5071 5072 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 5073 5074 (void) vsw_send_msg(ldcp, (void *)ver_pkt, 5075 sizeof (vio_ver_msg_t), B_TRUE); 5076 5077 vsw_next_milestone(ldcp); 5078 5079 } 5080 break; 5081 5082 default: 5083 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 5084 ver_pkt->tag.vio_subtype); 5085 } 5086 5087 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 5088 } 5089 5090 /* 5091 * Process an attribute packet. We can end up here either because our peer 5092 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our 5093 * peer has sent us an attribute INFO message 5094 * 5095 * If its an ACK we then move to the next stage of the handshake which 5096 * is to send our descriptor ring info to our peer. If its a NACK then 5097 * there is nothing more we can (currently) do. 5098 * 5099 * If we get a valid/acceptable INFO packet (and we have already negotiated 5100 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we 5101 * NACK back and reset channel state to INACTIV. 5102 * 5103 * FUTURE: in time we will probably negotiate over attributes, but for 5104 * the moment unacceptable attributes are regarded as a fatal error. 5105 * 5106 */ 5107 void 5108 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) 5109 { 5110 vnet_attr_msg_t *attr_pkt; 5111 vsw_t *vswp = ldcp->ldc_vswp; 5112 vsw_port_t *port = ldcp->ldc_port; 5113 uint64_t macaddr = 0; 5114 int i; 5115 5116 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 5117 5118 /* 5119 * We know this is a ctrl/attr packet so 5120 * cast it into the correct structure. 5121 */ 5122 attr_pkt = (vnet_attr_msg_t *)pkt; 5123 5124 switch (attr_pkt->tag.vio_subtype) { 5125 case VIO_SUBTYPE_INFO: 5126 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5127 5128 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) 5129 return; 5130 5131 /* 5132 * If the attributes are unacceptable then we NACK back. 5133 */ 5134 if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) { 5135 5136 DERR(vswp, "%s (chan %d): invalid attributes", 5137 __func__, ldcp->ldc_id); 5138 5139 vsw_free_lane_resources(ldcp, INBOUND); 5140 5141 attr_pkt->tag.vio_sid = ldcp->local_session; 5142 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5143 5144 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 5145 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; 5146 (void) vsw_send_msg(ldcp, (void *)attr_pkt, 5147 sizeof (vnet_attr_msg_t), B_TRUE); 5148 5149 vsw_next_milestone(ldcp); 5150 return; 5151 } 5152 5153 /* 5154 * Otherwise store attributes for this lane and update 5155 * lane state. 5156 */ 5157 ldcp->lane_in.mtu = attr_pkt->mtu; 5158 ldcp->lane_in.addr = attr_pkt->addr; 5159 ldcp->lane_in.addr_type = attr_pkt->addr_type; 5160 ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode; 5161 ldcp->lane_in.ack_freq = attr_pkt->ack_freq; 5162 5163 macaddr = ldcp->lane_in.addr; 5164 for (i = ETHERADDRL - 1; i >= 0; i--) { 5165 port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF; 5166 macaddr >>= 8; 5167 } 5168 5169 /* create the fdb entry for this port/mac address */ 5170 (void) vsw_add_fdb(vswp, port); 5171 5172 /* setup device specifc xmit routines */ 5173 mutex_enter(&port->tx_lock); 5174 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) { 5175 D2(vswp, "%s: mode = VIO_DRING_MODE", __func__); 5176 port->transmit = vsw_dringsend; 5177 } else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) { 5178 D2(vswp, "%s: mode = VIO_DESC_MODE", __func__); 5179 vsw_create_privring(ldcp); 5180 port->transmit = vsw_descrsend; 5181 } 5182 mutex_exit(&port->tx_lock); 5183 5184 attr_pkt->tag.vio_sid = ldcp->local_session; 5185 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5186 5187 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 5188 5189 ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT; 5190 5191 (void) vsw_send_msg(ldcp, (void *)attr_pkt, 5192 sizeof (vnet_attr_msg_t), B_TRUE); 5193 5194 vsw_next_milestone(ldcp); 5195 break; 5196 5197 case VIO_SUBTYPE_ACK: 5198 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5199 5200 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) 5201 return; 5202 5203 ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV; 5204 vsw_next_milestone(ldcp); 5205 break; 5206 5207 case VIO_SUBTYPE_NACK: 5208 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5209 5210 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV)) 5211 return; 5212 5213 ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV; 5214 vsw_next_milestone(ldcp); 5215 break; 5216 5217 default: 5218 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 5219 attr_pkt->tag.vio_subtype); 5220 } 5221 5222 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5223 } 5224 5225 /* 5226 * Process a dring info packet. We can end up here either because our peer 5227 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our 5228 * peer has sent us a dring INFO message. 5229 * 5230 * If we get a valid/acceptable INFO packet (and we have already negotiated 5231 * a version) we ACK back and update the lane state, otherwise we NACK back. 5232 * 5233 * FUTURE: nothing to stop client from sending us info on multiple dring's 5234 * but for the moment we will just use the first one we are given. 5235 * 5236 */ 5237 void 5238 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt) 5239 { 5240 vio_dring_reg_msg_t *dring_pkt; 5241 vsw_t *vswp = ldcp->ldc_vswp; 5242 ldc_mem_info_t minfo; 5243 dring_info_t *dp, *dbp; 5244 int dring_found = 0; 5245 5246 /* 5247 * We know this is a ctrl/dring packet so 5248 * cast it into the correct structure. 5249 */ 5250 dring_pkt = (vio_dring_reg_msg_t *)pkt; 5251 5252 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 5253 5254 switch (dring_pkt->tag.vio_subtype) { 5255 case VIO_SUBTYPE_INFO: 5256 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5257 5258 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 5259 return; 5260 5261 /* 5262 * If the dring params are unacceptable then we NACK back. 5263 */ 5264 if (vsw_check_dring_info(dring_pkt)) { 5265 5266 DERR(vswp, "%s (%lld): invalid dring info", 5267 __func__, ldcp->ldc_id); 5268 5269 vsw_free_lane_resources(ldcp, INBOUND); 5270 5271 dring_pkt->tag.vio_sid = ldcp->local_session; 5272 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5273 5274 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5275 5276 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5277 5278 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5279 sizeof (vio_dring_reg_msg_t), B_TRUE); 5280 5281 vsw_next_milestone(ldcp); 5282 return; 5283 } 5284 5285 /* 5286 * Otherwise, attempt to map in the dring using the 5287 * cookie. If that succeeds we send back a unique dring 5288 * identifier that the sending side will use in future 5289 * to refer to this descriptor ring. 5290 */ 5291 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 5292 5293 dp->num_descriptors = dring_pkt->num_descriptors; 5294 dp->descriptor_size = dring_pkt->descriptor_size; 5295 dp->options = dring_pkt->options; 5296 dp->ncookies = dring_pkt->ncookies; 5297 5298 /* 5299 * Note: should only get one cookie. Enforced in 5300 * the ldc layer. 5301 */ 5302 bcopy(&dring_pkt->cookie[0], &dp->cookie[0], 5303 sizeof (ldc_mem_cookie_t)); 5304 5305 D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__, 5306 dp->num_descriptors, dp->descriptor_size); 5307 D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__, 5308 dp->options, dp->ncookies); 5309 5310 if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0], 5311 dp->ncookies, dp->num_descriptors, 5312 dp->descriptor_size, LDC_SHADOW_MAP, 5313 &(dp->handle))) != 0) { 5314 5315 DERR(vswp, "%s: dring_map failed\n", __func__); 5316 5317 kmem_free(dp, sizeof (dring_info_t)); 5318 vsw_free_lane_resources(ldcp, INBOUND); 5319 5320 dring_pkt->tag.vio_sid = ldcp->local_session; 5321 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5322 5323 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5324 5325 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5326 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5327 sizeof (vio_dring_reg_msg_t), B_TRUE); 5328 5329 vsw_next_milestone(ldcp); 5330 return; 5331 } 5332 5333 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 5334 5335 DERR(vswp, "%s: dring_addr failed\n", __func__); 5336 5337 kmem_free(dp, sizeof (dring_info_t)); 5338 vsw_free_lane_resources(ldcp, INBOUND); 5339 5340 dring_pkt->tag.vio_sid = ldcp->local_session; 5341 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5342 5343 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5344 5345 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5346 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5347 sizeof (vio_dring_reg_msg_t), B_TRUE); 5348 5349 vsw_next_milestone(ldcp); 5350 return; 5351 } else { 5352 /* store the address of the pub part of ring */ 5353 dp->pub_addr = minfo.vaddr; 5354 } 5355 5356 /* no private section as we are importing */ 5357 dp->priv_addr = NULL; 5358 5359 /* 5360 * Using simple mono increasing int for ident at 5361 * the moment. 5362 */ 5363 dp->ident = ldcp->next_ident; 5364 ldcp->next_ident++; 5365 5366 dp->end_idx = 0; 5367 dp->next = NULL; 5368 5369 /* 5370 * Link it onto the end of the list of drings 5371 * for this lane. 5372 */ 5373 if (ldcp->lane_in.dringp == NULL) { 5374 D2(vswp, "%s: adding first INBOUND dring", __func__); 5375 ldcp->lane_in.dringp = dp; 5376 } else { 5377 dbp = ldcp->lane_in.dringp; 5378 5379 while (dbp->next != NULL) 5380 dbp = dbp->next; 5381 5382 dbp->next = dp; 5383 } 5384 5385 /* acknowledge it */ 5386 dring_pkt->tag.vio_sid = ldcp->local_session; 5387 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5388 dring_pkt->dring_ident = dp->ident; 5389 5390 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5391 sizeof (vio_dring_reg_msg_t), B_TRUE); 5392 5393 ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT; 5394 vsw_next_milestone(ldcp); 5395 break; 5396 5397 case VIO_SUBTYPE_ACK: 5398 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5399 5400 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) 5401 return; 5402 5403 /* 5404 * Peer is acknowledging our dring info and will have 5405 * sent us a dring identifier which we will use to 5406 * refer to this ring w.r.t. our peer. 5407 */ 5408 dp = ldcp->lane_out.dringp; 5409 if (dp != NULL) { 5410 /* 5411 * Find the ring this ident should be associated 5412 * with. 5413 */ 5414 if (vsw_dring_match(dp, dring_pkt)) { 5415 dring_found = 1; 5416 5417 } else while (dp != NULL) { 5418 if (vsw_dring_match(dp, dring_pkt)) { 5419 dring_found = 1; 5420 break; 5421 } 5422 dp = dp->next; 5423 } 5424 5425 if (dring_found == 0) { 5426 DERR(NULL, "%s: unrecognised ring cookie", 5427 __func__); 5428 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5429 return; 5430 } 5431 5432 } else { 5433 DERR(vswp, "%s: DRING ACK received but no drings " 5434 "allocated", __func__); 5435 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5436 return; 5437 } 5438 5439 /* store ident */ 5440 dp->ident = dring_pkt->dring_ident; 5441 ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV; 5442 vsw_next_milestone(ldcp); 5443 break; 5444 5445 case VIO_SUBTYPE_NACK: 5446 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5447 5448 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV)) 5449 return; 5450 5451 ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV; 5452 vsw_next_milestone(ldcp); 5453 break; 5454 5455 default: 5456 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5457 dring_pkt->tag.vio_subtype); 5458 } 5459 5460 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5461 } 5462 5463 /* 5464 * Process a request from peer to unregister a dring. 5465 * 5466 * For the moment we just restart the handshake if our 5467 * peer endpoint attempts to unregister a dring. 5468 */ 5469 void 5470 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt) 5471 { 5472 vsw_t *vswp = ldcp->ldc_vswp; 5473 vio_dring_unreg_msg_t *dring_pkt; 5474 5475 /* 5476 * We know this is a ctrl/dring packet so 5477 * cast it into the correct structure. 5478 */ 5479 dring_pkt = (vio_dring_unreg_msg_t *)pkt; 5480 5481 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5482 5483 switch (dring_pkt->tag.vio_subtype) { 5484 case VIO_SUBTYPE_INFO: 5485 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5486 5487 DWARN(vswp, "%s: restarting handshake..", __func__); 5488 break; 5489 5490 case VIO_SUBTYPE_ACK: 5491 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5492 5493 DWARN(vswp, "%s: restarting handshake..", __func__); 5494 break; 5495 5496 case VIO_SUBTYPE_NACK: 5497 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5498 5499 DWARN(vswp, "%s: restarting handshake..", __func__); 5500 break; 5501 5502 default: 5503 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5504 dring_pkt->tag.vio_subtype); 5505 } 5506 5507 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5508 5509 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5510 } 5511 5512 #define SND_MCST_NACK(ldcp, pkt) \ 5513 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5514 pkt->tag.vio_sid = ldcp->local_session; \ 5515 (void) vsw_send_msg(ldcp, (void *)pkt, \ 5516 sizeof (vnet_mcast_msg_t), B_TRUE); 5517 5518 /* 5519 * Process a multicast request from a vnet. 5520 * 5521 * Vnet's specify a multicast address that they are interested in. This 5522 * address is used as a key into the hash table which forms the multicast 5523 * forwarding database (mFDB). 5524 * 5525 * The table keys are the multicast addresses, while the table entries 5526 * are pointers to lists of ports which wish to receive packets for the 5527 * specified multicast address. 5528 * 5529 * When a multicast packet is being switched we use the address as a key 5530 * into the hash table, and then walk the appropriate port list forwarding 5531 * the pkt to each port in turn. 5532 * 5533 * If a vnet is no longer interested in a particular multicast grouping 5534 * we simply find the correct location in the hash table and then delete 5535 * the relevant port from the port list. 5536 * 5537 * To deal with the case whereby a port is being deleted without first 5538 * removing itself from the lists in the hash table, we maintain a list 5539 * of multicast addresses the port has registered an interest in, within 5540 * the port structure itself. We then simply walk that list of addresses 5541 * using them as keys into the hash table and remove the port from the 5542 * appropriate lists. 5543 */ 5544 static void 5545 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) 5546 { 5547 vnet_mcast_msg_t *mcst_pkt; 5548 vsw_port_t *port = ldcp->ldc_port; 5549 vsw_t *vswp = ldcp->ldc_vswp; 5550 int i; 5551 5552 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5553 5554 /* 5555 * We know this is a ctrl/mcast packet so 5556 * cast it into the correct structure. 5557 */ 5558 mcst_pkt = (vnet_mcast_msg_t *)pkt; 5559 5560 switch (mcst_pkt->tag.vio_subtype) { 5561 case VIO_SUBTYPE_INFO: 5562 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5563 5564 /* 5565 * Check if in correct state to receive a multicast 5566 * message (i.e. handshake complete). If not reset 5567 * the handshake. 5568 */ 5569 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) 5570 return; 5571 5572 /* 5573 * Before attempting to add or remove address check 5574 * that they are valid multicast addresses. 5575 * If not, then NACK back. 5576 */ 5577 for (i = 0; i < mcst_pkt->count; i++) { 5578 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { 5579 DERR(vswp, "%s: invalid multicast address", 5580 __func__); 5581 SND_MCST_NACK(ldcp, mcst_pkt); 5582 return; 5583 } 5584 } 5585 5586 /* 5587 * Now add/remove the addresses. If this fails we 5588 * NACK back. 5589 */ 5590 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { 5591 SND_MCST_NACK(ldcp, mcst_pkt); 5592 return; 5593 } 5594 5595 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5596 mcst_pkt->tag.vio_sid = ldcp->local_session; 5597 5598 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); 5599 5600 (void) vsw_send_msg(ldcp, (void *)mcst_pkt, 5601 sizeof (vnet_mcast_msg_t), B_TRUE); 5602 break; 5603 5604 case VIO_SUBTYPE_ACK: 5605 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5606 5607 /* 5608 * We shouldn't ever get a multicast ACK message as 5609 * at the moment we never request multicast addresses 5610 * to be set on some other device. This may change in 5611 * the future if we have cascading switches. 5612 */ 5613 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) 5614 return; 5615 5616 /* Do nothing */ 5617 break; 5618 5619 case VIO_SUBTYPE_NACK: 5620 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5621 5622 /* 5623 * We shouldn't get a multicast NACK packet for the 5624 * same reasons as we shouldn't get a ACK packet. 5625 */ 5626 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) 5627 return; 5628 5629 /* Do nothing */ 5630 break; 5631 5632 default: 5633 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 5634 mcst_pkt->tag.vio_subtype); 5635 } 5636 5637 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5638 } 5639 5640 static void 5641 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) 5642 { 5643 vio_rdx_msg_t *rdx_pkt; 5644 vsw_t *vswp = ldcp->ldc_vswp; 5645 5646 /* 5647 * We know this is a ctrl/rdx packet so 5648 * cast it into the correct structure. 5649 */ 5650 rdx_pkt = (vio_rdx_msg_t *)pkt; 5651 5652 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 5653 5654 switch (rdx_pkt->tag.vio_subtype) { 5655 case VIO_SUBTYPE_INFO: 5656 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5657 5658 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV)) 5659 return; 5660 5661 rdx_pkt->tag.vio_sid = ldcp->local_session; 5662 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5663 5664 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); 5665 5666 ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT; 5667 5668 (void) vsw_send_msg(ldcp, (void *)rdx_pkt, 5669 sizeof (vio_rdx_msg_t), B_TRUE); 5670 5671 vsw_next_milestone(ldcp); 5672 break; 5673 5674 case VIO_SUBTYPE_ACK: 5675 /* 5676 * Should be handled in-band by callback handler. 5677 */ 5678 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); 5679 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5680 break; 5681 5682 case VIO_SUBTYPE_NACK: 5683 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5684 5685 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV)) 5686 return; 5687 5688 ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV; 5689 vsw_next_milestone(ldcp); 5690 break; 5691 5692 default: 5693 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5694 rdx_pkt->tag.vio_subtype); 5695 } 5696 5697 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5698 } 5699 5700 static void 5701 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag) 5702 { 5703 uint16_t env = tag.vio_subtype_env; 5704 vsw_t *vswp = ldcp->ldc_vswp; 5705 5706 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5707 5708 /* session id check */ 5709 if (ldcp->session_status & VSW_PEER_SESSION) { 5710 if (ldcp->peer_session != tag.vio_sid) { 5711 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 5712 __func__, ldcp->ldc_id, tag.vio_sid); 5713 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5714 return; 5715 } 5716 } 5717 5718 /* 5719 * It is an error for us to be getting data packets 5720 * before the handshake has completed. 5721 */ 5722 if (ldcp->hphase != VSW_MILESTONE4) { 5723 DERR(vswp, "%s: got data packet before handshake complete " 5724 "hphase %d (%x: %x)", __func__, ldcp->hphase, 5725 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 5726 DUMP_FLAGS(ldcp->lane_in.lstate); 5727 DUMP_FLAGS(ldcp->lane_out.lstate); 5728 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5729 return; 5730 } 5731 5732 /* 5733 * Switch on vio_subtype envelope, then let lower routines 5734 * decide if its an INFO, ACK or NACK packet. 5735 */ 5736 if (env == VIO_DRING_DATA) { 5737 vsw_process_data_dring_pkt(ldcp, dpkt); 5738 } else if (env == VIO_PKT_DATA) { 5739 vsw_process_data_raw_pkt(ldcp, dpkt); 5740 } else if (env == VIO_DESC_DATA) { 5741 vsw_process_data_ibnd_pkt(ldcp, dpkt); 5742 } else { 5743 DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", 5744 __func__, env); 5745 } 5746 5747 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5748 } 5749 5750 #define SND_DRING_NACK(ldcp, pkt) \ 5751 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5752 pkt->tag.vio_sid = ldcp->local_session; \ 5753 (void) vsw_send_msg(ldcp, (void *)pkt, \ 5754 sizeof (vio_dring_msg_t), B_TRUE); 5755 5756 static void 5757 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) 5758 { 5759 vio_dring_msg_t *dring_pkt; 5760 vnet_public_desc_t *pub_addr = NULL; 5761 vsw_private_desc_t *priv_addr = NULL; 5762 dring_info_t *dp = NULL; 5763 vsw_t *vswp = ldcp->ldc_vswp; 5764 mblk_t *mp = NULL; 5765 mblk_t *bp = NULL; 5766 mblk_t *bpt = NULL; 5767 size_t nbytes = 0; 5768 size_t off = 0; 5769 uint64_t ncookies = 0; 5770 uint64_t chain = 0; 5771 uint64_t j, len; 5772 uint32_t pos, start, datalen; 5773 uint32_t range_start, range_end; 5774 int32_t end, num, cnt = 0; 5775 int i, rv, msg_rv = 0; 5776 boolean_t ack_needed = B_FALSE; 5777 boolean_t prev_desc_ack = B_FALSE; 5778 int read_attempts = 0; 5779 5780 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5781 5782 /* 5783 * We know this is a data/dring packet so 5784 * cast it into the correct structure. 5785 */ 5786 dring_pkt = (vio_dring_msg_t *)dpkt; 5787 5788 /* 5789 * Switch on the vio_subtype. If its INFO then we need to 5790 * process the data. If its an ACK we need to make sure 5791 * it makes sense (i.e did we send an earlier data/info), 5792 * and if its a NACK then we maybe attempt a retry. 5793 */ 5794 switch (dring_pkt->tag.vio_subtype) { 5795 case VIO_SUBTYPE_INFO: 5796 D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id); 5797 5798 READ_ENTER(&ldcp->lane_in.dlistrw); 5799 if ((dp = vsw_ident2dring(&ldcp->lane_in, 5800 dring_pkt->dring_ident)) == NULL) { 5801 RW_EXIT(&ldcp->lane_in.dlistrw); 5802 5803 DERR(vswp, "%s(%lld): unable to find dring from " 5804 "ident 0x%llx", __func__, ldcp->ldc_id, 5805 dring_pkt->dring_ident); 5806 5807 SND_DRING_NACK(ldcp, dring_pkt); 5808 return; 5809 } 5810 5811 start = pos = dring_pkt->start_idx; 5812 end = dring_pkt->end_idx; 5813 len = dp->num_descriptors; 5814 5815 range_start = range_end = pos; 5816 5817 D2(vswp, "%s(%lld): start index %ld : end %ld\n", 5818 __func__, ldcp->ldc_id, start, end); 5819 5820 if (end == -1) { 5821 num = -1; 5822 } else if (end >= 0) { 5823 num = end >= pos ? 5824 end - pos + 1: (len - pos + 1) + end; 5825 5826 /* basic sanity check */ 5827 if (end > len) { 5828 RW_EXIT(&ldcp->lane_in.dlistrw); 5829 DERR(vswp, "%s(%lld): endpoint %lld outside " 5830 "ring length %lld", __func__, 5831 ldcp->ldc_id, end, len); 5832 5833 SND_DRING_NACK(ldcp, dring_pkt); 5834 return; 5835 } 5836 } else { 5837 RW_EXIT(&ldcp->lane_in.dlistrw); 5838 DERR(vswp, "%s(%lld): invalid endpoint %lld", 5839 __func__, ldcp->ldc_id, end); 5840 SND_DRING_NACK(ldcp, dring_pkt); 5841 return; 5842 } 5843 5844 while (cnt != num) { 5845 vsw_recheck_desc: 5846 if ((rv = ldc_mem_dring_acquire(dp->handle, 5847 pos, pos)) != 0) { 5848 RW_EXIT(&ldcp->lane_in.dlistrw); 5849 DERR(vswp, "%s(%lld): unable to acquire " 5850 "descriptor at pos %d: err %d", 5851 __func__, pos, ldcp->ldc_id, rv); 5852 SND_DRING_NACK(ldcp, dring_pkt); 5853 return; 5854 } 5855 5856 pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos; 5857 5858 /* 5859 * When given a bounded range of descriptors 5860 * to process, its an error to hit a descriptor 5861 * which is not ready. In the non-bounded case 5862 * (end_idx == -1) this simply indicates we have 5863 * reached the end of the current active range. 5864 */ 5865 if (pub_addr->hdr.dstate != VIO_DESC_READY) { 5866 /* unbound - no error */ 5867 if (end == -1) { 5868 if (read_attempts == vsw_read_attempts) 5869 break; 5870 5871 delay(drv_usectohz(vsw_desc_delay)); 5872 read_attempts++; 5873 goto vsw_recheck_desc; 5874 } 5875 5876 /* bounded - error - so NACK back */ 5877 RW_EXIT(&ldcp->lane_in.dlistrw); 5878 DERR(vswp, "%s(%lld): descriptor not READY " 5879 "(%d)", __func__, ldcp->ldc_id, 5880 pub_addr->hdr.dstate); 5881 SND_DRING_NACK(ldcp, dring_pkt); 5882 return; 5883 } 5884 5885 DTRACE_PROBE1(read_attempts, int, read_attempts); 5886 5887 range_end = pos; 5888 5889 /* 5890 * If we ACK'd the previous descriptor then now 5891 * record the new range start position for later 5892 * ACK's. 5893 */ 5894 if (prev_desc_ack) { 5895 range_start = pos; 5896 5897 D2(vswp, "%s(%lld): updating range start " 5898 "to be %d", __func__, ldcp->ldc_id, 5899 range_start); 5900 5901 prev_desc_ack = B_FALSE; 5902 } 5903 5904 /* 5905 * Data is padded to align on 8 byte boundary, 5906 * datalen is actual data length, i.e. minus that 5907 * padding. 5908 */ 5909 datalen = pub_addr->nbytes; 5910 5911 /* 5912 * Does peer wish us to ACK when we have finished 5913 * with this descriptor ? 5914 */ 5915 if (pub_addr->hdr.ack) 5916 ack_needed = B_TRUE; 5917 5918 D2(vswp, "%s(%lld): processing desc %lld at pos" 5919 " 0x%llx : dstate 0x%lx : datalen 0x%lx", 5920 __func__, ldcp->ldc_id, pos, pub_addr, 5921 pub_addr->hdr.dstate, datalen); 5922 5923 /* 5924 * Mark that we are starting to process descriptor. 5925 */ 5926 pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; 5927 5928 mp = vio_allocb(ldcp->rxh); 5929 if (mp == NULL) { 5930 /* 5931 * No free receive buffers available, so 5932 * fallback onto allocb(9F). Make sure that 5933 * we get a data buffer which is a multiple 5934 * of 8 as this is required by ldc_mem_copy. 5935 */ 5936 DTRACE_PROBE(allocb); 5937 mp = allocb(datalen + VNET_IPALIGN + 8, 5938 BPRI_MED); 5939 } 5940 5941 /* 5942 * Ensure that we ask ldc for an aligned 5943 * number of bytes. 5944 */ 5945 nbytes = datalen + VNET_IPALIGN; 5946 if (nbytes & 0x7) { 5947 off = 8 - (nbytes & 0x7); 5948 nbytes += off; 5949 } 5950 5951 ncookies = pub_addr->ncookies; 5952 rv = ldc_mem_copy(ldcp->ldc_handle, 5953 (caddr_t)mp->b_rptr, 0, &nbytes, 5954 pub_addr->memcookie, ncookies, 5955 LDC_COPY_IN); 5956 5957 if (rv != 0) { 5958 DERR(vswp, "%s(%d): unable to copy in " 5959 "data from %d cookies in desc %d" 5960 " (rv %d)", __func__, ldcp->ldc_id, 5961 ncookies, pos, rv); 5962 freemsg(mp); 5963 5964 pub_addr->hdr.dstate = VIO_DESC_DONE; 5965 (void) ldc_mem_dring_release(dp->handle, 5966 pos, pos); 5967 break; 5968 } else { 5969 D2(vswp, "%s(%d): copied in %ld bytes" 5970 " using %d cookies", __func__, 5971 ldcp->ldc_id, nbytes, ncookies); 5972 } 5973 5974 /* adjust the read pointer to skip over the padding */ 5975 mp->b_rptr += VNET_IPALIGN; 5976 5977 /* point to the actual end of data */ 5978 mp->b_wptr = mp->b_rptr + datalen; 5979 5980 /* build a chain of received packets */ 5981 if (bp == NULL) { 5982 /* first pkt */ 5983 bp = mp; 5984 bp->b_next = bp->b_prev = NULL; 5985 bpt = bp; 5986 chain = 1; 5987 } else { 5988 mp->b_next = NULL; 5989 mp->b_prev = bpt; 5990 bpt->b_next = mp; 5991 bpt = mp; 5992 chain++; 5993 } 5994 5995 /* mark we are finished with this descriptor */ 5996 pub_addr->hdr.dstate = VIO_DESC_DONE; 5997 5998 (void) ldc_mem_dring_release(dp->handle, pos, pos); 5999 6000 /* 6001 * Send an ACK back to peer if requested. 6002 */ 6003 if (ack_needed) { 6004 ack_needed = B_FALSE; 6005 6006 dring_pkt->start_idx = range_start; 6007 dring_pkt->end_idx = range_end; 6008 6009 DERR(vswp, "%s(%lld): processed %d %d, ACK" 6010 " requested", __func__, ldcp->ldc_id, 6011 dring_pkt->start_idx, 6012 dring_pkt->end_idx); 6013 6014 dring_pkt->dring_process_state = VIO_DP_ACTIVE; 6015 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 6016 dring_pkt->tag.vio_sid = ldcp->local_session; 6017 msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, 6018 sizeof (vio_dring_msg_t), 6019 B_FALSE); 6020 6021 /* 6022 * Check if ACK was successfully sent. If not 6023 * we break and deal with that below. 6024 */ 6025 if (msg_rv != 0) 6026 break; 6027 6028 prev_desc_ack = B_TRUE; 6029 range_start = pos; 6030 } 6031 6032 /* next descriptor */ 6033 pos = (pos + 1) % len; 6034 cnt++; 6035 6036 /* 6037 * Break out of loop here and stop processing to 6038 * allow some other network device (or disk) to 6039 * get access to the cpu. 6040 */ 6041 if (chain > vsw_chain_len) { 6042 D3(vswp, "%s(%lld): switching chain of %d " 6043 "msgs", __func__, ldcp->ldc_id, chain); 6044 break; 6045 } 6046 } 6047 RW_EXIT(&ldcp->lane_in.dlistrw); 6048 6049 /* 6050 * If when we attempted to send the ACK we found that the 6051 * channel had been reset then now handle this. We deal with 6052 * it here as we cannot reset the channel while holding the 6053 * dlistrw lock, and we don't want to acquire/release it 6054 * continuously in the above loop, as a channel reset should 6055 * be a rare event. 6056 */ 6057 if (msg_rv == ECONNRESET) { 6058 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 6059 break; 6060 } 6061 6062 /* send the chain of packets to be switched */ 6063 if (bp != NULL) { 6064 D3(vswp, "%s(%lld): switching chain of %d msgs", 6065 __func__, ldcp->ldc_id, chain); 6066 vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT, 6067 ldcp->ldc_port, NULL); 6068 } 6069 6070 DTRACE_PROBE1(msg_cnt, int, cnt); 6071 6072 /* 6073 * We are now finished so ACK back with the state 6074 * set to STOPPING so our peer knows we are finished 6075 */ 6076 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 6077 dring_pkt->tag.vio_sid = ldcp->local_session; 6078 6079 dring_pkt->dring_process_state = VIO_DP_STOPPED; 6080 6081 DTRACE_PROBE(stop_process_sent); 6082 6083 /* 6084 * We have not processed any more descriptors beyond 6085 * the last one we ACK'd. 6086 */ 6087 if (prev_desc_ack) 6088 range_start = range_end; 6089 6090 dring_pkt->start_idx = range_start; 6091 dring_pkt->end_idx = range_end; 6092 6093 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 6094 __func__, ldcp->ldc_id, dring_pkt->start_idx, 6095 dring_pkt->end_idx); 6096 6097 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 6098 sizeof (vio_dring_msg_t), B_TRUE); 6099 break; 6100 6101 case VIO_SUBTYPE_ACK: 6102 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 6103 /* 6104 * Verify that the relevant descriptors are all 6105 * marked as DONE 6106 */ 6107 READ_ENTER(&ldcp->lane_out.dlistrw); 6108 if ((dp = vsw_ident2dring(&ldcp->lane_out, 6109 dring_pkt->dring_ident)) == NULL) { 6110 RW_EXIT(&ldcp->lane_out.dlistrw); 6111 DERR(vswp, "%s: unknown ident in ACK", __func__); 6112 return; 6113 } 6114 6115 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 6116 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 6117 6118 start = end = 0; 6119 start = dring_pkt->start_idx; 6120 end = dring_pkt->end_idx; 6121 len = dp->num_descriptors; 6122 6123 j = num = 0; 6124 /* calculate # descriptors taking into a/c wrap around */ 6125 num = end >= start ? end - start + 1: (len - start + 1) + end; 6126 6127 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 6128 __func__, ldcp->ldc_id, start, end, num); 6129 6130 mutex_enter(&dp->dlock); 6131 dp->last_ack_recv = end; 6132 mutex_exit(&dp->dlock); 6133 6134 for (i = start; j < num; i = (i + 1) % len, j++) { 6135 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 6136 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6137 6138 /* 6139 * If the last descriptor in a range has the ACK 6140 * bit set then we will get two messages from our 6141 * peer relating to it. The normal ACK msg and then 6142 * a subsequent STOP msg. The first message will have 6143 * resulted in the descriptor being reclaimed and 6144 * its state set to FREE so when we encounter a non 6145 * DONE descriptor we need to check to see if its 6146 * because we have just reclaimed it. 6147 */ 6148 mutex_enter(&priv_addr->dstate_lock); 6149 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 6150 /* clear all the fields */ 6151 bzero(priv_addr->datap, priv_addr->datalen); 6152 priv_addr->datalen = 0; 6153 6154 pub_addr->hdr.dstate = VIO_DESC_FREE; 6155 pub_addr->hdr.ack = 0; 6156 6157 priv_addr->dstate = VIO_DESC_FREE; 6158 mutex_exit(&priv_addr->dstate_lock); 6159 6160 D3(vswp, "clearing descp %d : pub state " 6161 "0x%llx : priv state 0x%llx", i, 6162 pub_addr->hdr.dstate, 6163 priv_addr->dstate); 6164 6165 } else { 6166 mutex_exit(&priv_addr->dstate_lock); 6167 6168 if (dring_pkt->dring_process_state != 6169 VIO_DP_STOPPED) { 6170 DERR(vswp, "%s: descriptor %lld at pos " 6171 " 0x%llx not DONE (0x%lx)\n", 6172 __func__, i, pub_addr, 6173 pub_addr->hdr.dstate); 6174 RW_EXIT(&ldcp->lane_out.dlistrw); 6175 return; 6176 } 6177 } 6178 } 6179 6180 /* 6181 * If our peer is stopping processing descriptors then 6182 * we check to make sure it has processed all the descriptors 6183 * we have updated. If not then we send it a new message 6184 * to prompt it to restart. 6185 */ 6186 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 6187 DTRACE_PROBE(stop_process_recv); 6188 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 6189 __func__, ldcp->ldc_id, dring_pkt->start_idx, 6190 dring_pkt->end_idx); 6191 6192 /* 6193 * Check next descriptor in public section of ring. 6194 * If its marked as READY then we need to prompt our 6195 * peer to start processing the ring again. 6196 */ 6197 i = (end + 1) % len; 6198 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 6199 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6200 6201 /* 6202 * Hold the restart lock across all of this to 6203 * make sure that its not possible for us to 6204 * decide that a msg needs to be sent in the future 6205 * but the sending code having already checked is 6206 * about to exit. 6207 */ 6208 mutex_enter(&dp->restart_lock); 6209 mutex_enter(&priv_addr->dstate_lock); 6210 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 6211 6212 mutex_exit(&priv_addr->dstate_lock); 6213 6214 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 6215 dring_pkt->tag.vio_sid = ldcp->local_session; 6216 6217 mutex_enter(&ldcp->lane_out.seq_lock); 6218 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 6219 mutex_exit(&ldcp->lane_out.seq_lock); 6220 6221 dring_pkt->start_idx = (end + 1) % len; 6222 dring_pkt->end_idx = -1; 6223 6224 D2(vswp, "%s(%lld) : sending restart msg:" 6225 " %d : %d", __func__, ldcp->ldc_id, 6226 dring_pkt->start_idx, 6227 dring_pkt->end_idx); 6228 6229 msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, 6230 sizeof (vio_dring_msg_t), B_FALSE); 6231 6232 } else { 6233 mutex_exit(&priv_addr->dstate_lock); 6234 dp->restart_reqd = B_TRUE; 6235 } 6236 mutex_exit(&dp->restart_lock); 6237 } 6238 RW_EXIT(&ldcp->lane_out.dlistrw); 6239 6240 /* only do channel reset after dropping dlistrw lock */ 6241 if (msg_rv == ECONNRESET) 6242 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 6243 6244 break; 6245 6246 case VIO_SUBTYPE_NACK: 6247 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 6248 __func__, ldcp->ldc_id); 6249 /* 6250 * Something is badly wrong if we are getting NACK's 6251 * for our data pkts. So reset the channel. 6252 */ 6253 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 6254 6255 break; 6256 6257 default: 6258 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 6259 ldcp->ldc_id, dring_pkt->tag.vio_subtype); 6260 } 6261 6262 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 6263 } 6264 6265 /* 6266 * VIO_PKT_DATA (a.k.a raw data mode ) 6267 * 6268 * Note - currently not supported. Do nothing. 6269 */ 6270 static void 6271 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt) 6272 { 6273 _NOTE(ARGUNUSED(dpkt)) 6274 6275 D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 6276 6277 DERR(NULL, "%s (%lld): currently not supported", 6278 __func__, ldcp->ldc_id); 6279 6280 D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 6281 } 6282 6283 /* 6284 * Process an in-band descriptor message (most likely from 6285 * OBP). 6286 */ 6287 static void 6288 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) 6289 { 6290 vnet_ibnd_desc_t *ibnd_desc; 6291 dring_info_t *dp = NULL; 6292 vsw_private_desc_t *priv_addr = NULL; 6293 vsw_t *vswp = ldcp->ldc_vswp; 6294 mblk_t *mp = NULL; 6295 size_t nbytes = 0; 6296 size_t off = 0; 6297 uint64_t idx = 0; 6298 uint32_t num = 1, len, datalen = 0; 6299 uint64_t ncookies = 0; 6300 int i, rv; 6301 int j = 0; 6302 6303 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6304 6305 ibnd_desc = (vnet_ibnd_desc_t *)pkt; 6306 6307 switch (ibnd_desc->hdr.tag.vio_subtype) { 6308 case VIO_SUBTYPE_INFO: 6309 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 6310 6311 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 6312 return; 6313 6314 /* 6315 * Data is padded to align on a 8 byte boundary, 6316 * nbytes is actual data length, i.e. minus that 6317 * padding. 6318 */ 6319 datalen = ibnd_desc->nbytes; 6320 6321 D2(vswp, "%s(%lld): processing inband desc : " 6322 ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); 6323 6324 ncookies = ibnd_desc->ncookies; 6325 6326 /* 6327 * allocb(9F) returns an aligned data block. We 6328 * need to ensure that we ask ldc for an aligned 6329 * number of bytes also. 6330 */ 6331 nbytes = datalen; 6332 if (nbytes & 0x7) { 6333 off = 8 - (nbytes & 0x7); 6334 nbytes += off; 6335 } 6336 6337 mp = allocb(datalen, BPRI_MED); 6338 if (mp == NULL) { 6339 DERR(vswp, "%s(%lld): allocb failed", 6340 __func__, ldcp->ldc_id); 6341 return; 6342 } 6343 6344 rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr, 6345 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies, 6346 LDC_COPY_IN); 6347 6348 if (rv != 0) { 6349 DERR(vswp, "%s(%d): unable to copy in data from " 6350 "%d cookie(s)", __func__, 6351 ldcp->ldc_id, ncookies); 6352 freemsg(mp); 6353 return; 6354 } else { 6355 D2(vswp, "%s(%d): copied in %ld bytes using %d " 6356 "cookies", __func__, ldcp->ldc_id, nbytes, 6357 ncookies); 6358 } 6359 6360 /* point to the actual end of data */ 6361 mp->b_wptr = mp->b_rptr + datalen; 6362 6363 /* 6364 * We ACK back every in-band descriptor message we process 6365 */ 6366 ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; 6367 ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; 6368 (void) vsw_send_msg(ldcp, (void *)ibnd_desc, 6369 sizeof (vnet_ibnd_desc_t), B_TRUE); 6370 6371 /* send the packet to be switched */ 6372 vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, 6373 ldcp->ldc_port, NULL); 6374 6375 break; 6376 6377 case VIO_SUBTYPE_ACK: 6378 D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 6379 6380 /* Verify the ACK is valid */ 6381 idx = ibnd_desc->hdr.desc_handle; 6382 6383 if (idx >= VSW_RING_NUM_EL) { 6384 cmn_err(CE_WARN, "!vsw%d: corrupted ACK received " 6385 "(idx %ld)", vswp->instance, idx); 6386 return; 6387 } 6388 6389 if ((dp = ldcp->lane_out.dringp) == NULL) { 6390 DERR(vswp, "%s: no dring found", __func__); 6391 return; 6392 } 6393 6394 len = dp->num_descriptors; 6395 /* 6396 * If the descriptor we are being ACK'ed for is not the 6397 * one we expected, then pkts were lost somwhere, either 6398 * when we tried to send a msg, or a previous ACK msg from 6399 * our peer. In either case we now reclaim the descriptors 6400 * in the range from the last ACK we received up to the 6401 * current ACK. 6402 */ 6403 if (idx != dp->last_ack_recv) { 6404 DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)", 6405 __func__, dp->last_ack_recv, idx); 6406 num = idx >= dp->last_ack_recv ? 6407 idx - dp->last_ack_recv + 1: 6408 (len - dp->last_ack_recv + 1) + idx; 6409 } 6410 6411 /* 6412 * When we sent the in-band message to our peer we 6413 * marked the copy in our private ring as READY. We now 6414 * check that the descriptor we are being ACK'ed for is in 6415 * fact READY, i.e. it is one we have shared with our peer. 6416 * 6417 * If its not we flag an error, but still reset the descr 6418 * back to FREE. 6419 */ 6420 for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) { 6421 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6422 mutex_enter(&priv_addr->dstate_lock); 6423 if (priv_addr->dstate != VIO_DESC_READY) { 6424 DERR(vswp, "%s: (%ld) desc at index %ld not " 6425 "READY (0x%lx)", __func__, 6426 ldcp->ldc_id, idx, priv_addr->dstate); 6427 DERR(vswp, "%s: bound %d: ncookies %ld : " 6428 "datalen %ld", __func__, 6429 priv_addr->bound, priv_addr->ncookies, 6430 priv_addr->datalen); 6431 } 6432 D2(vswp, "%s: (%lld) freeing descp at %lld", __func__, 6433 ldcp->ldc_id, idx); 6434 /* release resources associated with sent msg */ 6435 bzero(priv_addr->datap, priv_addr->datalen); 6436 priv_addr->datalen = 0; 6437 priv_addr->dstate = VIO_DESC_FREE; 6438 mutex_exit(&priv_addr->dstate_lock); 6439 } 6440 /* update to next expected value */ 6441 dp->last_ack_recv = (idx + 1) % dp->num_descriptors; 6442 6443 break; 6444 6445 case VIO_SUBTYPE_NACK: 6446 DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 6447 6448 /* 6449 * We should only get a NACK if our peer doesn't like 6450 * something about a message we have sent it. If this 6451 * happens we just release the resources associated with 6452 * the message. (We are relying on higher layers to decide 6453 * whether or not to resend. 6454 */ 6455 6456 /* limit check */ 6457 idx = ibnd_desc->hdr.desc_handle; 6458 6459 if (idx >= VSW_RING_NUM_EL) { 6460 DERR(vswp, "%s: corrupted NACK received (idx %lld)", 6461 __func__, idx); 6462 return; 6463 } 6464 6465 if ((dp = ldcp->lane_out.dringp) == NULL) { 6466 DERR(vswp, "%s: no dring found", __func__); 6467 return; 6468 } 6469 6470 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 6471 6472 /* move to correct location in ring */ 6473 priv_addr += idx; 6474 6475 /* release resources associated with sent msg */ 6476 mutex_enter(&priv_addr->dstate_lock); 6477 bzero(priv_addr->datap, priv_addr->datalen); 6478 priv_addr->datalen = 0; 6479 priv_addr->dstate = VIO_DESC_FREE; 6480 mutex_exit(&priv_addr->dstate_lock); 6481 6482 break; 6483 6484 default: 6485 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 6486 ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype); 6487 } 6488 6489 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 6490 } 6491 6492 static void 6493 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag) 6494 { 6495 _NOTE(ARGUNUSED(epkt)) 6496 6497 vsw_t *vswp = ldcp->ldc_vswp; 6498 uint16_t env = tag.vio_subtype_env; 6499 6500 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 6501 6502 /* 6503 * Error vio_subtypes have yet to be defined. So for 6504 * the moment we can't do anything. 6505 */ 6506 D2(vswp, "%s: (%x) vio_subtype env", __func__, env); 6507 6508 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 6509 } 6510 6511 /* 6512 * Switch the given ethernet frame when operating in layer 2 mode. 6513 * 6514 * vswp: pointer to the vsw instance 6515 * mp: pointer to chain of ethernet frame(s) to be switched 6516 * caller: identifies the source of this frame as: 6517 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 6518 * 2. VSW_PHYSDEV - the physical ethernet device 6519 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 6520 * arg: argument provided by the caller. 6521 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 6522 * 2. for PHYSDEV - NULL 6523 * 3. for LOCALDEV - pointer to to this vsw_t(self) 6524 */ 6525 void 6526 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 6527 vsw_port_t *arg, mac_resource_handle_t mrh) 6528 { 6529 struct ether_header *ehp; 6530 vsw_port_t *port = NULL; 6531 mblk_t *bp, *ret_m; 6532 mblk_t *nmp = NULL; 6533 vsw_port_list_t *plist = &vswp->plist; 6534 6535 D1(vswp, "%s: enter (caller %d)", __func__, caller); 6536 6537 /* 6538 * PERF: rather than breaking up the chain here, scan it 6539 * to find all mblks heading to same destination and then 6540 * pass that sub-chain to the lower transmit functions. 6541 */ 6542 6543 /* process the chain of packets */ 6544 bp = mp; 6545 while (bp) { 6546 mp = bp; 6547 bp = bp->b_next; 6548 mp->b_next = mp->b_prev = NULL; 6549 ehp = (struct ether_header *)mp->b_rptr; 6550 6551 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 6552 __func__, MBLKSIZE(mp), MBLKL(mp)); 6553 6554 READ_ENTER(&vswp->if_lockrw); 6555 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 6556 /* 6557 * If destination is VSW_LOCALDEV (vsw as an eth 6558 * interface) and if the device is up & running, 6559 * send the packet up the stack on this host. 6560 * If the virtual interface is down, drop the packet. 6561 */ 6562 if (caller != VSW_LOCALDEV) { 6563 if (vswp->if_state & VSW_IF_UP) { 6564 RW_EXIT(&vswp->if_lockrw); 6565 mac_rx(vswp->if_mh, mrh, mp); 6566 } else { 6567 RW_EXIT(&vswp->if_lockrw); 6568 /* Interface down, drop pkt */ 6569 freemsg(mp); 6570 } 6571 } else { 6572 RW_EXIT(&vswp->if_lockrw); 6573 freemsg(mp); 6574 } 6575 continue; 6576 } 6577 RW_EXIT(&vswp->if_lockrw); 6578 6579 READ_ENTER(&plist->lockrw); 6580 port = vsw_lookup_fdb(vswp, ehp); 6581 if (port) { 6582 /* 6583 * Mark the port as in-use. 6584 */ 6585 mutex_enter(&port->ref_lock); 6586 port->ref_cnt++; 6587 mutex_exit(&port->ref_lock); 6588 RW_EXIT(&plist->lockrw); 6589 6590 /* 6591 * If plumbed and in promisc mode then copy msg 6592 * and send up the stack. 6593 */ 6594 READ_ENTER(&vswp->if_lockrw); 6595 if (VSW_U_P(vswp->if_state)) { 6596 RW_EXIT(&vswp->if_lockrw); 6597 nmp = copymsg(mp); 6598 if (nmp) 6599 mac_rx(vswp->if_mh, mrh, nmp); 6600 } else { 6601 RW_EXIT(&vswp->if_lockrw); 6602 } 6603 6604 /* 6605 * If the destination is in FDB, the packet 6606 * should be forwarded to the correponding 6607 * vsw_port (connected to a vnet device - 6608 * VSW_VNETPORT) 6609 */ 6610 (void) vsw_portsend(port, mp); 6611 6612 /* 6613 * Decrement use count in port and check if 6614 * should wake delete thread. 6615 */ 6616 mutex_enter(&port->ref_lock); 6617 port->ref_cnt--; 6618 if (port->ref_cnt == 0) 6619 cv_signal(&port->ref_cv); 6620 mutex_exit(&port->ref_lock); 6621 } else { 6622 RW_EXIT(&plist->lockrw); 6623 /* 6624 * Destination not in FDB. 6625 * 6626 * If the destination is broadcast or 6627 * multicast forward the packet to all 6628 * (VNETPORTs, PHYSDEV, LOCALDEV), 6629 * except the caller. 6630 */ 6631 if (IS_BROADCAST(ehp)) { 6632 D3(vswp, "%s: BROADCAST pkt", __func__); 6633 (void) vsw_forward_all(vswp, mp, 6634 caller, arg); 6635 } else if (IS_MULTICAST(ehp)) { 6636 D3(vswp, "%s: MULTICAST pkt", __func__); 6637 (void) vsw_forward_grp(vswp, mp, 6638 caller, arg); 6639 } else { 6640 /* 6641 * If the destination is unicast, and came 6642 * from either a logical network device or 6643 * the switch itself when it is plumbed, then 6644 * send it out on the physical device and also 6645 * up the stack if the logical interface is 6646 * in promiscious mode. 6647 * 6648 * NOTE: The assumption here is that if we 6649 * cannot find the destination in our fdb, its 6650 * a unicast address, and came from either a 6651 * vnet or down the stack (when plumbed) it 6652 * must be destinded for an ethernet device 6653 * outside our ldoms. 6654 */ 6655 if (caller == VSW_VNETPORT) { 6656 READ_ENTER(&vswp->if_lockrw); 6657 if (VSW_U_P(vswp->if_state)) { 6658 RW_EXIT(&vswp->if_lockrw); 6659 nmp = copymsg(mp); 6660 if (nmp) 6661 mac_rx(vswp->if_mh, 6662 mrh, nmp); 6663 } else { 6664 RW_EXIT(&vswp->if_lockrw); 6665 } 6666 if ((ret_m = vsw_tx_msg(vswp, mp)) 6667 != NULL) { 6668 DERR(vswp, "%s: drop mblks to " 6669 "phys dev", __func__); 6670 freemsg(ret_m); 6671 } 6672 6673 } else if (caller == VSW_PHYSDEV) { 6674 /* 6675 * Pkt seen because card in promisc 6676 * mode. Send up stack if plumbed in 6677 * promisc mode, else drop it. 6678 */ 6679 READ_ENTER(&vswp->if_lockrw); 6680 if (VSW_U_P(vswp->if_state)) { 6681 RW_EXIT(&vswp->if_lockrw); 6682 mac_rx(vswp->if_mh, mrh, mp); 6683 } else { 6684 RW_EXIT(&vswp->if_lockrw); 6685 freemsg(mp); 6686 } 6687 6688 } else if (caller == VSW_LOCALDEV) { 6689 /* 6690 * Pkt came down the stack, send out 6691 * over physical device. 6692 */ 6693 if ((ret_m = vsw_tx_msg(vswp, mp)) 6694 != NULL) { 6695 DERR(vswp, "%s: drop mblks to " 6696 "phys dev", __func__); 6697 freemsg(ret_m); 6698 } 6699 } 6700 } 6701 } 6702 } 6703 D1(vswp, "%s: exit\n", __func__); 6704 } 6705 6706 /* 6707 * Switch ethernet frame when in layer 3 mode (i.e. using IP 6708 * layer to do the routing). 6709 * 6710 * There is a large amount of overlap between this function and 6711 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 6712 * both these functions. 6713 */ 6714 void 6715 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 6716 vsw_port_t *arg, mac_resource_handle_t mrh) 6717 { 6718 struct ether_header *ehp; 6719 vsw_port_t *port = NULL; 6720 mblk_t *bp = NULL; 6721 vsw_port_list_t *plist = &vswp->plist; 6722 6723 D1(vswp, "%s: enter (caller %d)", __func__, caller); 6724 6725 /* 6726 * In layer 3 mode should only ever be switching packets 6727 * between IP layer and vnet devices. So make sure thats 6728 * who is invoking us. 6729 */ 6730 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 6731 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 6732 freemsgchain(mp); 6733 return; 6734 } 6735 6736 /* process the chain of packets */ 6737 bp = mp; 6738 while (bp) { 6739 mp = bp; 6740 bp = bp->b_next; 6741 mp->b_next = mp->b_prev = NULL; 6742 ehp = (struct ether_header *)mp->b_rptr; 6743 6744 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 6745 __func__, MBLKSIZE(mp), MBLKL(mp)); 6746 6747 READ_ENTER(&plist->lockrw); 6748 port = vsw_lookup_fdb(vswp, ehp); 6749 if (port) { 6750 /* 6751 * Mark port as in-use. 6752 */ 6753 mutex_enter(&port->ref_lock); 6754 port->ref_cnt++; 6755 mutex_exit(&port->ref_lock); 6756 RW_EXIT(&plist->lockrw); 6757 6758 D2(vswp, "%s: sending to target port", __func__); 6759 (void) vsw_portsend(port, mp); 6760 6761 /* 6762 * Finished with port so decrement ref count and 6763 * check if should wake delete thread. 6764 */ 6765 mutex_enter(&port->ref_lock); 6766 port->ref_cnt--; 6767 if (port->ref_cnt == 0) 6768 cv_signal(&port->ref_cv); 6769 mutex_exit(&port->ref_lock); 6770 } else { 6771 RW_EXIT(&plist->lockrw); 6772 /* 6773 * Destination not in FDB 6774 * 6775 * If the destination is broadcast or 6776 * multicast forward the packet to all 6777 * (VNETPORTs, PHYSDEV, LOCALDEV), 6778 * except the caller. 6779 */ 6780 if (IS_BROADCAST(ehp)) { 6781 D2(vswp, "%s: BROADCAST pkt", __func__); 6782 (void) vsw_forward_all(vswp, mp, 6783 caller, arg); 6784 } else if (IS_MULTICAST(ehp)) { 6785 D2(vswp, "%s: MULTICAST pkt", __func__); 6786 (void) vsw_forward_grp(vswp, mp, 6787 caller, arg); 6788 } else { 6789 /* 6790 * Unicast pkt from vnet that we don't have 6791 * an FDB entry for, so must be destinded for 6792 * the outside world. Attempt to send up to the 6793 * IP layer to allow it to deal with it. 6794 */ 6795 if (caller == VSW_VNETPORT) { 6796 READ_ENTER(&vswp->if_lockrw); 6797 if (vswp->if_state & VSW_IF_UP) { 6798 RW_EXIT(&vswp->if_lockrw); 6799 D2(vswp, "%s: sending up", 6800 __func__); 6801 mac_rx(vswp->if_mh, mrh, mp); 6802 } else { 6803 RW_EXIT(&vswp->if_lockrw); 6804 /* Interface down, drop pkt */ 6805 D2(vswp, "%s I/F down", 6806 __func__); 6807 freemsg(mp); 6808 } 6809 } 6810 } 6811 } 6812 } 6813 6814 D1(vswp, "%s: exit", __func__); 6815 } 6816 6817 /* 6818 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 6819 * except the caller (port on which frame arrived). 6820 */ 6821 static int 6822 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6823 { 6824 vsw_port_list_t *plist = &vswp->plist; 6825 vsw_port_t *portp; 6826 mblk_t *nmp = NULL; 6827 mblk_t *ret_m = NULL; 6828 int skip_port = 0; 6829 6830 D1(vswp, "vsw_forward_all: enter\n"); 6831 6832 /* 6833 * Broadcast message from inside ldoms so send to outside 6834 * world if in either of layer 2 modes. 6835 */ 6836 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6837 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6838 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 6839 6840 nmp = dupmsg(mp); 6841 if (nmp) { 6842 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6843 DERR(vswp, "%s: dropping pkt(s) " 6844 "consisting of %ld bytes of data for" 6845 " physical device", __func__, MBLKL(ret_m)); 6846 freemsg(ret_m); 6847 } 6848 } 6849 } 6850 6851 if (caller == VSW_VNETPORT) 6852 skip_port = 1; 6853 6854 /* 6855 * Broadcast message from other vnet (layer 2 or 3) or outside 6856 * world (layer 2 only), send up stack if plumbed. 6857 */ 6858 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 6859 READ_ENTER(&vswp->if_lockrw); 6860 if (vswp->if_state & VSW_IF_UP) { 6861 RW_EXIT(&vswp->if_lockrw); 6862 nmp = copymsg(mp); 6863 if (nmp) 6864 mac_rx(vswp->if_mh, NULL, nmp); 6865 } else { 6866 RW_EXIT(&vswp->if_lockrw); 6867 } 6868 } 6869 6870 /* send it to all VNETPORTs */ 6871 READ_ENTER(&plist->lockrw); 6872 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 6873 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 6874 /* 6875 * Caution ! - don't reorder these two checks as arg 6876 * will be NULL if the caller is PHYSDEV. skip_port is 6877 * only set if caller is VNETPORT. 6878 */ 6879 if ((skip_port) && (portp == arg)) 6880 continue; 6881 else { 6882 nmp = dupmsg(mp); 6883 if (nmp) { 6884 (void) vsw_portsend(portp, nmp); 6885 } else { 6886 DERR(vswp, "vsw_forward_all: nmp NULL"); 6887 } 6888 } 6889 } 6890 RW_EXIT(&plist->lockrw); 6891 6892 freemsg(mp); 6893 6894 D1(vswp, "vsw_forward_all: exit\n"); 6895 return (0); 6896 } 6897 6898 /* 6899 * Forward pkts to any devices or interfaces which have registered 6900 * an interest in them (i.e. multicast groups). 6901 */ 6902 static int 6903 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6904 { 6905 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 6906 mfdb_ent_t *entp = NULL; 6907 mfdb_ent_t *tpp = NULL; 6908 vsw_port_t *port; 6909 uint64_t key = 0; 6910 mblk_t *nmp = NULL; 6911 mblk_t *ret_m = NULL; 6912 boolean_t check_if = B_TRUE; 6913 6914 /* 6915 * Convert address to hash table key 6916 */ 6917 KEY_HASH(key, ehp->ether_dhost); 6918 6919 D1(vswp, "%s: key 0x%llx", __func__, key); 6920 6921 /* 6922 * If pkt came from either a vnet or down the stack (if we are 6923 * plumbed) and we are in layer 2 mode, then we send the pkt out 6924 * over the physical adapter, and then check to see if any other 6925 * vnets are interested in it. 6926 */ 6927 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6928 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6929 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 6930 nmp = dupmsg(mp); 6931 if (nmp) { 6932 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6933 DERR(vswp, "%s: dropping pkt(s) " 6934 "consisting of %ld bytes of " 6935 "data for physical device", 6936 __func__, MBLKL(ret_m)); 6937 freemsg(ret_m); 6938 } 6939 } 6940 } 6941 6942 READ_ENTER(&vswp->mfdbrw); 6943 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 6944 (mod_hash_val_t *)&entp) != 0) { 6945 D3(vswp, "%s: no table entry found for addr 0x%llx", 6946 __func__, key); 6947 } else { 6948 /* 6949 * Send to list of devices associated with this address... 6950 */ 6951 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 6952 6953 /* dont send to ourselves */ 6954 if ((caller == VSW_VNETPORT) && 6955 (tpp->d_addr == (void *)arg)) { 6956 port = (vsw_port_t *)tpp->d_addr; 6957 D3(vswp, "%s: not sending to ourselves" 6958 " : port %d", __func__, 6959 port->p_instance); 6960 continue; 6961 6962 } else if ((caller == VSW_LOCALDEV) && 6963 (tpp->d_type == VSW_LOCALDEV)) { 6964 D3(vswp, "%s: not sending back up stack", 6965 __func__); 6966 continue; 6967 } 6968 6969 if (tpp->d_type == VSW_VNETPORT) { 6970 port = (vsw_port_t *)tpp->d_addr; 6971 D3(vswp, "%s: sending to port %ld for " 6972 " addr 0x%llx", __func__, 6973 port->p_instance, key); 6974 6975 nmp = dupmsg(mp); 6976 if (nmp) 6977 (void) vsw_portsend(port, nmp); 6978 } else { 6979 if (vswp->if_state & VSW_IF_UP) { 6980 nmp = copymsg(mp); 6981 if (nmp) 6982 mac_rx(vswp->if_mh, NULL, nmp); 6983 check_if = B_FALSE; 6984 D3(vswp, "%s: sending up stack" 6985 " for addr 0x%llx", __func__, 6986 key); 6987 } 6988 } 6989 } 6990 } 6991 6992 RW_EXIT(&vswp->mfdbrw); 6993 6994 /* 6995 * If the pkt came from either a vnet or from physical device, 6996 * and if we havent already sent the pkt up the stack then we 6997 * check now if we can/should (i.e. the interface is plumbed 6998 * and in promisc mode). 6999 */ 7000 if ((check_if) && 7001 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 7002 READ_ENTER(&vswp->if_lockrw); 7003 if (VSW_U_P(vswp->if_state)) { 7004 RW_EXIT(&vswp->if_lockrw); 7005 D3(vswp, "%s: (caller %d) finally sending up stack" 7006 " for addr 0x%llx", __func__, caller, key); 7007 nmp = copymsg(mp); 7008 if (nmp) 7009 mac_rx(vswp->if_mh, NULL, nmp); 7010 } else { 7011 RW_EXIT(&vswp->if_lockrw); 7012 } 7013 } 7014 7015 freemsg(mp); 7016 7017 D1(vswp, "%s: exit", __func__); 7018 7019 return (0); 7020 } 7021 7022 /* transmit the packet over the given port */ 7023 static int 7024 vsw_portsend(vsw_port_t *port, mblk_t *mp) 7025 { 7026 vsw_ldc_list_t *ldcl = &port->p_ldclist; 7027 vsw_ldc_t *ldcp; 7028 int status = 0; 7029 7030 7031 READ_ENTER(&ldcl->lockrw); 7032 /* 7033 * Note for now, we have a single channel. 7034 */ 7035 ldcp = ldcl->head; 7036 if (ldcp == NULL) { 7037 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 7038 freemsg(mp); 7039 RW_EXIT(&ldcl->lockrw); 7040 return (1); 7041 } 7042 7043 /* 7044 * Send the message out using the appropriate 7045 * transmit function which will free mblock when it 7046 * is finished with it. 7047 */ 7048 mutex_enter(&port->tx_lock); 7049 if (port->transmit != NULL) 7050 status = (*port->transmit)(ldcp, mp); 7051 else { 7052 freemsg(mp); 7053 } 7054 mutex_exit(&port->tx_lock); 7055 7056 RW_EXIT(&ldcl->lockrw); 7057 7058 return (status); 7059 } 7060 7061 /* 7062 * Send packet out via descriptor ring to a logical device. 7063 */ 7064 static int 7065 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 7066 { 7067 vio_dring_msg_t dring_pkt; 7068 dring_info_t *dp = NULL; 7069 vsw_private_desc_t *priv_desc = NULL; 7070 vnet_public_desc_t *pub = NULL; 7071 vsw_t *vswp = ldcp->ldc_vswp; 7072 mblk_t *bp; 7073 size_t n, size; 7074 caddr_t bufp; 7075 int idx; 7076 int status = LDC_TX_SUCCESS; 7077 7078 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 7079 7080 /* TODO: make test a macro */ 7081 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 7082 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 7083 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 7084 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 7085 ldcp->lane_out.lstate); 7086 freemsg(mp); 7087 return (LDC_TX_FAILURE); 7088 } 7089 7090 /* 7091 * Note - using first ring only, this may change 7092 * in the future. 7093 */ 7094 READ_ENTER(&ldcp->lane_out.dlistrw); 7095 if ((dp = ldcp->lane_out.dringp) == NULL) { 7096 RW_EXIT(&ldcp->lane_out.dlistrw); 7097 DERR(vswp, "%s(%lld): no dring for outbound lane on" 7098 " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); 7099 freemsg(mp); 7100 return (LDC_TX_FAILURE); 7101 } 7102 7103 size = msgsize(mp); 7104 if (size > (size_t)ETHERMAX) { 7105 RW_EXIT(&ldcp->lane_out.dlistrw); 7106 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 7107 ldcp->ldc_id, size); 7108 freemsg(mp); 7109 return (LDC_TX_FAILURE); 7110 } 7111 7112 /* 7113 * Find a free descriptor 7114 * 7115 * Note: for the moment we are assuming that we will only 7116 * have one dring going from the switch to each of its 7117 * peers. This may change in the future. 7118 */ 7119 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 7120 D2(vswp, "%s(%lld): no descriptor available for ring " 7121 "at 0x%llx", __func__, ldcp->ldc_id, dp); 7122 7123 /* nothing more we can do */ 7124 status = LDC_TX_NORESOURCES; 7125 goto vsw_dringsend_free_exit; 7126 } else { 7127 D2(vswp, "%s(%lld): free private descriptor found at pos " 7128 "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 7129 priv_desc); 7130 } 7131 7132 /* copy data into the descriptor */ 7133 bufp = priv_desc->datap; 7134 bufp += VNET_IPALIGN; 7135 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 7136 n = MBLKL(bp); 7137 bcopy(bp->b_rptr, bufp, n); 7138 bufp += n; 7139 } 7140 7141 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 7142 7143 pub = priv_desc->descp; 7144 pub->nbytes = priv_desc->datalen; 7145 7146 mutex_enter(&priv_desc->dstate_lock); 7147 pub->hdr.dstate = VIO_DESC_READY; 7148 mutex_exit(&priv_desc->dstate_lock); 7149 7150 /* 7151 * Determine whether or not we need to send a message to our 7152 * peer prompting them to read our newly updated descriptor(s). 7153 */ 7154 mutex_enter(&dp->restart_lock); 7155 if (dp->restart_reqd) { 7156 dp->restart_reqd = B_FALSE; 7157 mutex_exit(&dp->restart_lock); 7158 7159 /* 7160 * Send a vio_dring_msg to peer to prompt them to read 7161 * the updated descriptor ring. 7162 */ 7163 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 7164 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 7165 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 7166 dring_pkt.tag.vio_sid = ldcp->local_session; 7167 7168 /* Note - for now using first ring */ 7169 dring_pkt.dring_ident = dp->ident; 7170 7171 mutex_enter(&ldcp->lane_out.seq_lock); 7172 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 7173 mutex_exit(&ldcp->lane_out.seq_lock); 7174 7175 /* 7176 * If last_ack_recv is -1 then we know we've not 7177 * received any ack's yet, so this must be the first 7178 * msg sent, so set the start to the begining of the ring. 7179 */ 7180 mutex_enter(&dp->dlock); 7181 if (dp->last_ack_recv == -1) { 7182 dring_pkt.start_idx = 0; 7183 } else { 7184 dring_pkt.start_idx = (dp->last_ack_recv + 1) % 7185 dp->num_descriptors; 7186 } 7187 dring_pkt.end_idx = -1; 7188 mutex_exit(&dp->dlock); 7189 7190 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 7191 ldcp->ldc_id, dp, dring_pkt.dring_ident); 7192 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 7193 __func__, ldcp->ldc_id, dring_pkt.start_idx, 7194 dring_pkt.end_idx, dring_pkt.seq_num); 7195 7196 RW_EXIT(&ldcp->lane_out.dlistrw); 7197 7198 (void) vsw_send_msg(ldcp, (void *)&dring_pkt, 7199 sizeof (vio_dring_msg_t), B_TRUE); 7200 7201 /* free the message block */ 7202 freemsg(mp); 7203 return (status); 7204 7205 } else { 7206 mutex_exit(&dp->restart_lock); 7207 D2(vswp, "%s(%lld): updating descp %d", __func__, 7208 ldcp->ldc_id, idx); 7209 } 7210 7211 vsw_dringsend_free_exit: 7212 7213 RW_EXIT(&ldcp->lane_out.dlistrw); 7214 7215 /* free the message block */ 7216 freemsg(mp); 7217 7218 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 7219 return (status); 7220 } 7221 7222 /* 7223 * Send an in-band descriptor message over ldc. 7224 */ 7225 static int 7226 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 7227 { 7228 vsw_t *vswp = ldcp->ldc_vswp; 7229 vnet_ibnd_desc_t ibnd_msg; 7230 vsw_private_desc_t *priv_desc = NULL; 7231 dring_info_t *dp = NULL; 7232 size_t n, size = 0; 7233 caddr_t bufp; 7234 mblk_t *bp; 7235 int idx, i; 7236 int status = LDC_TX_SUCCESS; 7237 static int warn_msg = 1; 7238 7239 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 7240 7241 ASSERT(mp != NULL); 7242 7243 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 7244 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 7245 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 7246 __func__, ldcp->ldc_id, ldcp->ldc_status, 7247 ldcp->lane_out.lstate); 7248 freemsg(mp); 7249 return (LDC_TX_FAILURE); 7250 } 7251 7252 /* 7253 * only expect single dring to exist, which we use 7254 * as an internal buffer, rather than a transfer channel. 7255 */ 7256 READ_ENTER(&ldcp->lane_out.dlistrw); 7257 if ((dp = ldcp->lane_out.dringp) == NULL) { 7258 DERR(vswp, "%s(%lld): no dring for outbound lane", 7259 __func__, ldcp->ldc_id); 7260 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", 7261 __func__, ldcp->ldc_id, ldcp->ldc_status, 7262 ldcp->lane_out.lstate); 7263 RW_EXIT(&ldcp->lane_out.dlistrw); 7264 freemsg(mp); 7265 return (LDC_TX_FAILURE); 7266 } 7267 7268 size = msgsize(mp); 7269 if (size > (size_t)ETHERMAX) { 7270 RW_EXIT(&ldcp->lane_out.dlistrw); 7271 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 7272 ldcp->ldc_id, size); 7273 freemsg(mp); 7274 return (LDC_TX_FAILURE); 7275 } 7276 7277 /* 7278 * Find a free descriptor in our buffer ring 7279 */ 7280 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 7281 RW_EXIT(&ldcp->lane_out.dlistrw); 7282 if (warn_msg) { 7283 DERR(vswp, "%s(%lld): no descriptor available for ring " 7284 "at 0x%llx", __func__, ldcp->ldc_id, dp); 7285 warn_msg = 0; 7286 } 7287 7288 /* nothing more we can do */ 7289 status = LDC_TX_NORESOURCES; 7290 goto vsw_descrsend_free_exit; 7291 } else { 7292 D2(vswp, "%s(%lld): free private descriptor found at pos " 7293 "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, 7294 priv_desc); 7295 warn_msg = 1; 7296 } 7297 7298 /* copy data into the descriptor */ 7299 bufp = priv_desc->datap; 7300 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 7301 n = MBLKL(bp); 7302 bcopy(bp->b_rptr, bufp, n); 7303 bufp += n; 7304 } 7305 7306 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 7307 7308 /* create and send the in-band descp msg */ 7309 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 7310 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 7311 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 7312 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 7313 7314 mutex_enter(&ldcp->lane_out.seq_lock); 7315 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 7316 mutex_exit(&ldcp->lane_out.seq_lock); 7317 7318 /* 7319 * Copy the mem cookies describing the data from the 7320 * private region of the descriptor ring into the inband 7321 * descriptor. 7322 */ 7323 for (i = 0; i < priv_desc->ncookies; i++) { 7324 bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i], 7325 sizeof (ldc_mem_cookie_t)); 7326 } 7327 7328 ibnd_msg.hdr.desc_handle = idx; 7329 ibnd_msg.ncookies = priv_desc->ncookies; 7330 ibnd_msg.nbytes = size; 7331 7332 RW_EXIT(&ldcp->lane_out.dlistrw); 7333 7334 (void) vsw_send_msg(ldcp, (void *)&ibnd_msg, 7335 sizeof (vnet_ibnd_desc_t), B_TRUE); 7336 7337 vsw_descrsend_free_exit: 7338 7339 /* free the allocated message blocks */ 7340 freemsg(mp); 7341 7342 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 7343 return (status); 7344 } 7345 7346 static void 7347 vsw_send_ver(void *arg) 7348 { 7349 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 7350 vsw_t *vswp = ldcp->ldc_vswp; 7351 lane_t *lp = &ldcp->lane_out; 7352 vio_ver_msg_t ver_msg; 7353 7354 D1(vswp, "%s enter", __func__); 7355 7356 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 7357 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 7358 ver_msg.tag.vio_subtype_env = VIO_VER_INFO; 7359 ver_msg.tag.vio_sid = ldcp->local_session; 7360 7361 ver_msg.ver_major = vsw_versions[0].ver_major; 7362 ver_msg.ver_minor = vsw_versions[0].ver_minor; 7363 ver_msg.dev_class = VDEV_NETWORK_SWITCH; 7364 7365 lp->lstate |= VSW_VER_INFO_SENT; 7366 lp->ver_major = ver_msg.ver_major; 7367 lp->ver_minor = ver_msg.ver_minor; 7368 7369 DUMP_TAG(ver_msg.tag); 7370 7371 (void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE); 7372 7373 D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id); 7374 } 7375 7376 static void 7377 vsw_send_attr(vsw_ldc_t *ldcp) 7378 { 7379 vsw_t *vswp = ldcp->ldc_vswp; 7380 lane_t *lp = &ldcp->lane_out; 7381 vnet_attr_msg_t attr_msg; 7382 7383 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 7384 7385 /* 7386 * Subtype is set to INFO by default 7387 */ 7388 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 7389 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 7390 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO; 7391 attr_msg.tag.vio_sid = ldcp->local_session; 7392 7393 /* payload copied from default settings for lane */ 7394 attr_msg.mtu = lp->mtu; 7395 attr_msg.addr_type = lp->addr_type; 7396 attr_msg.xfer_mode = lp->xfer_mode; 7397 attr_msg.ack_freq = lp->xfer_mode; 7398 7399 READ_ENTER(&vswp->if_lockrw); 7400 bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL); 7401 RW_EXIT(&vswp->if_lockrw); 7402 7403 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; 7404 7405 DUMP_TAG(attr_msg.tag); 7406 7407 (void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE); 7408 7409 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 7410 } 7411 7412 /* 7413 * Create dring info msg (which also results in the creation of 7414 * a dring). 7415 */ 7416 static vio_dring_reg_msg_t * 7417 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp) 7418 { 7419 vio_dring_reg_msg_t *mp; 7420 dring_info_t *dp; 7421 vsw_t *vswp = ldcp->ldc_vswp; 7422 7423 D1(vswp, "vsw_create_dring_info_pkt enter\n"); 7424 7425 /* 7426 * If we can't create a dring, obviously no point sending 7427 * a message. 7428 */ 7429 if ((dp = vsw_create_dring(ldcp)) == NULL) 7430 return (NULL); 7431 7432 mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); 7433 7434 mp->tag.vio_msgtype = VIO_TYPE_CTRL; 7435 mp->tag.vio_subtype = VIO_SUBTYPE_INFO; 7436 mp->tag.vio_subtype_env = VIO_DRING_REG; 7437 mp->tag.vio_sid = ldcp->local_session; 7438 7439 /* payload */ 7440 mp->num_descriptors = dp->num_descriptors; 7441 mp->descriptor_size = dp->descriptor_size; 7442 mp->options = dp->options; 7443 mp->ncookies = dp->ncookies; 7444 bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); 7445 7446 mp->dring_ident = 0; 7447 7448 D1(vswp, "vsw_create_dring_info_pkt exit\n"); 7449 7450 return (mp); 7451 } 7452 7453 static void 7454 vsw_send_dring_info(vsw_ldc_t *ldcp) 7455 { 7456 vio_dring_reg_msg_t *dring_msg; 7457 vsw_t *vswp = ldcp->ldc_vswp; 7458 7459 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); 7460 7461 dring_msg = vsw_create_dring_info_pkt(ldcp); 7462 if (dring_msg == NULL) { 7463 cmn_err(CE_WARN, "!vsw%d: %s: error creating msg", 7464 vswp->instance, __func__); 7465 return; 7466 } 7467 7468 ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; 7469 7470 DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); 7471 7472 (void) vsw_send_msg(ldcp, dring_msg, 7473 sizeof (vio_dring_reg_msg_t), B_TRUE); 7474 7475 kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); 7476 7477 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); 7478 } 7479 7480 static void 7481 vsw_send_rdx(vsw_ldc_t *ldcp) 7482 { 7483 vsw_t *vswp = ldcp->ldc_vswp; 7484 vio_rdx_msg_t rdx_msg; 7485 7486 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 7487 7488 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 7489 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 7490 rdx_msg.tag.vio_subtype_env = VIO_RDX; 7491 rdx_msg.tag.vio_sid = ldcp->local_session; 7492 7493 ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT; 7494 7495 DUMP_TAG(rdx_msg.tag); 7496 7497 (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE); 7498 7499 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 7500 } 7501 7502 /* 7503 * Generic routine to send message out over ldc channel. 7504 * 7505 * It is possible that when we attempt to write over the ldc channel 7506 * that we get notified that it has been reset. Depending on the value 7507 * of the handle_reset flag we either handle that event here or simply 7508 * notify the caller that the channel was reset. 7509 */ 7510 static int 7511 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset) 7512 { 7513 int rv; 7514 size_t msglen = size; 7515 vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp; 7516 vsw_t *vswp = ldcp->ldc_vswp; 7517 7518 D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes", 7519 ldcp->ldc_id, size); 7520 7521 D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype); 7522 D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype); 7523 D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env); 7524 7525 mutex_enter(&ldcp->ldc_txlock); 7526 do { 7527 msglen = size; 7528 rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen); 7529 } while (rv == EWOULDBLOCK && --vsw_wretries > 0); 7530 7531 if ((rv != 0) || (msglen != size)) { 7532 DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) " 7533 "rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id, 7534 rv, size, msglen); 7535 } 7536 mutex_exit(&ldcp->ldc_txlock); 7537 7538 /* 7539 * If channel has been reset we either handle it here or 7540 * simply report back that it has been reset and let caller 7541 * decide what to do. 7542 */ 7543 if (rv == ECONNRESET) { 7544 DWARN(vswp, "%s (%lld) channel reset", 7545 __func__, ldcp->ldc_id); 7546 7547 /* 7548 * N.B - must never be holding the dlistrw lock when 7549 * we do a reset of the channel. 7550 */ 7551 if (handle_reset) { 7552 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 7553 } 7554 } 7555 7556 return (rv); 7557 } 7558 7559 /* 7560 * Add an entry into FDB, for the given mac address and port_id. 7561 * Returns 0 on success, 1 on failure. 7562 * 7563 * Lock protecting FDB must be held by calling process. 7564 */ 7565 static int 7566 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 7567 { 7568 uint64_t addr = 0; 7569 7570 D1(vswp, "%s: enter", __func__); 7571 7572 KEY_HASH(addr, port->p_macaddr); 7573 7574 D2(vswp, "%s: key = 0x%llx", __func__, addr); 7575 7576 /* 7577 * Note: duplicate keys will be rejected by mod_hash. 7578 */ 7579 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 7580 (mod_hash_val_t)port) != 0) { 7581 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 7582 return (1); 7583 } 7584 7585 D1(vswp, "%s: exit", __func__); 7586 return (0); 7587 } 7588 7589 /* 7590 * Remove an entry from FDB. 7591 * Returns 0 on success, 1 on failure. 7592 */ 7593 static int 7594 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 7595 { 7596 uint64_t addr = 0; 7597 7598 D1(vswp, "%s: enter", __func__); 7599 7600 KEY_HASH(addr, port->p_macaddr); 7601 7602 D2(vswp, "%s: key = 0x%llx", __func__, addr); 7603 7604 (void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr); 7605 7606 D1(vswp, "%s: enter", __func__); 7607 7608 return (0); 7609 } 7610 7611 /* 7612 * Search fdb for a given mac address. 7613 * Returns pointer to the entry if found, else returns NULL. 7614 */ 7615 static vsw_port_t * 7616 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 7617 { 7618 uint64_t key = 0; 7619 vsw_port_t *port = NULL; 7620 7621 D1(vswp, "%s: enter", __func__); 7622 7623 KEY_HASH(key, ehp->ether_dhost); 7624 7625 D2(vswp, "%s: key = 0x%llx", __func__, key); 7626 7627 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 7628 (mod_hash_val_t *)&port) != 0) { 7629 D2(vswp, "%s: no port found", __func__); 7630 return (NULL); 7631 } 7632 7633 D1(vswp, "%s: exit", __func__); 7634 7635 return (port); 7636 } 7637 7638 /* 7639 * Add or remove multicast address(es). 7640 * 7641 * Returns 0 on success, 1 on failure. 7642 */ 7643 static int 7644 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 7645 { 7646 mcst_addr_t *mcst_p = NULL; 7647 vsw_t *vswp = port->p_vswp; 7648 uint64_t addr = 0x0; 7649 int i; 7650 7651 D1(vswp, "%s: enter", __func__); 7652 7653 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 7654 7655 mutex_enter(&vswp->mac_lock); 7656 if (vswp->mh == NULL) { 7657 mutex_exit(&vswp->mac_lock); 7658 return (1); 7659 } 7660 mutex_exit(&vswp->mac_lock); 7661 7662 for (i = 0; i < mcst_pkt->count; i++) { 7663 /* 7664 * Convert address into form that can be used 7665 * as hash table key. 7666 */ 7667 KEY_HASH(addr, mcst_pkt->mca[i]); 7668 7669 /* 7670 * Add or delete the specified address/port combination. 7671 */ 7672 if (mcst_pkt->set == 0x1) { 7673 D3(vswp, "%s: adding multicast address 0x%llx for " 7674 "port %ld", __func__, addr, port->p_instance); 7675 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 7676 /* 7677 * Update the list of multicast 7678 * addresses contained within the 7679 * port structure to include this new 7680 * one. 7681 */ 7682 mcst_p = kmem_alloc(sizeof (mcst_addr_t), 7683 KM_NOSLEEP); 7684 if (mcst_p == NULL) { 7685 DERR(vswp, "%s: unable to alloc mem", 7686 __func__); 7687 return (1); 7688 } 7689 7690 mcst_p->nextp = NULL; 7691 mcst_p->addr = addr; 7692 7693 mutex_enter(&port->mca_lock); 7694 mcst_p->nextp = port->mcap; 7695 port->mcap = mcst_p; 7696 mutex_exit(&port->mca_lock); 7697 7698 /* 7699 * Program the address into HW. If the addr 7700 * has already been programmed then the MAC 7701 * just increments a ref counter (which is 7702 * used when the address is being deleted) 7703 */ 7704 mutex_enter(&vswp->mac_lock); 7705 if ((vswp->mh == NULL) || 7706 mac_multicst_add(vswp->mh, 7707 (uchar_t *)&mcst_pkt->mca[i])) { 7708 mutex_exit(&vswp->mac_lock); 7709 cmn_err(CE_WARN, "!vsw%d: unable to " 7710 "add multicast address", 7711 vswp->instance); 7712 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7713 addr, port); 7714 vsw_del_addr(VSW_VNETPORT, port, addr); 7715 return (1); 7716 } 7717 mutex_exit(&vswp->mac_lock); 7718 7719 } else { 7720 DERR(vswp, "%s: error adding multicast " 7721 "address 0x%llx for port %ld", 7722 __func__, addr, port->p_instance); 7723 return (1); 7724 } 7725 } else { 7726 /* 7727 * Delete an entry from the multicast hash 7728 * table and update the address list 7729 * appropriately. 7730 */ 7731 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 7732 D3(vswp, "%s: deleting multicast address " 7733 "0x%llx for port %ld", __func__, addr, 7734 port->p_instance); 7735 7736 vsw_del_addr(VSW_VNETPORT, port, addr); 7737 7738 /* 7739 * Remove the address from HW. The address 7740 * will actually only be removed once the ref 7741 * count within the MAC layer has dropped to 7742 * zero. I.e. we can safely call this fn even 7743 * if other ports are interested in this 7744 * address. 7745 */ 7746 mutex_enter(&vswp->mac_lock); 7747 if ((vswp->mh == NULL) || 7748 mac_multicst_remove(vswp->mh, 7749 (uchar_t *)&mcst_pkt->mca[i])) { 7750 mutex_exit(&vswp->mac_lock); 7751 cmn_err(CE_WARN, "!vsw%d: unable to " 7752 "remove multicast address", 7753 vswp->instance); 7754 return (1); 7755 } 7756 mutex_exit(&vswp->mac_lock); 7757 7758 } else { 7759 DERR(vswp, "%s: error deleting multicast " 7760 "addr 0x%llx for port %ld", 7761 __func__, addr, port->p_instance); 7762 return (1); 7763 } 7764 } 7765 } 7766 D1(vswp, "%s: exit", __func__); 7767 return (0); 7768 } 7769 7770 /* 7771 * Add a new multicast entry. 7772 * 7773 * Search hash table based on address. If match found then 7774 * update associated val (which is chain of ports), otherwise 7775 * create new key/val (addr/port) pair and insert into table. 7776 */ 7777 static int 7778 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7779 { 7780 int dup = 0; 7781 int rv = 0; 7782 mfdb_ent_t *ment = NULL; 7783 mfdb_ent_t *tmp_ent = NULL; 7784 mfdb_ent_t *new_ent = NULL; 7785 void *tgt = NULL; 7786 7787 if (devtype == VSW_VNETPORT) { 7788 /* 7789 * Being invoked from a vnet. 7790 */ 7791 ASSERT(arg != NULL); 7792 tgt = arg; 7793 D2(NULL, "%s: port %d : address 0x%llx", __func__, 7794 ((vsw_port_t *)arg)->p_instance, addr); 7795 } else { 7796 /* 7797 * We are being invoked via the m_multicst mac entry 7798 * point. 7799 */ 7800 D2(NULL, "%s: address 0x%llx", __func__, addr); 7801 tgt = (void *)vswp; 7802 } 7803 7804 WRITE_ENTER(&vswp->mfdbrw); 7805 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7806 (mod_hash_val_t *)&ment) != 0) { 7807 7808 /* address not currently in table */ 7809 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7810 ment->d_addr = (void *)tgt; 7811 ment->d_type = devtype; 7812 ment->nextp = NULL; 7813 7814 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 7815 (mod_hash_val_t)ment) != 0) { 7816 DERR(vswp, "%s: hash table insertion failed", __func__); 7817 kmem_free(ment, sizeof (mfdb_ent_t)); 7818 rv = 1; 7819 } else { 7820 D2(vswp, "%s: added initial entry for 0x%llx to " 7821 "table", __func__, addr); 7822 } 7823 } else { 7824 /* 7825 * Address in table. Check to see if specified port 7826 * is already associated with the address. If not add 7827 * it now. 7828 */ 7829 tmp_ent = ment; 7830 while (tmp_ent != NULL) { 7831 if (tmp_ent->d_addr == (void *)tgt) { 7832 if (devtype == VSW_VNETPORT) { 7833 DERR(vswp, "%s: duplicate port entry " 7834 "found for portid %ld and key " 7835 "0x%llx", __func__, 7836 ((vsw_port_t *)arg)->p_instance, 7837 addr); 7838 } else { 7839 DERR(vswp, "%s: duplicate entry found" 7840 "for key 0x%llx", 7841 __func__, addr); 7842 } 7843 rv = 1; 7844 dup = 1; 7845 break; 7846 } 7847 tmp_ent = tmp_ent->nextp; 7848 } 7849 7850 /* 7851 * Port not on list so add it to end now. 7852 */ 7853 if (0 == dup) { 7854 D2(vswp, "%s: added entry for 0x%llx to table", 7855 __func__, addr); 7856 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7857 new_ent->d_addr = (void *)tgt; 7858 new_ent->d_type = devtype; 7859 new_ent->nextp = NULL; 7860 7861 tmp_ent = ment; 7862 while (tmp_ent->nextp != NULL) 7863 tmp_ent = tmp_ent->nextp; 7864 7865 tmp_ent->nextp = new_ent; 7866 } 7867 } 7868 7869 RW_EXIT(&vswp->mfdbrw); 7870 return (rv); 7871 } 7872 7873 /* 7874 * Remove a multicast entry from the hashtable. 7875 * 7876 * Search hash table based on address. If match found, scan 7877 * list of ports associated with address. If specified port 7878 * found remove it from list. 7879 */ 7880 static int 7881 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7882 { 7883 mfdb_ent_t *ment = NULL; 7884 mfdb_ent_t *curr_p, *prev_p; 7885 void *tgt = NULL; 7886 7887 D1(vswp, "%s: enter", __func__); 7888 7889 if (devtype == VSW_VNETPORT) { 7890 tgt = (vsw_port_t *)arg; 7891 D2(vswp, "%s: removing port %d from mFDB for address" 7892 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, 7893 addr); 7894 } else { 7895 D2(vswp, "%s: removing entry", __func__); 7896 tgt = (void *)vswp; 7897 } 7898 7899 WRITE_ENTER(&vswp->mfdbrw); 7900 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7901 (mod_hash_val_t *)&ment) != 0) { 7902 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 7903 RW_EXIT(&vswp->mfdbrw); 7904 return (1); 7905 } 7906 7907 prev_p = curr_p = ment; 7908 7909 while (curr_p != NULL) { 7910 if (curr_p->d_addr == (void *)tgt) { 7911 if (devtype == VSW_VNETPORT) { 7912 D2(vswp, "%s: port %d found", __func__, 7913 ((vsw_port_t *)tgt)->p_instance); 7914 } else { 7915 D2(vswp, "%s: instance found", __func__); 7916 } 7917 7918 if (prev_p == curr_p) { 7919 /* 7920 * head of list, if no other element is in 7921 * list then destroy this entry, otherwise 7922 * just replace it with updated value. 7923 */ 7924 ment = curr_p->nextp; 7925 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7926 if (ment == NULL) { 7927 (void) mod_hash_destroy(vswp->mfdb, 7928 (mod_hash_val_t)addr); 7929 } else { 7930 (void) mod_hash_replace(vswp->mfdb, 7931 (mod_hash_key_t)addr, 7932 (mod_hash_val_t)ment); 7933 } 7934 } else { 7935 /* 7936 * Not head of list, no need to do 7937 * replacement, just adjust list pointers. 7938 */ 7939 prev_p->nextp = curr_p->nextp; 7940 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7941 } 7942 break; 7943 } 7944 7945 prev_p = curr_p; 7946 curr_p = curr_p->nextp; 7947 } 7948 7949 RW_EXIT(&vswp->mfdbrw); 7950 7951 D1(vswp, "%s: exit", __func__); 7952 7953 return (0); 7954 } 7955 7956 /* 7957 * Port is being deleted, but has registered an interest in one 7958 * or more multicast groups. Using the list of addresses maintained 7959 * within the port structure find the appropriate entry in the hash 7960 * table and remove this port from the list of interested ports. 7961 */ 7962 static void 7963 vsw_del_mcst_port(vsw_port_t *port) 7964 { 7965 mcst_addr_t *mcst_p = NULL; 7966 vsw_t *vswp = port->p_vswp; 7967 7968 D1(vswp, "%s: enter", __func__); 7969 7970 mutex_enter(&port->mca_lock); 7971 while (port->mcap != NULL) { 7972 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7973 port->mcap->addr, port); 7974 7975 mcst_p = port->mcap->nextp; 7976 kmem_free(port->mcap, sizeof (mcst_addr_t)); 7977 port->mcap = mcst_p; 7978 } 7979 mutex_exit(&port->mca_lock); 7980 7981 D1(vswp, "%s: exit", __func__); 7982 } 7983 7984 /* 7985 * This vsw instance is detaching, but has registered an interest in one 7986 * or more multicast groups. Using the list of addresses maintained 7987 * within the vsw structure find the appropriate entry in the hash 7988 * table and remove this instance from the list of interested ports. 7989 */ 7990 static void 7991 vsw_del_mcst_vsw(vsw_t *vswp) 7992 { 7993 mcst_addr_t *next_p = NULL; 7994 7995 D1(vswp, "%s: enter", __func__); 7996 7997 mutex_enter(&vswp->mca_lock); 7998 7999 while (vswp->mcap != NULL) { 8000 DERR(vswp, "%s: deleting addr 0x%llx", 8001 __func__, vswp->mcap->addr); 8002 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, 8003 vswp->mcap->addr, NULL); 8004 8005 next_p = vswp->mcap->nextp; 8006 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 8007 vswp->mcap = next_p; 8008 } 8009 8010 vswp->mcap = NULL; 8011 mutex_exit(&vswp->mca_lock); 8012 8013 D1(vswp, "%s: exit", __func__); 8014 } 8015 8016 8017 /* 8018 * Remove the specified address from the list of address maintained 8019 * in this port node. 8020 */ 8021 static void 8022 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 8023 { 8024 vsw_t *vswp = NULL; 8025 vsw_port_t *port = NULL; 8026 mcst_addr_t *prev_p = NULL; 8027 mcst_addr_t *curr_p = NULL; 8028 8029 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 8030 __func__, devtype, addr); 8031 8032 if (devtype == VSW_VNETPORT) { 8033 port = (vsw_port_t *)arg; 8034 mutex_enter(&port->mca_lock); 8035 prev_p = curr_p = port->mcap; 8036 } else { 8037 vswp = (vsw_t *)arg; 8038 mutex_enter(&vswp->mca_lock); 8039 prev_p = curr_p = vswp->mcap; 8040 } 8041 8042 while (curr_p != NULL) { 8043 if (curr_p->addr == addr) { 8044 D2(NULL, "%s: address found", __func__); 8045 /* match found */ 8046 if (prev_p == curr_p) { 8047 /* list head */ 8048 if (devtype == VSW_VNETPORT) 8049 port->mcap = curr_p->nextp; 8050 else 8051 vswp->mcap = curr_p->nextp; 8052 } else { 8053 prev_p->nextp = curr_p->nextp; 8054 } 8055 kmem_free(curr_p, sizeof (mcst_addr_t)); 8056 break; 8057 } else { 8058 prev_p = curr_p; 8059 curr_p = curr_p->nextp; 8060 } 8061 } 8062 8063 if (devtype == VSW_VNETPORT) 8064 mutex_exit(&port->mca_lock); 8065 else 8066 mutex_exit(&vswp->mca_lock); 8067 8068 D1(NULL, "%s: exit", __func__); 8069 } 8070 8071 /* 8072 * Creates a descriptor ring (dring) and links it into the 8073 * link of outbound drings for this channel. 8074 * 8075 * Returns NULL if creation failed. 8076 */ 8077 static dring_info_t * 8078 vsw_create_dring(vsw_ldc_t *ldcp) 8079 { 8080 vsw_private_desc_t *priv_addr = NULL; 8081 vsw_t *vswp = ldcp->ldc_vswp; 8082 ldc_mem_info_t minfo; 8083 dring_info_t *dp, *tp; 8084 int i; 8085 8086 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 8087 8088 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 8089 8090 /* create public section of ring */ 8091 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 8092 VSW_PUB_SIZE, &dp->handle)) != 0) { 8093 8094 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 8095 "failed", ldcp->ldc_id); 8096 goto create_fail_exit; 8097 } 8098 8099 ASSERT(dp->handle != NULL); 8100 8101 /* 8102 * Get the base address of the public section of the ring. 8103 */ 8104 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 8105 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 8106 ldcp->ldc_id); 8107 goto dring_fail_exit; 8108 } else { 8109 ASSERT(minfo.vaddr != 0); 8110 dp->pub_addr = minfo.vaddr; 8111 } 8112 8113 dp->num_descriptors = VSW_RING_NUM_EL; 8114 dp->descriptor_size = VSW_PUB_SIZE; 8115 dp->options = VIO_TX_DRING; 8116 dp->ncookies = 1; /* guaranteed by ldc */ 8117 8118 /* 8119 * create private portion of ring 8120 */ 8121 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 8122 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 8123 8124 if (vsw_setup_ring(ldcp, dp)) { 8125 DERR(vswp, "%s: unable to setup ring", __func__); 8126 goto dring_fail_exit; 8127 } 8128 8129 /* haven't used any descriptors yet */ 8130 dp->end_idx = 0; 8131 dp->last_ack_recv = -1; 8132 8133 /* bind dring to the channel */ 8134 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 8135 LDC_SHADOW_MAP, LDC_MEM_RW, 8136 &dp->cookie[0], &dp->ncookies)) != 0) { 8137 DERR(vswp, "vsw_create_dring: unable to bind to channel " 8138 "%lld", ldcp->ldc_id); 8139 goto dring_fail_exit; 8140 } 8141 8142 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 8143 dp->restart_reqd = B_TRUE; 8144 8145 /* 8146 * Only ever create rings for outgoing lane. Link it onto 8147 * end of list. 8148 */ 8149 WRITE_ENTER(&ldcp->lane_out.dlistrw); 8150 if (ldcp->lane_out.dringp == NULL) { 8151 D2(vswp, "vsw_create_dring: adding first outbound ring"); 8152 ldcp->lane_out.dringp = dp; 8153 } else { 8154 tp = ldcp->lane_out.dringp; 8155 while (tp->next != NULL) 8156 tp = tp->next; 8157 8158 tp->next = dp; 8159 } 8160 RW_EXIT(&ldcp->lane_out.dlistrw); 8161 8162 return (dp); 8163 8164 dring_fail_exit: 8165 (void) ldc_mem_dring_destroy(dp->handle); 8166 8167 create_fail_exit: 8168 if (dp->priv_addr != NULL) { 8169 priv_addr = dp->priv_addr; 8170 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8171 if (priv_addr->memhandle != NULL) 8172 (void) ldc_mem_free_handle( 8173 priv_addr->memhandle); 8174 priv_addr++; 8175 } 8176 kmem_free(dp->priv_addr, 8177 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 8178 } 8179 mutex_destroy(&dp->dlock); 8180 8181 kmem_free(dp, sizeof (dring_info_t)); 8182 return (NULL); 8183 } 8184 8185 /* 8186 * Create a ring consisting of just a private portion and link 8187 * it into the list of rings for the outbound lane. 8188 * 8189 * These type of rings are used primarily for temporary data 8190 * storage (i.e. as data buffers). 8191 */ 8192 void 8193 vsw_create_privring(vsw_ldc_t *ldcp) 8194 { 8195 dring_info_t *dp, *tp; 8196 vsw_t *vswp = ldcp->ldc_vswp; 8197 8198 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 8199 8200 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 8201 8202 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 8203 8204 /* no public section */ 8205 dp->pub_addr = NULL; 8206 8207 dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) * 8208 VSW_RING_NUM_EL), KM_SLEEP); 8209 8210 dp->num_descriptors = VSW_RING_NUM_EL; 8211 8212 if (vsw_setup_ring(ldcp, dp)) { 8213 DERR(vswp, "%s: setup of ring failed", __func__); 8214 kmem_free(dp->priv_addr, 8215 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 8216 mutex_destroy(&dp->dlock); 8217 kmem_free(dp, sizeof (dring_info_t)); 8218 return; 8219 } 8220 8221 /* haven't used any descriptors yet */ 8222 dp->end_idx = 0; 8223 8224 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 8225 dp->restart_reqd = B_TRUE; 8226 8227 /* 8228 * Only ever create rings for outgoing lane. Link it onto 8229 * end of list. 8230 */ 8231 WRITE_ENTER(&ldcp->lane_out.dlistrw); 8232 if (ldcp->lane_out.dringp == NULL) { 8233 D2(vswp, "%s: adding first outbound privring", __func__); 8234 ldcp->lane_out.dringp = dp; 8235 } else { 8236 tp = ldcp->lane_out.dringp; 8237 while (tp->next != NULL) 8238 tp = tp->next; 8239 8240 tp->next = dp; 8241 } 8242 RW_EXIT(&ldcp->lane_out.dlistrw); 8243 8244 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 8245 } 8246 8247 /* 8248 * Setup the descriptors in the dring. Returns 0 on success, 1 on 8249 * failure. 8250 */ 8251 int 8252 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 8253 { 8254 vnet_public_desc_t *pub_addr = NULL; 8255 vsw_private_desc_t *priv_addr = NULL; 8256 vsw_t *vswp = ldcp->ldc_vswp; 8257 uint64_t *tmpp; 8258 uint64_t offset = 0; 8259 uint32_t ncookies = 0; 8260 static char *name = "vsw_setup_ring"; 8261 int i, j, nc, rv; 8262 8263 priv_addr = dp->priv_addr; 8264 pub_addr = dp->pub_addr; 8265 8266 /* public section may be null but private should never be */ 8267 ASSERT(priv_addr != NULL); 8268 8269 /* 8270 * Allocate the region of memory which will be used to hold 8271 * the data the descriptors will refer to. 8272 */ 8273 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 8274 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 8275 8276 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 8277 dp->data_sz, dp->data_addr); 8278 8279 tmpp = (uint64_t *)dp->data_addr; 8280 offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp); 8281 8282 /* 8283 * Initialise some of the private and public (if they exist) 8284 * descriptor fields. 8285 */ 8286 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8287 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 8288 8289 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 8290 &priv_addr->memhandle)) != 0) { 8291 DERR(vswp, "%s: alloc mem handle failed", name); 8292 goto setup_ring_cleanup; 8293 } 8294 8295 priv_addr->datap = (void *)tmpp; 8296 8297 rv = ldc_mem_bind_handle(priv_addr->memhandle, 8298 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 8299 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 8300 &(priv_addr->memcookie[0]), &ncookies); 8301 if (rv != 0) { 8302 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 8303 "(rv %d)", name, ldcp->ldc_id, rv); 8304 goto setup_ring_cleanup; 8305 } 8306 priv_addr->bound = 1; 8307 8308 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 8309 name, i, priv_addr->memcookie[0].addr, 8310 priv_addr->memcookie[0].size); 8311 8312 if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) { 8313 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 8314 "invalid num of cookies (%d) for size 0x%llx", 8315 name, ldcp->ldc_id, ncookies, 8316 VSW_RING_EL_DATA_SZ); 8317 8318 goto setup_ring_cleanup; 8319 } else { 8320 for (j = 1; j < ncookies; j++) { 8321 rv = ldc_mem_nextcookie(priv_addr->memhandle, 8322 &(priv_addr->memcookie[j])); 8323 if (rv != 0) { 8324 DERR(vswp, "%s: ldc_mem_nextcookie " 8325 "failed rv (%d)", name, rv); 8326 goto setup_ring_cleanup; 8327 } 8328 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 8329 "size 0x%llx", name, j, 8330 priv_addr->memcookie[j].addr, 8331 priv_addr->memcookie[j].size); 8332 } 8333 8334 } 8335 priv_addr->ncookies = ncookies; 8336 priv_addr->dstate = VIO_DESC_FREE; 8337 8338 if (pub_addr != NULL) { 8339 8340 /* link pub and private sides */ 8341 priv_addr->descp = pub_addr; 8342 8343 pub_addr->ncookies = priv_addr->ncookies; 8344 8345 for (nc = 0; nc < pub_addr->ncookies; nc++) { 8346 bcopy(&priv_addr->memcookie[nc], 8347 &pub_addr->memcookie[nc], 8348 sizeof (ldc_mem_cookie_t)); 8349 } 8350 8351 pub_addr->hdr.dstate = VIO_DESC_FREE; 8352 pub_addr++; 8353 } 8354 8355 /* 8356 * move to next element in the dring and the next 8357 * position in the data buffer. 8358 */ 8359 priv_addr++; 8360 tmpp += offset; 8361 } 8362 8363 return (0); 8364 8365 setup_ring_cleanup: 8366 priv_addr = dp->priv_addr; 8367 8368 for (j = 0; j < i; j++) { 8369 (void) ldc_mem_unbind_handle(priv_addr->memhandle); 8370 (void) ldc_mem_free_handle(priv_addr->memhandle); 8371 8372 mutex_destroy(&priv_addr->dstate_lock); 8373 8374 priv_addr++; 8375 } 8376 kmem_free(dp->data_addr, dp->data_sz); 8377 8378 return (1); 8379 } 8380 8381 /* 8382 * Searches the private section of a ring for a free descriptor, 8383 * starting at the location of the last free descriptor found 8384 * previously. 8385 * 8386 * Returns 0 if free descriptor is available, and updates state 8387 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 8388 * 8389 * FUTURE: might need to return contiguous range of descriptors 8390 * as dring info msg assumes all will be contiguous. 8391 */ 8392 static int 8393 vsw_dring_find_free_desc(dring_info_t *dringp, 8394 vsw_private_desc_t **priv_p, int *idx) 8395 { 8396 vsw_private_desc_t *addr = NULL; 8397 int num = VSW_RING_NUM_EL; 8398 int ret = 1; 8399 8400 D1(NULL, "%s enter\n", __func__); 8401 8402 ASSERT(dringp->priv_addr != NULL); 8403 8404 D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", 8405 __func__, dringp, dringp->end_idx); 8406 8407 addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; 8408 8409 mutex_enter(&addr->dstate_lock); 8410 if (addr->dstate == VIO_DESC_FREE) { 8411 addr->dstate = VIO_DESC_READY; 8412 *priv_p = addr; 8413 *idx = dringp->end_idx; 8414 dringp->end_idx = (dringp->end_idx + 1) % num; 8415 ret = 0; 8416 8417 } 8418 mutex_exit(&addr->dstate_lock); 8419 8420 /* ring full */ 8421 if (ret == 1) { 8422 D2(NULL, "%s: no desp free: started at %d", __func__, 8423 dringp->end_idx); 8424 } 8425 8426 D1(NULL, "%s: exit\n", __func__); 8427 8428 return (ret); 8429 } 8430 8431 /* 8432 * Map from a dring identifier to the ring itself. Returns 8433 * pointer to ring or NULL if no match found. 8434 * 8435 * Should be called with dlistrw rwlock held as reader. 8436 */ 8437 static dring_info_t * 8438 vsw_ident2dring(lane_t *lane, uint64_t ident) 8439 { 8440 dring_info_t *dp = NULL; 8441 8442 if ((dp = lane->dringp) == NULL) { 8443 return (NULL); 8444 } else { 8445 if (dp->ident == ident) 8446 return (dp); 8447 8448 while (dp != NULL) { 8449 if (dp->ident == ident) 8450 break; 8451 dp = dp->next; 8452 } 8453 } 8454 8455 return (dp); 8456 } 8457 8458 /* 8459 * Set the default lane attributes. These are copied into 8460 * the attr msg we send to our peer. If they are not acceptable 8461 * then (currently) the handshake ends. 8462 */ 8463 static void 8464 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) 8465 { 8466 bzero(lp, sizeof (lane_t)); 8467 8468 READ_ENTER(&vswp->if_lockrw); 8469 ether_copy(&(vswp->if_addr), &(lp->addr)); 8470 RW_EXIT(&vswp->if_lockrw); 8471 8472 lp->mtu = VSW_MTU; 8473 lp->addr_type = ADDR_TYPE_MAC; 8474 lp->xfer_mode = VIO_DRING_MODE; 8475 lp->ack_freq = 0; /* for shared mode */ 8476 8477 mutex_enter(&lp->seq_lock); 8478 lp->seq_num = VNET_ISS; 8479 mutex_exit(&lp->seq_lock); 8480 } 8481 8482 /* 8483 * Verify that the attributes are acceptable. 8484 * 8485 * FUTURE: If some attributes are not acceptable, change them 8486 * our desired values. 8487 */ 8488 static int 8489 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) 8490 { 8491 int ret = 0; 8492 8493 D1(NULL, "vsw_check_attr enter\n"); 8494 8495 /* 8496 * Note we currently only support in-band descriptors 8497 * and descriptor rings, not packet based transfer (VIO_PKT_MODE) 8498 */ 8499 if ((pkt->xfer_mode != VIO_DESC_MODE) && 8500 (pkt->xfer_mode != VIO_DRING_MODE)) { 8501 D2(NULL, "vsw_check_attr: unknown mode %x\n", 8502 pkt->xfer_mode); 8503 ret = 1; 8504 } 8505 8506 /* Only support MAC addresses at moment. */ 8507 if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { 8508 D2(NULL, "vsw_check_attr: invalid addr_type %x, " 8509 "or address 0x%llx\n", pkt->addr_type, 8510 pkt->addr); 8511 ret = 1; 8512 } 8513 8514 /* 8515 * MAC address supplied by device should match that stored 8516 * in the vsw-port OBP node. Need to decide what to do if they 8517 * don't match, for the moment just warn but don't fail. 8518 */ 8519 if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) { 8520 DERR(NULL, "vsw_check_attr: device supplied address " 8521 "0x%llx doesn't match node address 0x%llx\n", 8522 pkt->addr, port->p_macaddr); 8523 } 8524 8525 /* 8526 * Ack freq only makes sense in pkt mode, in shared 8527 * mode the ring descriptors say whether or not to 8528 * send back an ACK. 8529 */ 8530 if ((pkt->xfer_mode == VIO_DRING_MODE) && 8531 (pkt->ack_freq > 0)) { 8532 D2(NULL, "vsw_check_attr: non zero ack freq " 8533 " in SHM mode\n"); 8534 ret = 1; 8535 } 8536 8537 /* 8538 * Note: for the moment we only support ETHER 8539 * frames. This may change in the future. 8540 */ 8541 if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) { 8542 D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n", 8543 pkt->mtu); 8544 ret = 1; 8545 } 8546 8547 D1(NULL, "vsw_check_attr exit\n"); 8548 8549 return (ret); 8550 } 8551 8552 /* 8553 * Returns 1 if there is a problem, 0 otherwise. 8554 */ 8555 static int 8556 vsw_check_dring_info(vio_dring_reg_msg_t *pkt) 8557 { 8558 _NOTE(ARGUNUSED(pkt)) 8559 8560 int ret = 0; 8561 8562 D1(NULL, "vsw_check_dring_info enter\n"); 8563 8564 if ((pkt->num_descriptors == 0) || 8565 (pkt->descriptor_size == 0) || 8566 (pkt->ncookies != 1)) { 8567 DERR(NULL, "vsw_check_dring_info: invalid dring msg"); 8568 ret = 1; 8569 } 8570 8571 D1(NULL, "vsw_check_dring_info exit\n"); 8572 8573 return (ret); 8574 } 8575 8576 /* 8577 * Returns 1 if two memory cookies match. Otherwise returns 0. 8578 */ 8579 static int 8580 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2) 8581 { 8582 if ((m1->addr != m2->addr) || 8583 (m2->size != m2->size)) { 8584 return (0); 8585 } else { 8586 return (1); 8587 } 8588 } 8589 8590 /* 8591 * Returns 1 if ring described in reg message matches that 8592 * described by dring_info structure. Otherwise returns 0. 8593 */ 8594 static int 8595 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) 8596 { 8597 if ((msg->descriptor_size != dp->descriptor_size) || 8598 (msg->num_descriptors != dp->num_descriptors) || 8599 (msg->ncookies != dp->ncookies) || 8600 !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) { 8601 return (0); 8602 } else { 8603 return (1); 8604 } 8605 8606 } 8607 8608 static caddr_t 8609 vsw_print_ethaddr(uint8_t *a, char *ebuf) 8610 { 8611 (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", 8612 a[0], a[1], a[2], a[3], a[4], a[5]); 8613 return (ebuf); 8614 } 8615 8616 /* 8617 * Reset and free all the resources associated with 8618 * the channel. 8619 */ 8620 static void 8621 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) 8622 { 8623 dring_info_t *dp, *dpp; 8624 lane_t *lp = NULL; 8625 int rv = 0; 8626 8627 ASSERT(ldcp != NULL); 8628 8629 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); 8630 8631 if (dir == INBOUND) { 8632 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" 8633 " of channel %lld", __func__, ldcp->ldc_id); 8634 lp = &ldcp->lane_in; 8635 } else { 8636 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" 8637 " of channel %lld", __func__, ldcp->ldc_id); 8638 lp = &ldcp->lane_out; 8639 } 8640 8641 lp->lstate = VSW_LANE_INACTIV; 8642 mutex_enter(&lp->seq_lock); 8643 lp->seq_num = VNET_ISS; 8644 mutex_exit(&lp->seq_lock); 8645 if (lp->dringp) { 8646 if (dir == INBOUND) { 8647 WRITE_ENTER(&lp->dlistrw); 8648 dp = lp->dringp; 8649 while (dp != NULL) { 8650 dpp = dp->next; 8651 if (dp->handle != NULL) 8652 (void) ldc_mem_dring_unmap(dp->handle); 8653 kmem_free(dp, sizeof (dring_info_t)); 8654 dp = dpp; 8655 } 8656 RW_EXIT(&lp->dlistrw); 8657 } else { 8658 /* 8659 * unbind, destroy exported dring, free dring struct 8660 */ 8661 WRITE_ENTER(&lp->dlistrw); 8662 dp = lp->dringp; 8663 rv = vsw_free_ring(dp); 8664 RW_EXIT(&lp->dlistrw); 8665 } 8666 if (rv == 0) { 8667 lp->dringp = NULL; 8668 } 8669 } 8670 8671 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); 8672 } 8673 8674 /* 8675 * Free ring and all associated resources. 8676 * 8677 * Should be called with dlistrw rwlock held as writer. 8678 */ 8679 static int 8680 vsw_free_ring(dring_info_t *dp) 8681 { 8682 vsw_private_desc_t *paddr = NULL; 8683 dring_info_t *dpp; 8684 int i, rv = 1; 8685 8686 while (dp != NULL) { 8687 mutex_enter(&dp->dlock); 8688 dpp = dp->next; 8689 if (dp->priv_addr != NULL) { 8690 /* 8691 * First unbind and free the memory handles 8692 * stored in each descriptor within the ring. 8693 */ 8694 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8695 paddr = (vsw_private_desc_t *) 8696 dp->priv_addr + i; 8697 if (paddr->memhandle != NULL) { 8698 if (paddr->bound == 1) { 8699 rv = ldc_mem_unbind_handle( 8700 paddr->memhandle); 8701 8702 if (rv != 0) { 8703 DERR(NULL, "error " 8704 "unbinding handle for " 8705 "ring 0x%llx at pos %d", 8706 dp, i); 8707 mutex_exit(&dp->dlock); 8708 return (rv); 8709 } 8710 paddr->bound = 0; 8711 } 8712 8713 rv = ldc_mem_free_handle( 8714 paddr->memhandle); 8715 if (rv != 0) { 8716 DERR(NULL, "error freeing " 8717 "handle for ring " 8718 "0x%llx at pos %d", 8719 dp, i); 8720 mutex_exit(&dp->dlock); 8721 return (rv); 8722 } 8723 paddr->memhandle = NULL; 8724 } 8725 mutex_destroy(&paddr->dstate_lock); 8726 } 8727 kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t) 8728 * VSW_RING_NUM_EL)); 8729 } 8730 8731 /* 8732 * Now unbind and destroy the ring itself. 8733 */ 8734 if (dp->handle != NULL) { 8735 (void) ldc_mem_dring_unbind(dp->handle); 8736 (void) ldc_mem_dring_destroy(dp->handle); 8737 } 8738 8739 if (dp->data_addr != NULL) { 8740 kmem_free(dp->data_addr, dp->data_sz); 8741 } 8742 8743 mutex_exit(&dp->dlock); 8744 mutex_destroy(&dp->dlock); 8745 mutex_destroy(&dp->restart_lock); 8746 kmem_free(dp, sizeof (dring_info_t)); 8747 8748 dp = dpp; 8749 } 8750 return (0); 8751 } 8752 8753 /* 8754 * Debugging routines 8755 */ 8756 static void 8757 display_state(void) 8758 { 8759 vsw_t *vswp; 8760 vsw_port_list_t *plist; 8761 vsw_port_t *port; 8762 vsw_ldc_list_t *ldcl; 8763 vsw_ldc_t *ldcp; 8764 8765 cmn_err(CE_NOTE, "***** system state *****"); 8766 8767 for (vswp = vsw_head; vswp; vswp = vswp->next) { 8768 plist = &vswp->plist; 8769 READ_ENTER(&plist->lockrw); 8770 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", 8771 vswp->instance, plist->num_ports); 8772 8773 for (port = plist->head; port != NULL; port = port->p_next) { 8774 ldcl = &port->p_ldclist; 8775 cmn_err(CE_CONT, "port %d : %d ldcs attached\n", 8776 port->p_instance, ldcl->num_ldcs); 8777 READ_ENTER(&ldcl->lockrw); 8778 ldcp = ldcl->head; 8779 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 8780 cmn_err(CE_CONT, "chan %lu : dev %d : " 8781 "status %d : phase %u\n", 8782 ldcp->ldc_id, ldcp->dev_class, 8783 ldcp->ldc_status, ldcp->hphase); 8784 cmn_err(CE_CONT, "chan %lu : lsession %lu : " 8785 "psession %lu\n", 8786 ldcp->ldc_id, 8787 ldcp->local_session, 8788 ldcp->peer_session); 8789 8790 cmn_err(CE_CONT, "Inbound lane:\n"); 8791 display_lane(&ldcp->lane_in); 8792 cmn_err(CE_CONT, "Outbound lane:\n"); 8793 display_lane(&ldcp->lane_out); 8794 } 8795 RW_EXIT(&ldcl->lockrw); 8796 } 8797 RW_EXIT(&plist->lockrw); 8798 } 8799 cmn_err(CE_NOTE, "***** system state *****"); 8800 } 8801 8802 static void 8803 display_lane(lane_t *lp) 8804 { 8805 dring_info_t *drp; 8806 8807 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", 8808 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); 8809 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", 8810 lp->addr_type, lp->addr, lp->xfer_mode); 8811 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); 8812 8813 cmn_err(CE_CONT, "Dring info:\n"); 8814 for (drp = lp->dringp; drp != NULL; drp = drp->next) { 8815 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", 8816 drp->num_descriptors, drp->descriptor_size); 8817 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); 8818 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", 8819 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); 8820 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", 8821 drp->ident, drp->end_idx); 8822 display_ring(drp); 8823 } 8824 } 8825 8826 static void 8827 display_ring(dring_info_t *dringp) 8828 { 8829 uint64_t i; 8830 uint64_t priv_count = 0; 8831 uint64_t pub_count = 0; 8832 vnet_public_desc_t *pub_addr = NULL; 8833 vsw_private_desc_t *priv_addr = NULL; 8834 8835 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8836 if (dringp->pub_addr != NULL) { 8837 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; 8838 8839 if (pub_addr->hdr.dstate == VIO_DESC_FREE) 8840 pub_count++; 8841 } 8842 8843 if (dringp->priv_addr != NULL) { 8844 priv_addr = 8845 (vsw_private_desc_t *)dringp->priv_addr + i; 8846 8847 if (priv_addr->dstate == VIO_DESC_FREE) 8848 priv_count++; 8849 } 8850 } 8851 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", 8852 i, priv_count, pub_count); 8853 } 8854 8855 static void 8856 dump_flags(uint64_t state) 8857 { 8858 int i; 8859 8860 typedef struct flag_name { 8861 int flag_val; 8862 char *flag_name; 8863 } flag_name_t; 8864 8865 flag_name_t flags[] = { 8866 VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT", 8867 VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV", 8868 VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV", 8869 VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT", 8870 VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV", 8871 VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT", 8872 VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT", 8873 VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV", 8874 VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT", 8875 VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV", 8876 VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT", 8877 VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV", 8878 VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT", 8879 VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV", 8880 VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT", 8881 VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV", 8882 VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT", 8883 VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV", 8884 VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT", 8885 VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV", 8886 VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT", 8887 VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV", 8888 VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT", 8889 VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV", 8890 VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT", 8891 VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV", 8892 VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT", 8893 VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV", 8894 VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT", 8895 VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV", 8896 VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"}; 8897 8898 DERR(NULL, "DUMP_FLAGS: %llx\n", state); 8899 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) { 8900 if (state & flags[i].flag_val) 8901 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name); 8902 } 8903 } 8904