/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>

/*
 * Function prototypes.
 */
static int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
static int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *);
static int vsw_get_physaddr(vsw_t *);
static int vsw_setup_switching(vsw_t *);
static int vsw_setup_layer2(vsw_t *);
static int vsw_setup_layer3(vsw_t *);

/* MAC Ring table functions. */
static void vsw_mac_ring_tbl_init(vsw_t *vswp);
static void vsw_mac_ring_tbl_destroy(vsw_t *vswp);
static void vsw_queue_worker(vsw_mac_ring_t *rrp);
static void vsw_queue_stop(vsw_queue_t *vqp);
static vsw_queue_t *vsw_queue_create(void);
static void vsw_queue_destroy(vsw_queue_t *vqp);

/* MAC layer routines */
static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg,
	mac_resource_t *mrp);
static int vsw_get_hw_maddr(vsw_t *);
static int vsw_set_hw(vsw_t *, vsw_port_t *, int);
static int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *);
static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int);
static int vsw_unset_hw(vsw_t *, vsw_port_t *, int);
static int vsw_unset_hw_addr(vsw_t *, int);
static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int);
static void vsw_reconfig_hw(vsw_t *);
static int vsw_prog_if(vsw_t *);
static int vsw_prog_ports(vsw_t *);
static int vsw_mac_attach(vsw_t *vswp);
static void vsw_mac_detach(vsw_t *vswp);

static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *);
static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
static int vsw_mac_register(vsw_t *);
static int vsw_mac_unregister(vsw_t *);
static int vsw_m_stat(void *, uint_t, uint64_t *);
static void vsw_m_stop(void *arg);
static int vsw_m_start(void *arg);
static int vsw_m_unicst(void *arg, const uint8_t *);
static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
static int vsw_m_promisc(void *arg, boolean_t);
static mblk_t *vsw_m_tx(void *arg, mblk_t *);

/* MDEG routines */
static int vsw_mdeg_register(vsw_t *vswp);
static void vsw_mdeg_unregister(vsw_t *vswp);
static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
static void vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);

/* Port add/deletion routines */
static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
static int vsw_port_attach(vsw_t *vswp, int p_instance,
	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
static int vsw_detach_ports(vsw_t *vswp);
static int vsw_port_detach(vsw_t *vswp, int p_instance);
static int vsw_port_delete(vsw_port_t *port);
static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
static int vsw_init_ldcs(vsw_port_t *port);
static int vsw_uninit_ldcs(vsw_port_t *port);
static int vsw_ldc_init(vsw_ldc_t *ldcp);
static int vsw_ldc_uninit(vsw_ldc_t *ldcp);
static int vsw_drain_ldcs(vsw_port_t *port);
static int vsw_drain_port_taskq(vsw_port_t *port);
static void vsw_marker_task(void *);
static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
static int vsw_plist_del_node(vsw_t *, vsw_port_t *port);

/* Interrupt routines */
static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static void vsw_ldc_reinit(vsw_ldc_t *);
static void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
static void vsw_conn_task(void *);
static int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static void vsw_next_milestone(vsw_ldc_t *);
static int vsw_supported_version(vio_ver_msg_t *);

/* Data processing routines */
static void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);

/* Switching/data transmit routines */
static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port, mac_resource_handle_t);
static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port, mac_resource_handle_t);
static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port);
static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
	vsw_port_t *port);
static int vsw_portsend(vsw_port_t *, mblk_t *);
static int vsw_dringsend(vsw_ldc_t *, mblk_t *);
static int vsw_descrsend(vsw_ldc_t *, mblk_t *);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);

static int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);

/* Forwarding database (FDB) routines */
static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
static void vsw_del_addr(uint8_t, void *, uint64_t);
static void vsw_del_mcst_port(vsw_port_t *);
static void vsw_del_mcst_vsw(vsw_t *);

/* Dring routines */
static dring_info_t *vsw_create_dring(vsw_ldc_t *);
static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
	int *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);

static void vsw_set_lane_attr(vsw_t *, lane_t *);
static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
static int vsw_check_dring_info(vio_dring_reg_msg_t *);

/* Misc support routines */
static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

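/*
 * Tunables. As with any global kernel variable, these can be
 * overridden at boot time via /etc/system, for example (values
 * below are illustrative only):
 *
 *	set vsw:vsw_wretries = 200
 *	set vsw:vsw_chain_len = 300
 */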
int vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
int vsw_wretries = 100;		/* # of write attempts */
int vsw_chain_len = 150;	/* max # of mblks in msg chain */
int vsw_desc_delay = 0;		/* delay in us */
int vsw_read_attempts = 5;	/* # of reads of descriptor */

uint32_t vsw_mblk_size = VSW_MBLK_SIZE;
uint32_t vsw_num_mblks = VSW_NUM_MBLKS;

static mac_callbacks_t vsw_m_callbacks = {
	0,
	vsw_m_stat,
	vsw_m_start,
	vsw_m_stop,
	vsw_m_promisc,
	vsw_m_multicst,
	vsw_m_unicst,
	vsw_m_tx,
	NULL,
	NULL,
	NULL
};

static struct cb_ops vsw_cb_ops = {
	nulldev,		/* cb_open */
	nulldev,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	nodev,			/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};

static struct dev_ops vsw_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	vsw_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vsw_attach,		/* devo_attach */
	vsw_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&vsw_cb_ops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	ddi_power		/* devo_power */
};

extern struct mod_ops mod_driverops;
static struct modldrv vswmodldrv = {
	&mod_driverops,
	"sun4v Virtual Switch",
	&vsw_ops,
};

#define	LDC_ENTER_LOCK(ldcp)	\
				mutex_enter(&((ldcp)->ldc_cblock));\
				mutex_enter(&((ldcp)->ldc_txlock));
#define	LDC_EXIT_LOCK(ldcp)	\
				mutex_exit(&((ldcp)->ldc_txlock));\
				mutex_exit(&((ldcp)->ldc_cblock));

/* Driver soft state ptr */
static void *vsw_state;

/*
 * Linked list of "vsw_t" structures - one per instance.
 */
vsw_t *vsw_head = NULL;
krwlock_t vsw_rw;

/*
 * Property names
 */
static char vdev_propname[] = "virtual-device";
static char vsw_propname[] = "virtual-network-switch";
static char physdev_propname[] = "vsw-phys-dev";
static char smode_propname[] = "vsw-switch-mode";
static char macaddr_propname[] = "local-mac-address";
static char remaddr_propname[] = "remote-mac-address";
static char ldcids_propname[] = "ldc-ids";
static char chan_propname[] = "channel-endpoint";
static char id_propname[] = "id";
static char reg_propname[] = "reg";

/* supported versions */
static ver_sup_t vsw_versions[] = { {1, 0} };

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t vport_prop_match[] = {
	{ MDET_PROP_VAL,	"id" },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vport_match = { "virtual-device-port",
					vport_prop_match };
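/*
 * When a 'virtual-device-port' node satisfying the above criteria
 * appears in, disappears from, or is modified in the MD, the MDEG
 * invokes the callback registered against this match (see
 * vsw_port_mdeg_cb()) with the lists of affected nodes.
 */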
/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
 * by their 'name' and 'cfg-handle' properties.
 */
static md_prop_match_t vdev_prop_match[] = {
	{ MDET_PROP_STR,	"name" },
	{ MDET_PROP_VAL,	"cfg-handle" },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vdev_match = { "virtual-device",
					vdev_prop_match };


/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vsw instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t vsw_prop_template[] = {
	{ MDET_PROP_STR,	"name",		vsw_propname },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);

/*
 * From /etc/system, enable/disable the thread-per-ring mode. This
 * mode selection is done at vsw driver attach time.
 */
boolean_t vsw_multi_ring_enable = B_FALSE;
int vsw_mac_rx_rings = VSW_MAC_RX_RINGS;

/*
 * Print debug messages - set to 0x1f to enable all msgs
 * or 0x0 to turn all off.
 */
int vswdbg = 0x0;

/*
 * debug levels:
 * 0x01:	Function entry/exit tracing
 * 0x02:	Internal function messages
 * 0x04:	Verbose internal messages
 * 0x08:	Warning messages
 * 0x10:	Error messages
 */

static void
vswdebug(vsw_t *vswp, const char *fmt, ...)
{
	char buf[512];
	va_list ap;

	va_start(ap, fmt);
	/* bound the formatted output to the size of buf */
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	if (vswp == NULL)
		cmn_err(CE_CONT, "%s\n", buf);
	else
		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
}
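/*
 * For example, to capture all debug output from a DEBUG build of
 * this driver, the mask can be set in /etc/system before boot:
 *
 *	set vsw:vswdbg = 0x1f
 */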
/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define	DUMP_STATE	0

#if DUMP_STATE

#define	DUMP_TAG(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \
}

#define	DUMP_TAG_PTR(tag) \
{			\
	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \
}

#define	DUMP_FLAGS(flags) dump_flags(flags);
#define	DISPLAY_STATE()	display_state()

#else

#define	DUMP_TAG(tag)
#define	DUMP_TAG_PTR(tag)
#define	DUMP_FLAGS(state)
#define	DISPLAY_STATE()

#endif	/* DUMP_STATE */

#ifdef DEBUG

#define	D1		\
if (vswdbg & 0x01)	\
	vswdebug

#define	D2		\
if (vswdbg & 0x02)	\
	vswdebug

#define	D3		\
if (vswdbg & 0x04)	\
	vswdebug

#define	DWARN		\
if (vswdbg & 0x08)	\
	vswdebug

#define	DERR		\
if (vswdbg & 0x10)	\
	vswdebug

#else

#define	DERR		if (0)	vswdebug
#define	DWARN		if (0)	vswdebug
#define	D1		if (0)	vswdebug
#define	D2		if (0)	vswdebug
#define	D3		if (0)	vswdebug

#endif	/* DEBUG */

static struct modlinkage modlinkage = {
	MODREV_1,
	&vswmodldrv,
	NULL
};

int
_init(void)
{
	int status;

	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);

	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
	if (status != 0) {
		return (status);
	}

	mac_init_ops(&vsw_ops, "vsw");
	status = mod_install(&modlinkage);
	if (status != 0) {
		ddi_soft_state_fini(&vsw_state);
	}
	return (status);
}

int
_fini(void)
{
	int status;

	status = mod_remove(&modlinkage);
	if (status != 0)
		return (status);
	mac_fini_ops(&vsw_ops);
	ddi_soft_state_fini(&vsw_state);

	rw_destroy(&vsw_rw);

	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

static int
vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	vsw_t		*vswp;
	int		instance;
	char		hashname[MAXNAMELEN];
	char		qname[TASKQ_NAMELEN];
	enum		{ PROG_init = 0x00,
				PROG_if_lock = 0x01,
				PROG_fdb = 0x02,
				PROG_mfdb = 0x04,
				PROG_report_dev = 0x08,
				PROG_plist = 0x10,
				PROG_taskq = 0x20}
			progress;

	progress = PROG_init;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	case DDI_PM_RESUME:
	default:
		return (DDI_FAILURE);
	}

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
		return (DDI_FAILURE);
	}
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
		goto vsw_attach_fail;
	}

	vswp->dip = dip;
	vswp->instance = instance;
	ddi_set_driver_private(dip, (caddr_t)vswp);

	mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
	progress |= PROG_if_lock;
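	/*
	 * Each PROG_* bit set in 'progress' records a completed stage
	 * of attach, so that the vsw_attach_fail path below unwinds
	 * only those stages which actually completed.
	 */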
"vsw_unicst_table-%d", 580 vswp->instance); 581 D2(vswp, "creating unicast hash table (%s)...", hashname); 582 vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, 583 mod_hash_null_valdtor, sizeof (void *)); 584 585 progress |= PROG_fdb; 586 587 /* setup the multicast fowarding database */ 588 (void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d", 589 vswp->instance); 590 D2(vswp, "creating multicast hash table %s)...", hashname); 591 rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL); 592 vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, 593 mod_hash_null_valdtor, sizeof (void *)); 594 595 progress |= PROG_mfdb; 596 597 /* 598 * create lock protecting list of multicast addresses 599 * which could come via m_multicst() entry point when plumbed. 600 */ 601 mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL); 602 vswp->mcap = NULL; 603 604 ddi_report_dev(vswp->dip); 605 606 progress |= PROG_report_dev; 607 608 WRITE_ENTER(&vsw_rw); 609 vswp->next = vsw_head; 610 vsw_head = vswp; 611 RW_EXIT(&vsw_rw); 612 613 /* setup the port list */ 614 rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL); 615 vswp->plist.head = NULL; 616 617 progress |= PROG_plist; 618 619 /* 620 * Create the taskq which will process all the VIO 621 * control messages. 622 */ 623 (void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance); 624 if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1, 625 TASKQ_DEFAULTPRI, 0)) == NULL) { 626 cmn_err(CE_WARN, "!vsw%d: Unable to create task queue", 627 vswp->instance); 628 goto vsw_attach_fail; 629 } 630 631 progress |= PROG_taskq; 632 633 /* prevent auto-detaching */ 634 if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip, 635 DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) { 636 cmn_err(CE_NOTE, "!Unable to set \"%s\" property for " 637 "instance %u", DDI_NO_AUTODETACH, instance); 638 } 639 640 /* 641 * Now we have everything setup, register an interest in 642 * specific MD nodes. 643 * 644 * The callback is invoked in 2 cases, firstly if upon mdeg 645 * registration there are existing nodes which match our specified 646 * criteria, and secondly if the MD is changed (and again, there 647 * are nodes which we are interested in present within it. Note 648 * that our callback will be invoked even if our specified nodes 649 * have not actually changed). 650 * 651 * Until the callback is invoked we cannot switch any pkts as 652 * we don't know basic information such as what mode we are 653 * operating in. However we expect the callback to be invoked 654 * immediately upon registration as this driver should only 655 * be attaching if there are vsw nodes in the MD. 
	if (vsw_mdeg_register(vswp))
		goto vsw_attach_fail;

	return (DDI_SUCCESS);

vsw_attach_fail:
	DERR(NULL, "vsw_attach: failed");

	if (progress & PROG_taskq)
		ddi_taskq_destroy(vswp->taskq_p);

	if (progress & PROG_plist)
		rw_destroy(&vswp->plist.lockrw);

	if (progress & PROG_report_dev) {
		ddi_remove_minor_node(dip, NULL);
		mutex_destroy(&vswp->mca_lock);
	}

	if (progress & PROG_mfdb) {
		mod_hash_destroy_hash(vswp->mfdb);
		vswp->mfdb = NULL;
		rw_destroy(&vswp->mfdbrw);
	}

	if (progress & PROG_fdb) {
		mod_hash_destroy_hash(vswp->fdb);
		vswp->fdb = NULL;
	}

	if (progress & PROG_if_lock) {
		rw_destroy(&vswp->if_lockrw);
		mutex_destroy(&vswp->mac_lock);
		mutex_destroy(&vswp->hw_lock);
	}

	ddi_soft_state_free(vsw_state, instance);
	return (DDI_FAILURE);
}

static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vio_mblk_pool_t		*poolp, *npoolp;
	vsw_t			**vswpp, *vswp;
	int			instance;

	instance = ddi_get_instance(dip);
	vswp = ddi_get_soft_state(vsw_state, instance);

	if (vswp == NULL) {
		return (DDI_FAILURE);
	}

	switch (cmd) {
	case DDI_DETACH:
		break;
	case DDI_SUSPEND:
	case DDI_PM_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	D2(vswp, "detaching instance %d", instance);

	if (vswp->if_state & VSW_IF_REG) {
		if (vsw_mac_unregister(vswp) != 0) {
			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
			    "MAC layer", vswp->instance);
			return (DDI_FAILURE);
		}
	}

	vsw_mdeg_unregister(vswp);

	/* remove mac layer callback */
	mutex_enter(&vswp->mac_lock);
	if ((vswp->mh != NULL) && (vswp->mrh != NULL)) {
		mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
		vswp->mrh = NULL;
	}
	mutex_exit(&vswp->mac_lock);

	if (vsw_detach_ports(vswp) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to detach ports",
		    vswp->instance);
		return (DDI_FAILURE);
	}

	rw_destroy(&vswp->if_lockrw);

	mutex_destroy(&vswp->hw_lock);

	/*
	 * Now that the ports have been deleted, stop and close
	 * the physical device.
	 */
	mutex_enter(&vswp->mac_lock);
	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);

		vswp->mh = NULL;
		vswp->txinfo = NULL;
	}
	mutex_exit(&vswp->mac_lock);
	mutex_destroy(&vswp->mac_lock);

	/*
	 * Destroy any free pools that may still exist.
	 */
	poolp = vswp->rxh;
	while (poolp != NULL) {
		npoolp = vswp->rxh = poolp->nextp;
		if (vio_destroy_mblks(poolp) != 0) {
			vswp->rxh = poolp;
			return (DDI_FAILURE);
		}
		poolp = npoolp;
	}

	/*
	 * Remove this instance from any entries it may be on in
	 * the hash table, using the list of addresses maintained
	 * in the vsw_t structure.
	 */
	vsw_del_mcst_vsw(vswp);

	vswp->mcap = NULL;
	mutex_destroy(&vswp->mca_lock);

	/*
	 * By now any pending tasks have finished and the underlying
	 * LDCs have been destroyed, so it's safe to delete the control
	 * message taskq.
	 */
	if (vswp->taskq_p != NULL)
		ddi_taskq_destroy(vswp->taskq_p);

	/*
	 * At this stage all the data pointers in the hash table
	 * should be NULL, as all the ports have been removed and will
	 * have deleted themselves from the port lists which the data
	 * pointers point to. Hence we can destroy the table using the
	 * default destructors.
	 */
	D2(vswp, "vsw_detach: destroying hash tables..");
	mod_hash_destroy_hash(vswp->fdb);
	vswp->fdb = NULL;

	WRITE_ENTER(&vswp->mfdbrw);
	mod_hash_destroy_hash(vswp->mfdb);
	vswp->mfdb = NULL;
	RW_EXIT(&vswp->mfdbrw);
	rw_destroy(&vswp->mfdbrw);

	ddi_remove_minor_node(dip, NULL);

	rw_destroy(&vswp->plist.lockrw);
	WRITE_ENTER(&vsw_rw);
	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
		if (*vswpp == vswp) {
			*vswpp = vswp->next;
			break;
		}
	}
	RW_EXIT(&vsw_rw);
	ddi_soft_state_free(vsw_state, instance);

	return (DDI_SUCCESS);
}

static int
vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	_NOTE(ARGUNUSED(dip))

	vsw_t	*vswp = NULL;
	dev_t	dev = (dev_t)arg;
	int	instance;

	instance = getminor(dev);

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
			*result = NULL;
			return (DDI_FAILURE);
		}
		*result = vswp->dip;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);

	default:
		*result = NULL;
		return (DDI_FAILURE);
	}
}

/*
 * Get the value of the "vsw-phys-dev" property in the specified
 * node. This property is the name of the physical device that
 * the virtual switch will use to talk to the outside world.
 *
 * Note it is valid for this property to be NULL (but the property
 * itself must exist). Callers of this routine should verify that
 * the value returned is what they expected (i.e. either NULL or non-NULL).
 *
 * On success returns the value of the property in the region pointed to
 * by the 'name' argument, with a return value of 0. Otherwise returns 1.
 */
static int
vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
{
	int	len = 0;
	char	*physname = NULL;
	char	*dev;

	if (md_get_prop_data(mdp, node, physdev_propname,
	    (uint8_t **)(&physname), &len) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
		    "device(s) from MD", vswp->instance);
		return (1);
	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
		    vswp->instance, physname);
		return (1);
	} else {
		(void) strncpy(name, physname, strlen(physname) + 1);
		D2(vswp, "%s: using first device specified (%s)",
		    __func__, physname);
	}

#ifdef DEBUG
	/*
	 * As a temporary measure to aid testing, we check to see if there
	 * is a vsw.conf file present. If there is, we use the value of the
	 * vsw_physname property in the file as the name of the physical
	 * device, overriding the value from the MD.
	 *
	 * There may be multiple devices listed, but for the moment
	 * we just use the first one.
	 */
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
		if ((strlen(dev) + 1) > LIFNAMSIZ) {
			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
			    vswp->instance, dev);
			ddi_prop_free(dev);
			return (1);
		} else {
			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
			    "config file", vswp->instance, dev);

			(void) strncpy(name, dev, strlen(dev) + 1);
		}

		ddi_prop_free(dev);
	}
#endif

	return (0);
}

/*
 * Read the 'vsw-switch-mode' property from the specified MD node.
 *
 * Returns 0 on success and the number of modes found in 'found',
 * otherwise returns 1.
 */
static int
vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
	uint8_t *modes, int *found)
{
	int		len = 0;
	int		smode_num = 0;
	char		*smode = NULL;
	char		*curr_mode = NULL;

	D1(vswp, "%s: enter", __func__);

	/*
	 * Get the switch-mode property. The modes are listed in
	 * decreasing order of preference, i.e. the preferred mode is
	 * the first item in the list.
	 */
	len = 0;
	smode_num = 0;
	if (md_get_prop_data(mdp, node, smode_propname,
	    (uint8_t **)(&smode), &len) != 0) {
		/*
		 * Unable to get switch-mode property from MD, nothing
		 * more we can do.
		 */
		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
		    " from the MD", vswp->instance);
		*found = 0;
		return (1);
	}

	curr_mode = smode;
	/*
	 * Modes of operation:
	 * 'switched'	 - layer 2 switching, underlying HW in
	 *			programmed mode.
	 * 'promiscuous' - layer 2 switching, underlying HW in
	 *			promiscuous mode.
	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
	 *			in non-promiscuous mode.
	 */
	while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) {
		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
		if (strcmp(curr_mode, "switched") == 0) {
			modes[smode_num++] = VSW_LAYER2;
		} else if (strcmp(curr_mode, "promiscuous") == 0) {
			modes[smode_num++] = VSW_LAYER2_PROMISC;
		} else if (strcmp(curr_mode, "routed") == 0) {
			modes[smode_num++] = VSW_LAYER3;
		} else {
			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
			    "setting to default switched mode",
			    vswp->instance, curr_mode);
			modes[smode_num++] = VSW_LAYER2;
		}
		curr_mode += strlen(curr_mode) + 1;
	}
	*found = smode_num;

	D2(vswp, "%s: %d modes found", __func__, smode_num);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Get the mac address of the physical device.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_get_physaddr(vsw_t *vswp)
{
	mac_handle_t	mh;
	char		drv[LIFNAMSIZ];
	uint_t		ddi_instance;

	D1(vswp, "%s: enter", __func__);

	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS)
		return (1);

	if (mac_open(vswp->physname, ddi_instance, &mh) != 0) {
		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
		    vswp->instance, vswp->physname);
		return (1);
	}

	READ_ENTER(&vswp->if_lockrw);
	mac_unicst_get(mh, vswp->if_addr.ether_addr_octet);
	RW_EXIT(&vswp->if_lockrw);

	mac_close(mh);

	vswp->mdprops |= VSW_DEV_MACADDR;

	D1(vswp, "%s: exit", __func__);

	return (0);
}
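/*
 * The multiple-unicast-address capability fetched below is cached in
 * vswp->maddr; its maddr_add()/maddr_remove() entry points are what
 * vsw_set_hw_addr() and vsw_unset_hw_addr() later use to program and
 * clear unicast address slots in the physical device.
 */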
/*
 * Check to see if the card supports the setting of multiple unicast
 * addresses.
 *
 * Returns 0 if the card supports the programming of multiple unicast
 * addresses, otherwise returns 1.
 */
static int
vsw_get_hw_maddr(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) {
		cmn_err(CE_WARN, "!vsw%d: device (%s) does not support "
		    "setting multiple unicast addresses", vswp->instance,
		    vswp->physname);
		mutex_exit(&vswp->mac_lock);
		return (1);
	}
	mutex_exit(&vswp->mac_lock);

	D2(vswp, "%s: %d addrs : %d free", __func__,
	    vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Set up the required switching mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_switching(vsw_t *vswp)
{
	int	i, rv = 1;

	D1(vswp, "%s: enter", __func__);

	/* select best switching mode */
	for (i = 0; i < vswp->smode_num; i++) {
		vswp->smode_idx = i;
		switch (vswp->smode[i]) {
		case VSW_LAYER2:
		case VSW_LAYER2_PROMISC:
			rv = vsw_setup_layer2(vswp);
			break;

		case VSW_LAYER3:
			rv = vsw_setup_layer3(vswp);
			break;

		default:
			DERR(vswp, "unknown switch mode");
			rv = 1;
			break;
		}

		if (rv == 0)
			break;
	}

	if (rv == 1) {
		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
		    "switching mode", vswp->instance);
		return (rv);
	}

	D2(vswp, "%s: Operating in mode %d", __func__,
	    vswp->smode[vswp->smode_idx]);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Set up for layer 2 switching.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	vswp->vsw_switch_frame = vsw_switch_l2_frame;

	/*
	 * Attempt to link into the MAC layer so we can get
	 * and send packets out over the physical adapter.
	 */
	if (vswp->mdprops & VSW_MD_PHYSNAME) {
		if (vsw_mac_attach(vswp) != 0) {
			/*
			 * Registration with the MAC layer has failed,
			 * so return 1 so that we can fall back to the
			 * next preferred switching method.
			 */
			cmn_err(CE_WARN, "!vsw%d: Unable to join as MAC layer "
			    "client", vswp->instance);
			return (1);
		}

		if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
			/*
			 * Verify that the underlying device can support
			 * multiple unicast mac addresses.
			 */
			if (vsw_get_hw_maddr(vswp) != 0) {
				cmn_err(CE_WARN, "!vsw%d: Unable to setup "
				    "layer2 switching", vswp->instance);
				vsw_mac_detach(vswp);
				return (1);
			}
		}

	} else {
		/*
		 * No physical device name found in the MD, which is
		 * required for layer 2.
		 */
		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
		    vswp->instance);
		return (1);
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}

static int
vsw_setup_layer3(vsw_t *vswp)
{
	D1(vswp, "%s: enter", __func__);

	D2(vswp, "%s: operating in layer 3 mode", __func__);
	vswp->vsw_switch_frame = vsw_switch_l3_frame;

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Link into the MAC layer to gain access to the services provided by
 * the underlying physical device driver (which should also have
 * registered with the MAC layer).
 *
 * Only called when in layer 2 mode.
 */
static int
vsw_mac_attach(vsw_t *vswp)
{
	char	drv[LIFNAMSIZ];
	uint_t	ddi_instance;

	D1(vswp, "%s: enter", __func__);

	ASSERT(vswp->mh == NULL);
	ASSERT(vswp->mrh == NULL);
	ASSERT(vswp->mstarted == B_FALSE);
	ASSERT(vswp->mresources == B_FALSE);

	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);

	mutex_enter(&vswp->mac_lock);
	if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) {
		cmn_err(CE_WARN, "!vsw%d: invalid device name: %s",
		    vswp->instance, vswp->physname);
		goto mac_fail_exit;
	}

	if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) {
		cmn_err(CE_WARN, "!vsw%d: mac_open %s failed",
		    vswp->instance, vswp->physname);
		goto mac_fail_exit;
	}

	ASSERT(vswp->mh != NULL);

	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);

	if (vsw_multi_ring_enable) {
		/*
		 * Initialize the ring table.
		 */
		vsw_mac_ring_tbl_init(vswp);

		/*
		 * Register our rx callback function.
		 */
		vswp->mrh = mac_rx_add(vswp->mh,
		    vsw_rx_queue_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);

		/*
		 * Register our mac resource callback.
		 */
		mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp);
		vswp->mresources = B_TRUE;
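		/*
		 * Requesting the resources below prompts the underlying
		 * driver to re-announce its receive fifos, so that
		 * vsw_mac_ring_add_cb() runs once per available RX ring.
		 */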
		/*
		 * Get the ring resources available to us from
		 * the mac below us.
		 */
		mac_resources(vswp->mh);
	} else {
		/*
		 * Just register our rx callback function.
		 */
		vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
		ASSERT(vswp->mrh != NULL);
	}

	/* Get the MAC tx fn */
	vswp->txinfo = mac_tx_get(vswp->mh);

	/* start the interface */
	if (mac_start(vswp->mh) != 0) {
		cmn_err(CE_WARN, "!vsw%d: Could not start mac interface",
		    vswp->instance);
		goto mac_fail_exit;
	}

	mutex_exit(&vswp->mac_lock);

	vswp->mstarted = B_TRUE;

	D1(vswp, "%s: exit", __func__);
	return (0);

mac_fail_exit:
	mutex_exit(&vswp->mac_lock);
	vsw_mac_detach(vswp);

	D1(vswp, "%s: exit", __func__);
	return (1);
}

static void
vsw_mac_detach(vsw_t *vswp)
{
	D1(vswp, "vsw_mac_detach: enter");

	ASSERT(vswp != NULL);

	if (vsw_multi_ring_enable) {
		vsw_mac_ring_tbl_destroy(vswp);
	}

	mutex_enter(&vswp->mac_lock);

	if (vswp->mh != NULL) {
		if (vswp->mstarted)
			mac_stop(vswp->mh);
		if (vswp->mrh != NULL)
			mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE);
		if (vswp->mresources)
			mac_resource_set(vswp->mh, NULL, NULL);
		mac_close(vswp->mh);
	}

	vswp->mrh = NULL;
	vswp->mh = NULL;
	vswp->txinfo = NULL;
	vswp->mstarted = B_FALSE;

	mutex_exit(&vswp->mac_lock);

	D1(vswp, "vsw_mac_detach: exit");
}

/*
 * Depending on the mode specified and on the capabilities and capacity
 * of the underlying device, set up the physical device.
 *
 * If in layer 3 mode, then do nothing.
 *
 * If in layer 2 programmed mode, attempt to program the unicast address
 * associated with the port into the physical device. If this is not
 * possible due to resource exhaustion, or simply because the device does
 * not support multiple unicast addresses, then if required fall back to
 * putting the card into promiscuous mode.
 *
 * If in promiscuous mode, then simply set the card into promiscuous mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
	mac_multi_addr_t	mac_addr;
	int			err;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) {
		return (vsw_set_hw_promisc(vswp, port, type));
	}

	/*
	 * Attempt to program the unicast address into the HW.
	 */
	mac_addr.mma_addrlen = ETHERADDRL;
	if (type == VSW_VNETPORT) {
		ASSERT(port != NULL);
		ether_copy(&port->p_macaddr, &mac_addr.mma_addr);
	} else {
		READ_ENTER(&vswp->if_lockrw);
		/*
		 * Don't program if the interface is not UP. This
		 * is possible if the address has just been changed
		 * in the MD node, but the interface has not yet been
		 * plumbed.
		 */
		if (!(vswp->if_state & VSW_IF_UP)) {
			RW_EXIT(&vswp->if_lockrw);
			return (0);
		}
		ether_copy(&vswp->if_addr, &mac_addr.mma_addr);
		RW_EXIT(&vswp->if_lockrw);
	}

	err = vsw_set_hw_addr(vswp, &mac_addr);
	if (err != 0) {
		/*
		 * Mark that an attempt should be made to re-config
		 * sometime in the future if a port is deleted.
		 */
		vswp->recfg_reqd = B_TRUE;

		/*
		 * Only 1 mode specified, nothing more to do.
		 */
		if (vswp->smode_num == 1)
			return (err);

		/*
		 * If promiscuous was the next mode specified, try to
		 * set the card into that mode.
		 */
		if ((vswp->smode_idx <= (vswp->smode_num - 2)) &&
		    (vswp->smode[vswp->smode_idx + 1] ==
		    VSW_LAYER2_PROMISC)) {
			vswp->smode_idx += 1;
			return (vsw_set_hw_promisc(vswp, port, type));
		}
		return (err);
	}

	if (type == VSW_VNETPORT) {
		port->addr_slot = mac_addr.mma_slot;
		port->addr_set = VSW_ADDR_HW;
	} else {
		vswp->addr_slot = mac_addr.mma_slot;
		vswp->addr_set = VSW_ADDR_HW;
	}

	D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x into slot %d "
	    "of device %s",
	    mac_addr.mma_addr[0], mac_addr.mma_addr[1],
	    mac_addr.mma_addr[2], mac_addr.mma_addr[3],
	    mac_addr.mma_addr[4], mac_addr.mma_addr[5],
	    mac_addr.mma_slot, vswp->physname);

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * If in layer 3 mode, do nothing.
 *
 * If in layer 2 switched mode, remove the address from the physical
 * device.
 *
 * If in layer 2 promiscuous mode, disable promiscuous mode.
 *
 * Returns 0 on success.
 */
static int
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
	mac_addr_slot_t	slot;
	int		rv = 0;	/* in case no address was actually set */

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->smode[vswp->smode_idx] == VSW_LAYER3)
		return (0);

	switch (type) {
	case VSW_VNETPORT:
		ASSERT(port != NULL);

		if (port->addr_set == VSW_ADDR_PROMISC) {
			return (vsw_unset_hw_promisc(vswp, port, type));

		} else if (port->addr_set == VSW_ADDR_HW) {
			slot = port->addr_slot;
			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
				port->addr_set = VSW_ADDR_UNSET;
		}

		break;

	case VSW_LOCALDEV:
		if (vswp->addr_set == VSW_ADDR_PROMISC) {
			return (vsw_unset_hw_promisc(vswp, NULL, type));

		} else if (vswp->addr_set == VSW_ADDR_HW) {
			slot = vswp->addr_slot;
			if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0)
				vswp->addr_set = VSW_ADDR_UNSET;
		}

		break;

	default:
		/* should never happen */
		DERR(vswp, "%s: unknown type %d", __func__, type);
		ASSERT(0);
		return (1);
	}

	D1(vswp, "%s: exit", __func__);
	return (rv);
}

/*
 * Attempt to program a unicast address into HW.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac)
{
	void	*mah;
	int	rv;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	mah = vswp->maddr.maddr_handle;

	rv = vswp->maddr.maddr_add(mah, mac);

	if (rv == 0)
		return (0);
	/*
	 * It's okay for the add to fail because we may have exhausted
	 * all the resources in the hardware device. Any other error
	 * we do want to flag.
	 */
	if (rv != ENOSPC) {
		cmn_err(CE_WARN, "!vsw%d: error programming "
		    "address %x:%x:%x:%x:%x:%x into HW "
		    "err (%d)", vswp->instance,
		    mac->mma_addr[0], mac->mma_addr[1],
		    mac->mma_addr[2], mac->mma_addr[3],
		    mac->mma_addr[4], mac->mma_addr[5], rv);
	}
	D1(vswp, "%s: exit", __func__);
	return (1);
}

/*
 * Remove a unicast mac address which has previously been programmed
 * into HW.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_addr(vsw_t *vswp, int slot)
{
	void	*mah;
	int	rv;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT(slot >= 0);

	if (vswp->maddr.maddr_handle == NULL)
		return (1);

	mah = vswp->maddr.maddr_handle;

	rv = vswp->maddr.maddr_remove(mah, slot);
	if (rv != 0) {
		cmn_err(CE_WARN, "!vsw%d: unable to remove address "
		    "from slot %d in device %s (err %d)",
		    vswp->instance, slot, vswp->physname, rv);
		return (1);
	}

	D2(vswp, "removed addr from slot %d in device %s",
	    slot, vswp->physname);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Set the network card into promiscuous mode.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
{
	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	if (vswp->promisc_cnt++ == 0) {
		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt--;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}
		cmn_err(CE_NOTE, "!vsw%d: switching device %s into "
		    "promiscuous mode", vswp->instance, vswp->physname);
	}
	mutex_exit(&vswp->mac_lock);

	if (type == VSW_VNETPORT) {
		ASSERT(port != NULL);
		port->addr_set = VSW_ADDR_PROMISC;
	} else {
		vswp->addr_set = VSW_ADDR_PROMISC;
	}

	D1(vswp, "%s: exit", __func__);

	return (0);
}

/*
 * Turn off promiscuous mode on the network card.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type)
{
	vsw_port_list_t	*plist = &vswp->plist;

	D2(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));
	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

	mutex_enter(&vswp->mac_lock);
	if (vswp->mh == NULL) {
		mutex_exit(&vswp->mac_lock);
		return (1);
	}

	if (--vswp->promisc_cnt == 0) {
		if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) {
			vswp->promisc_cnt++;
			mutex_exit(&vswp->mac_lock);
			return (1);
		}
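		/*
		 * promisc_cnt reference-counts the users of promiscuous
		 * mode (the ports plus the vsw interface itself); the
		 * device only leaves promiscuous mode when the last
		 * user is gone.
		 */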
		/*
		 * We are exiting promiscuous mode either because we were
		 * only in promiscuous mode because we had failed over from
		 * switched mode due to HW resource issues, or because the
		 * user wanted the card in promiscuous mode for all the ports
		 * and the last port is now being deleted. Tweak the message
		 * accordingly.
		 */
		if (plist->num_ports != 0) {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s back to "
			    "programmed mode", vswp->instance, vswp->physname);
		} else {
			cmn_err(CE_NOTE, "!vsw%d: switching device %s out of "
			    "promiscuous mode", vswp->instance,
			    vswp->physname);
		}
	}
	mutex_exit(&vswp->mac_lock);

	if (type == VSW_VNETPORT) {
		ASSERT(port != NULL);
		ASSERT(port->addr_set == VSW_ADDR_PROMISC);
		port->addr_set = VSW_ADDR_UNSET;
	} else {
		ASSERT(vswp->addr_set == VSW_ADDR_PROMISC);
		vswp->addr_set = VSW_ADDR_UNSET;
	}

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Determine whether or not we are operating in our preferred
 * mode and, if not, whether the physical resources now allow us
 * to operate in it.
 *
 * If a port is being removed, this should only be invoked after the
 * port has been removed from the port list.
 */
static void
vsw_reconfig_hw(vsw_t *vswp)
{
	int		s_idx;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	if (vswp->maddr.maddr_handle == NULL) {
		return;
	}

	/*
	 * If we are in layer 2 (i.e. switched) mode, or would like to be
	 * in layer 2, then check if any ports or the vswitch itself
	 * need to be programmed into the HW.
	 *
	 * This can happen in two cases - switched was specified as
	 * the preferred mode of operation but we exhausted the HW
	 * resources and so failed over to the next specified mode,
	 * or switched was the only mode specified so after HW
	 * resources were exhausted there was nothing more we
	 * could do.
	 */
	if (vswp->smode_idx > 0)
		s_idx = vswp->smode_idx - 1;
	else
		s_idx = vswp->smode_idx;

	if (vswp->smode[s_idx] != VSW_LAYER2) {
		return;
	}

	D2(vswp, "%s: attempting reconfig..", __func__);

	/*
	 * First, attempt to set the vswitch mac address into HW,
	 * if required.
	 */
	if (vsw_prog_if(vswp)) {
		return;
	}

	/*
	 * Next, attempt to set any ports which have not yet been
	 * programmed into HW.
	 */
	if (vsw_prog_ports(vswp)) {
		return;
	}

	/*
	 * By now we know that we have programmed all desired ports etc
	 * into HW, so it is safe to mark reconfiguration as complete.
	 */
	vswp->recfg_reqd = B_FALSE;

	vswp->smode_idx = s_idx;

	D1(vswp, "%s: exit", __func__);
}

/*
 * Check to see if vsw itself is plumbed, and if so whether or not
 * its mac address should be written into HW.
 *
 * Returns 0 if it could set the address, or didn't have to set it.
 * Returns 1 if it failed to set the address.
 */
static int
vsw_prog_if(vsw_t *vswp)
{
	mac_multi_addr_t	addr;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	READ_ENTER(&vswp->if_lockrw);
	if ((vswp->if_state & VSW_IF_UP) &&
	    (vswp->addr_set != VSW_ADDR_HW)) {

		addr.mma_addrlen = ETHERADDRL;
		ether_copy(&vswp->if_addr, &addr.mma_addr);

		if (vsw_set_hw_addr(vswp, &addr) != 0) {
			RW_EXIT(&vswp->if_lockrw);
			return (1);
		}

		vswp->addr_slot = addr.mma_slot;
		/*
		 * If when plumbed we previously had to place the
		 * interface into promiscuous mode, now reverse that.
		 *
		 * Note that the interface will only actually be set into
		 * non-promiscuous mode when the last port/interface has
		 * been programmed into HW.
		 */
		if (vswp->addr_set == VSW_ADDR_PROMISC)
			(void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV);

		vswp->addr_set = VSW_ADDR_HW;
	}
	RW_EXIT(&vswp->if_lockrw);

	D1(vswp, "%s: exit", __func__);
	return (0);
}

/*
 * Scan the port list for any ports which have not yet been set
 * into HW. For those found, attempt to program their mac addresses
 * into the physical device.
 *
 * Returns 0 if able to program all required ports (can be 0) into HW.
 * Returns 1 if it failed to set at least one mac address.
 */
static int
vsw_prog_ports(vsw_t *vswp)
{
	mac_multi_addr_t	addr;
	vsw_port_list_t		*plist = &vswp->plist;
	vsw_port_t		*tp;
	int			rv = 0;

	D1(vswp, "%s: enter", __func__);

	ASSERT(MUTEX_HELD(&vswp->hw_lock));

	READ_ENTER(&plist->lockrw);
	for (tp = plist->head; tp != NULL; tp = tp->p_next) {
		if (tp->addr_set != VSW_ADDR_HW) {
			addr.mma_addrlen = ETHERADDRL;
			ether_copy(&tp->p_macaddr, &addr.mma_addr);

			if (vsw_set_hw_addr(vswp, &addr) != 0) {
				rv = 1;
				break;
			}

			tp->addr_slot = addr.mma_slot;

			/*
			 * If when this port first attached we had to
			 * place the interface into promiscuous mode,
			 * then now reverse that.
			 *
			 * Note that the interface will not actually
			 * change to non-promiscuous mode until all ports
			 * have been programmed.
			 */
			if (tp->addr_set == VSW_ADDR_PROMISC)
				(void) vsw_unset_hw_promisc(vswp,
				    tp, VSW_VNETPORT);

			tp->addr_set = VSW_ADDR_HW;
		}
	}
	RW_EXIT(&plist->lockrw);

	D1(vswp, "%s: exit", __func__);
	return (rv);
}

static void
vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp)
{
	ringp->ring_state = VSW_MAC_RING_FREE;
	ringp->ring_arg = NULL;
	ringp->ring_blank = NULL;
	ringp->ring_vqp = NULL;
	ringp->ring_vswp = vswp;
}

static void
vsw_mac_ring_tbl_init(vsw_t *vswp)
{
	int		i;

	mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL);

	vswp->mac_ring_tbl_sz = vsw_mac_rx_rings;
	vswp->mac_ring_tbl =
	    kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP);

	for (i = 0; i < vswp->mac_ring_tbl_sz; i++)
		vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]);
}

static void
vsw_mac_ring_tbl_destroy(vsw_t *vswp)
{
	int		i;
	vsw_mac_ring_t	*ringp;

	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		ringp = &vswp->mac_ring_tbl[i];

		if (ringp->ring_state != VSW_MAC_RING_FREE) {
			/*
			 * Destroy the queue.
			 */
			vsw_queue_stop(ringp->ring_vqp);
			vsw_queue_destroy(ringp->ring_vqp);

			/*
			 * Re-initialize the structure.
			 */
			vsw_mac_ring_tbl_entry_init(vswp, ringp);
		}
	}
	mutex_exit(&vswp->mac_ring_lock);

	mutex_destroy(&vswp->mac_ring_lock);
	kmem_free(vswp->mac_ring_tbl,
	    vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t));
	vswp->mac_ring_tbl_sz = 0;
}
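/*
 * Ring table lifecycle: vsw_mac_ring_tbl_init() is called from
 * vsw_mac_attach() when vsw_multi_ring_enable is set, entries are
 * claimed by vsw_mac_ring_add_cb() as the underlying driver announces
 * its RX rings, and vsw_mac_ring_tbl_destroy() stops each ring's
 * worker queue again on detach.
 */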
/*
 * Handle resource add callbacks from the driver below.
 */
static mac_resource_handle_t
vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp)
{
	vsw_t		*vswp = (vsw_t *)arg;
	mac_rx_fifo_t	*mrfp = (mac_rx_fifo_t *)mrp;
	vsw_mac_ring_t	*ringp;
	vsw_queue_t	*vqp;
	int		i;

	ASSERT(vswp != NULL);
	ASSERT(mrp != NULL);
	ASSERT(vswp->mac_ring_tbl != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Check to make sure we have the correct resource type.
	 */
	if (mrp->mr_type != MAC_RX_FIFO)
		return (NULL);

	/*
	 * Find an open entry in the ring table.
	 */
	mutex_enter(&vswp->mac_ring_lock);
	for (i = 0; i < vswp->mac_ring_tbl_sz; i++) {
		ringp = &vswp->mac_ring_tbl[i];

		/*
		 * Check for an empty slot; if found, then set up the
		 * queue and thread.
		 */
		if (ringp->ring_state == VSW_MAC_RING_FREE) {
			/*
			 * Create the queue for this ring.
			 */
			vqp = vsw_queue_create();

			/*
			 * Initialize the ring data structure.
			 */
			ringp->ring_vqp = vqp;
			ringp->ring_arg = mrfp->mrf_arg;
			ringp->ring_blank = mrfp->mrf_blank;
			ringp->ring_state = VSW_MAC_RING_INUSE;

			/*
			 * Create the worker thread.
			 */
			vqp->vq_worker = thread_create(NULL, 0,
			    vsw_queue_worker, ringp, 0, &p0,
			    TS_RUN, minclsyspri);
			if (vqp->vq_worker == NULL) {
				vsw_queue_destroy(vqp);
				vsw_mac_ring_tbl_entry_init(vswp, ringp);
				ringp = NULL;
			}

			if (ringp != NULL) {
				/*
				 * Make sure the thread gets to the
				 * running state for this ring.
				 */
				mutex_enter(&vqp->vq_lock);
				while ((vqp->vq_state != VSW_QUEUE_RUNNING) &&
				    (vqp->vq_state != VSW_QUEUE_DRAINED)) {
					cv_wait(&vqp->vq_cv, &vqp->vq_lock);
				}

				/*
				 * If the thread is not running, clean up.
				 */
				if (vqp->vq_state == VSW_QUEUE_DRAINED) {
					vsw_queue_destroy(vqp);
					vsw_mac_ring_tbl_entry_init(vswp,
					    ringp);
					ringp = NULL;
				}
				mutex_exit(&vqp->vq_lock);
			}

			mutex_exit(&vswp->mac_ring_lock);
			D1(vswp, "%s: exit", __func__);
			return ((mac_resource_handle_t)ringp);
		}
	}
	mutex_exit(&vswp->mac_ring_lock);
	/*
	 * No slots in the ring table available.
	 */
	D1(vswp, "%s: exit", __func__);
	return (NULL);
}

static void
vsw_queue_stop(vsw_queue_t *vqp)
{
	mutex_enter(&vqp->vq_lock);

	if (vqp->vq_state == VSW_QUEUE_RUNNING) {
		vqp->vq_state = VSW_QUEUE_STOP;
		cv_signal(&vqp->vq_cv);

		while (vqp->vq_state != VSW_QUEUE_DRAINED)
			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
	}

	vqp->vq_state = VSW_QUEUE_STOPPED;

	mutex_exit(&vqp->vq_lock);
}

static vsw_queue_t *
vsw_queue_create(void)
{
	vsw_queue_t *vqp;

	vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP);

	mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL);
	vqp->vq_first = NULL;
	vqp->vq_last = NULL;
	vqp->vq_state = VSW_QUEUE_STOPPED;

	return (vqp);
}

static void
vsw_queue_destroy(vsw_queue_t *vqp)
{
	cv_destroy(&vqp->vq_cv);
	mutex_destroy(&vqp->vq_lock);
	kmem_free(vqp, sizeof (vsw_queue_t));
}

static void
vsw_queue_worker(vsw_mac_ring_t *rrp)
{
	mblk_t		*mp;
	vsw_queue_t	*vqp = rrp->ring_vqp;
	vsw_t		*vswp = rrp->ring_vswp;

	mutex_enter(&vqp->vq_lock);

	ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED);

	/*
	 * Set the state to running, since the thread is now active.
	 */
	vqp->vq_state = VSW_QUEUE_RUNNING;
	cv_signal(&vqp->vq_cv);

	while (vqp->vq_state == VSW_QUEUE_RUNNING) {
		/*
		 * Wait for work to do, or for the state to change
		 * to not running.
		 */
		while ((vqp->vq_state == VSW_QUEUE_RUNNING) &&
		    (vqp->vq_first == NULL)) {
			cv_wait(&vqp->vq_cv, &vqp->vq_lock);
		}

		/*
		 * Process packets that we received from the interface.
		 */
		if (vqp->vq_first != NULL) {
			mp = vqp->vq_first;

			vqp->vq_first = NULL;
			vqp->vq_last = NULL;

			mutex_exit(&vqp->vq_lock);

			/* switch the chain of packets received */
			vswp->vsw_switch_frame(vswp, mp,
			    VSW_PHYSDEV, NULL, NULL);

			mutex_enter(&vqp->vq_lock);
		}
	}

	/*
	 * We are drained; signal that we are done.
	 */
	vqp->vq_state = VSW_QUEUE_DRAINED;
	cv_signal(&vqp->vq_cv);

	/*
	 * Release the lock before exiting the thread.
	 */
	mutex_exit(&vqp->vq_lock);

	/*
	 * Exit the thread.
	 */
	thread_exit();
}

/*
 * vsw_rx_queue_cb() - Receive callback routine when
 * vsw_multi_ring_enable is non-zero. Queue the packets
 * to a packet queue for a worker thread to process.
 */
static void
vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
{
	vsw_mac_ring_t	*ringp = (vsw_mac_ring_t *)mrh;
	vsw_t		*vswp = (vsw_t *)arg;
	vsw_queue_t	*vqp;
	mblk_t		*bp, *last;

	ASSERT(mrh != NULL);
	ASSERT(vswp != NULL);
	ASSERT(mp != NULL);

	D1(vswp, "%s: enter", __func__);

	/*
	 * Find the last element in the mblk chain.
	 */
	bp = mp;
	do {
		last = bp;
		bp = bp->b_next;
	} while (bp != NULL);

	/* Get the queue for the packets */
	vqp = ringp->ring_vqp;
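	/*
	 * Note that the tail of the chain was located above, before
	 * taking vq_lock, so the work done while holding the queue
	 * lock is a constant-time append.
	 */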
*/ 2130 mutex_enter(&vqp->vq_lock); 2131 2132 if (vqp->vq_state != VSW_QUEUE_RUNNING) { 2133 freemsg(mp); 2134 mutex_exit(&vqp->vq_lock); 2135 goto vsw_rx_queue_cb_exit; 2136 } 2137 2138 /* 2139 * Add the mblk chain to the queue. If there 2140 * are already mblks on the queue, then add the new 2141 * chain to the end. 2142 */ 2143 if (vqp->vq_first == NULL) 2144 vqp->vq_first = mp; 2145 else 2146 vqp->vq_last->b_next = mp; 2147 2148 vqp->vq_last = last; 2149 2150 /* 2151 * Signal the worker thread that there is work to 2152 * do. 2153 */ 2154 cv_signal(&vqp->vq_cv); 2155 2156 /* 2157 * Let go of the lock and exit. 2158 */ 2159 mutex_exit(&vqp->vq_lock); 2160 2161 vsw_rx_queue_cb_exit: 2162 D1(vswp, "%s: exit", __func__); 2163 } 2164 2165 /* 2166 * Receive callback routine. Invoked by the MAC layer when there 2167 * are pkts being passed up from the physical device. 2168 * 2169 * PERF: It may be more efficient when the card is in promisc 2170 * mode to check the dest address of the pkts here (against 2171 * the FDB) rather than checking later. Needs to be investigated. 2172 */ 2173 static void 2174 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) 2175 { 2176 _NOTE(ARGUNUSED(mrh)) 2177 2178 vsw_t *vswp = (vsw_t *)arg; 2179 2180 ASSERT(vswp != NULL); 2181 2182 D1(vswp, "vsw_rx_cb: enter"); 2183 2184 /* switch the chain of packets received */ 2185 vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); 2186 2187 D1(vswp, "vsw_rx_cb: exit"); 2188 } 2189 2190 /* 2191 * Send a message out over the physical device via the MAC layer. 2192 * 2193 * Returns any mblks that it was unable to transmit. 2194 */ 2195 static mblk_t * 2196 vsw_tx_msg(vsw_t *vswp, mblk_t *mp) 2197 { 2198 const mac_txinfo_t *mtp; 2199 mblk_t *nextp; 2200 2201 mutex_enter(&vswp->mac_lock); 2202 if (vswp->mh == NULL) { 2203 DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); 2204 mutex_exit(&vswp->mac_lock); 2205 return (mp); 2206 } else { 2207 for (;;) { 2208 nextp = mp->b_next; 2209 mp->b_next = NULL; 2210 2211 mtp = vswp->txinfo; 2212 2213 if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { 2214 mp->b_next = nextp; 2215 break; 2216 } 2217 2218 if ((mp = nextp) == NULL) 2219 break; 2220 } 2221 } 2222 mutex_exit(&vswp->mac_lock); 2223 2224 return (mp); 2225 } 2226 2227 /* 2228 * Register with the MAC layer as a network device, so we 2229 * can be plumbed if necessary.
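 * ('Plumbed' here meaning that the vswN device can then be configured and used as a regular network interface from within the domain.)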
2230 */ 2231 static int 2232 vsw_mac_register(vsw_t *vswp) 2233 { 2234 mac_register_t *macp; 2235 int rv; 2236 2237 D1(vswp, "%s: enter", __func__); 2238 2239 if ((macp = mac_alloc(MAC_VERSION)) == NULL) 2240 return (EINVAL); 2241 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 2242 macp->m_driver = vswp; 2243 macp->m_dip = vswp->dip; 2244 macp->m_src_addr = (uint8_t *)&vswp->if_addr; 2245 macp->m_callbacks = &vsw_m_callbacks; 2246 macp->m_min_sdu = 0; 2247 macp->m_max_sdu = ETHERMTU; 2248 rv = mac_register(macp, &vswp->if_mh); 2249 mac_free(macp); 2250 if (rv == 0) 2251 vswp->if_state |= VSW_IF_REG; 2252 2253 D1(vswp, "%s: exit", __func__); 2254 2255 return (rv); 2256 } 2257 2258 static int 2259 vsw_mac_unregister(vsw_t *vswp) 2260 { 2261 int rv = 0; 2262 2263 D1(vswp, "%s: enter", __func__); 2264 2265 WRITE_ENTER(&vswp->if_lockrw); 2266 2267 if (vswp->if_state & VSW_IF_REG) { 2268 rv = mac_unregister(vswp->if_mh); 2269 if (rv != 0) { 2270 DWARN(vswp, "%s: unable to unregister from MAC " 2271 "framework", __func__); 2272 2273 RW_EXIT(&vswp->if_lockrw); 2274 D1(vswp, "%s: fail exit", __func__); 2275 return (rv); 2276 } 2277 2278 /* mark i/f as down and unregistered */ 2279 vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG); 2280 } 2281 RW_EXIT(&vswp->if_lockrw); 2282 2283 D1(vswp, "%s: exit", __func__); 2284 2285 return (rv); 2286 } 2287 2288 static int 2289 vsw_m_stat(void *arg, uint_t stat, uint64_t *val) 2290 { 2291 vsw_t *vswp = (vsw_t *)arg; 2292 2293 D1(vswp, "%s: enter", __func__); 2294 2295 mutex_enter(&vswp->mac_lock); 2296 if (vswp->mh == NULL) { 2297 mutex_exit(&vswp->mac_lock); 2298 return (EINVAL); 2299 } 2300 2301 /* return stats from underlying device */ 2302 *val = mac_stat_get(vswp->mh, stat); 2303 2304 mutex_exit(&vswp->mac_lock); 2305 2306 return (0); 2307 } 2308 2309 static void 2310 vsw_m_stop(void *arg) 2311 { 2312 vsw_t *vswp = (vsw_t *)arg; 2313 2314 D1(vswp, "%s: enter", __func__); 2315 2316 WRITE_ENTER(&vswp->if_lockrw); 2317 vswp->if_state &= ~VSW_IF_UP; 2318 RW_EXIT(&vswp->if_lockrw); 2319 2320 mutex_enter(&vswp->hw_lock); 2321 2322 (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); 2323 2324 if (vswp->recfg_reqd) 2325 vsw_reconfig_hw(vswp); 2326 2327 mutex_exit(&vswp->hw_lock); 2328 2329 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2330 } 2331 2332 static int 2333 vsw_m_start(void *arg) 2334 { 2335 vsw_t *vswp = (vsw_t *)arg; 2336 2337 D1(vswp, "%s: enter", __func__); 2338 2339 WRITE_ENTER(&vswp->if_lockrw); 2340 vswp->if_state |= VSW_IF_UP; 2341 RW_EXIT(&vswp->if_lockrw); 2342 2343 mutex_enter(&vswp->hw_lock); 2344 (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV); 2345 mutex_exit(&vswp->hw_lock); 2346 2347 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 2348 return (0); 2349 } 2350 2351 /* 2352 * Change the local interface address. 2353 * 2354 * Note: we don't support this entry point. The local 2355 * mac address of the switch can only be changed via its 2356 * MD node properties. 2357 */ 2358 static int 2359 vsw_m_unicst(void *arg, const uint8_t *macaddr) 2360 { 2361 _NOTE(ARGUNUSED(arg, macaddr)) 2362 2363 return (DDI_FAILURE); 2364 } 2365 2366 static int 2367 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) 2368 { 2369 vsw_t *vswp = (vsw_t *)arg; 2370 mcst_addr_t *mcst_p = NULL; 2371 uint64_t addr = 0x0; 2372 int i, ret = 0; 2373 2374 D1(vswp, "%s: enter", __func__); 2375 2376 /* 2377 * Convert address into form that can be used 2378 * as hash table key. 
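 * For example, the address 01:00:5e:00:00:01 becomes the 48-bit key 0x01005e000001.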
*/ 2380 for (i = 0; i < ETHERADDRL; i++) { 2381 addr = (addr << 8) | mca[i]; 2382 } 2383 2384 D2(vswp, "%s: addr = 0x%llx", __func__, addr); 2385 2386 if (add) { 2387 D2(vswp, "%s: adding multicast", __func__); 2388 if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2389 /* 2390 * Update the list of multicast addresses 2391 * contained within the vsw_t structure to 2392 * include this new one. 2393 */ 2394 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP); 2395 if (mcst_p == NULL) { 2396 DERR(vswp, "%s unable to alloc mem", __func__); 2397 return (1); 2398 } 2399 mcst_p->addr = addr; 2400 2401 mutex_enter(&vswp->mca_lock); 2402 mcst_p->nextp = vswp->mcap; 2403 vswp->mcap = mcst_p; 2404 mutex_exit(&vswp->mca_lock); 2405 2406 /* 2407 * Call into the underlying driver to program the 2408 * address into HW. 2409 */ 2410 mutex_enter(&vswp->mac_lock); 2411 if (vswp->mh != NULL) { 2412 ret = mac_multicst_add(vswp->mh, mca); 2413 if (ret != 0) { 2414 cmn_err(CE_WARN, "!vsw%d: unable to " 2415 "add multicast address", 2416 vswp->instance); 2417 mutex_exit(&vswp->mac_lock); 2418 goto vsw_remove_addr; 2419 } 2420 } 2421 mutex_exit(&vswp->mac_lock); 2422 } else { 2423 cmn_err(CE_WARN, "!vsw%d: unable to add multicast " 2424 "address", vswp->instance); 2425 } 2426 return (ret); 2427 } 2428 2429 vsw_remove_addr: 2430 2431 D2(vswp, "%s: removing multicast", __func__); 2432 /* 2433 * Remove the address from the hash table.. 2434 */ 2435 if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 2436 2437 /* 2438 * ..and then from the list maintained in the 2439 * vsw_t structure. 2440 */ 2441 vsw_del_addr(VSW_LOCALDEV, vswp, addr); 2442 2443 mutex_enter(&vswp->mac_lock); 2444 if (vswp->mh != NULL) 2445 (void) mac_multicst_remove(vswp->mh, mca); 2446 mutex_exit(&vswp->mac_lock); 2447 } 2448 2449 D1(vswp, "%s: exit", __func__); 2450 2451 return (0); 2452 } 2453 2454 static int 2455 vsw_m_promisc(void *arg, boolean_t on) 2456 { 2457 vsw_t *vswp = (vsw_t *)arg; 2458 2459 D1(vswp, "%s: enter", __func__); 2460 2461 WRITE_ENTER(&vswp->if_lockrw); 2462 if (on) 2463 vswp->if_state |= VSW_IF_PROMISC; 2464 else 2465 vswp->if_state &= ~VSW_IF_PROMISC; 2466 RW_EXIT(&vswp->if_lockrw); 2467 2468 D1(vswp, "%s: exit", __func__); 2469 2470 return (0); 2471 } 2472 2473 static mblk_t * 2474 vsw_m_tx(void *arg, mblk_t *mp) 2475 { 2476 vsw_t *vswp = (vsw_t *)arg; 2477 2478 D1(vswp, "%s: enter", __func__); 2479 2480 vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); 2481 2482 D1(vswp, "%s: exit", __func__); 2483 2484 return (NULL); 2485 } 2486 2487 /* 2488 * Register for machine description (MD) updates. 2489 * 2490 * Returns 0 on success, 1 on failure. 2491 */ 2492 static int 2493 vsw_mdeg_register(vsw_t *vswp) 2494 { 2495 mdeg_prop_spec_t *pspecp; 2496 mdeg_node_spec_t *inst_specp; 2497 mdeg_handle_t mdeg_hdl, mdeg_port_hdl; 2498 size_t templatesz; 2499 int inst, rv; 2500 2501 D1(vswp, "%s: enter", __func__); 2502 2503 /* 2504 * In each 'virtual-device' node in the MD there is a 2505 * 'cfg-handle' property which is the MD's concept of 2506 * an instance number (this may be completely different from 2507 * the device driver's instance #). OBP reads that value and 2508 * stores it in the 'reg' property of the appropriate node in 2509 * the device tree. So we use the 'reg' value when registering 2510 * with the mdeg framework, to ensure we get events for the 2511 * correct nodes.
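 * For example, a vsw MD node with a cfg-handle of 0x1 shows up in the device tree with a 'reg' value of 0x1, and that is the value read below.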
2512 */ 2513 inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, 2514 DDI_PROP_DONTPASS, reg_propname, -1); 2515 if (inst == -1) { 2516 cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from " 2517 "OBP device tree", vswp->instance, reg_propname); 2518 return (1); 2519 } 2520 2521 D2(vswp, "%s: instance %d registering with mdeg", __func__, inst); 2522 2523 /* 2524 * Allocate and initialize a per-instance copy 2525 * of the global property spec array that will 2526 * uniquely identify this vsw instance. 2527 */ 2528 templatesz = sizeof (vsw_prop_template); 2529 pspecp = kmem_zalloc(templatesz, KM_SLEEP); 2530 2531 bcopy(vsw_prop_template, pspecp, templatesz); 2532 2533 VSW_SET_MDEG_PROP_INST(pspecp, inst); 2534 2535 /* initialize the complete prop spec structure */ 2536 inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 2537 inst_specp->namep = "virtual-device"; 2538 inst_specp->specp = pspecp; 2539 2540 /* 2541 * Register an interest in 'virtual-device' nodes with a 2542 * 'name' property of 'virtual-network-switch' 2543 */ 2544 rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb, 2545 (void *)vswp, &mdeg_hdl); 2546 if (rv != MDEG_SUCCESS) { 2547 DERR(vswp, "%s: mdeg_register failed (%d) for vsw node", 2548 __func__, rv); 2549 goto mdeg_reg_fail; 2550 } 2551 2552 /* 2553 * Register an interest in 'vsw-port' nodes. 2554 */ 2555 rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb, 2556 (void *)vswp, &mdeg_port_hdl); 2557 if (rv != MDEG_SUCCESS) { 2558 DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); 2559 (void) mdeg_unregister(mdeg_hdl); 2560 goto mdeg_reg_fail; 2561 } 2562 2563 /* save off data that will be needed later */ 2564 vswp->inst_spec = inst_specp; 2565 vswp->mdeg_hdl = mdeg_hdl; 2566 vswp->mdeg_port_hdl = mdeg_port_hdl; 2567 2568 D1(vswp, "%s: exit", __func__); 2569 return (0); 2570 2571 mdeg_reg_fail: 2572 cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks", 2573 vswp->instance); 2574 kmem_free(pspecp, templatesz); 2575 kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); 2576 2577 vswp->mdeg_hdl = NULL; 2578 vswp->mdeg_port_hdl = NULL; 2579 2580 return (1); 2581 } 2582 2583 static void 2584 vsw_mdeg_unregister(vsw_t *vswp) 2585 { 2586 D1(vswp, "vsw_mdeg_unregister: enter"); 2587 2588 if (vswp->mdeg_hdl != NULL) 2589 (void) mdeg_unregister(vswp->mdeg_hdl); 2590 2591 if (vswp->mdeg_port_hdl != NULL) 2592 (void) mdeg_unregister(vswp->mdeg_port_hdl); 2593 2594 if (vswp->inst_spec != NULL) { 2595 if (vswp->inst_spec->specp != NULL) { 2596 (void) kmem_free(vswp->inst_spec->specp, 2597 sizeof (vsw_prop_template)); 2598 vswp->inst_spec->specp = NULL; 2599 } 2600 2601 (void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t)); 2602 vswp->inst_spec = NULL; 2603 } 2604 2605 D1(vswp, "vsw_mdeg_unregister: exit"); 2606 } 2607 2608 /* 2609 * Mdeg callback invoked for the vsw node itself. 2610 */ 2611 static int 2612 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2613 { 2614 vsw_t *vswp; 2615 int idx; 2616 md_t *mdp; 2617 mde_cookie_t node; 2618 uint64_t inst; 2619 char *node_name = NULL; 2620 2621 if (resp == NULL) 2622 return (MDEG_FAILURE); 2623 2624 vswp = (vsw_t *)cb_argp; 2625 2626 D1(vswp, "%s: added %d : removed %d : curr matched %d" 2627 " : prev matched %d", __func__, resp->added.nelem, 2628 resp->removed.nelem, resp->match_curr.nelem, 2629 resp->match_prev.nelem); 2630 2631 /* 2632 * Expect 'added' to be non-zero if virtual-network-switch 2633 * nodes exist in the MD when the driver attaches. 
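 * Each added node is handed to vsw_get_initial_md_properties() below to read its start-of-day settings.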
*/ 2635 for (idx = 0; idx < resp->added.nelem; idx++) { 2636 mdp = resp->added.mdp; 2637 node = resp->added.mdep[idx]; 2638 2639 if (md_get_prop_str(mdp, node, "name", &node_name) != 0) { 2640 DERR(vswp, "%s: unable to get node name for " 2641 "node(%d) 0x%lx", __func__, idx, node); 2642 continue; 2643 } 2644 2645 if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) { 2646 DERR(vswp, "%s: prop(cfg-handle) not found port(%d)", 2647 __func__, idx); 2648 continue; 2649 } 2650 2651 D2(vswp, "%s: added node(%d) 0x%lx with name %s " 2652 "and inst %d", __func__, idx, node, node_name, inst); 2653 2654 vsw_get_initial_md_properties(vswp, mdp, node); 2655 } 2656 2657 /* 2658 * A non-zero 'match' value indicates that the MD has been 2659 * updated and that a virtual-network-switch node is present 2660 * which may or may not have been updated. It is up to the clients 2661 * to examine their own nodes and determine if they have changed. 2662 */ 2663 for (idx = 0; idx < resp->match_curr.nelem; idx++) { 2664 mdp = resp->match_curr.mdp; 2665 node = resp->match_curr.mdep[idx]; 2666 2667 if (md_get_prop_str(mdp, node, "name", &node_name) != 0) { 2668 DERR(vswp, "%s: unable to get node name for " 2669 "node(%d) 0x%lx", __func__, idx, node); 2670 continue; 2671 } 2672 2673 if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) { 2674 DERR(vswp, "%s: prop(cfg-handle) not found port(%d)", 2675 __func__, idx); 2676 continue; 2677 } 2678 2679 D2(vswp, "%s: changed node(%d) 0x%lx with name %s " 2680 "and inst %d", __func__, idx, node, node_name, inst); 2681 2682 vsw_update_md_prop(vswp, mdp, node); 2683 } 2684 2685 return (MDEG_SUCCESS); 2686 } 2687 2688 /* 2689 * Mdeg callback invoked for changes to the vsw-port nodes 2690 * under the vsw node. 2691 */ 2692 static int 2693 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2694 { 2695 vsw_t *vswp; 2696 int idx; 2697 md_t *mdp; 2698 mde_cookie_t node; 2699 uint64_t inst; 2700 2701 if ((resp == NULL) || (cb_argp == NULL)) 2702 return (MDEG_FAILURE); 2703 2704 vswp = (vsw_t *)cb_argp; 2705 2706 D2(vswp, "%s: added %d : removed %d : curr matched %d" 2707 " : prev matched %d", __func__, resp->added.nelem, 2708 resp->removed.nelem, resp->match_curr.nelem, 2709 resp->match_prev.nelem); 2710 2711 /* process added ports */ 2712 for (idx = 0; idx < resp->added.nelem; idx++) { 2713 mdp = resp->added.mdp; 2714 node = resp->added.mdep[idx]; 2715 2716 D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node); 2717 2718 if (vsw_port_add(vswp, mdp, &node) != 0) { 2719 cmn_err(CE_WARN, "!vsw%d: Unable to add new port " 2720 "(0x%lx)", vswp->instance, node); 2721 } 2722 } 2723 2724 /* process removed ports */ 2725 for (idx = 0; idx < resp->removed.nelem; idx++) { 2726 mdp = resp->removed.mdp; 2727 node = resp->removed.mdep[idx]; 2728 2729 if (md_get_prop_val(mdp, node, id_propname, &inst)) { 2730 DERR(vswp, "%s: prop(%s) not found in port(%d)", 2731 __func__, id_propname, idx); 2732 continue; 2733 } 2734 2735 D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node); 2736 2737 if (vsw_port_detach(vswp, inst) != 0) { 2738 cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld", 2739 vswp->instance, inst); 2740 } 2741 } 2742 2743 /* 2744 * Currently no support for updating already active ports. 2745 * So, ignore the match_curr and match_prev arrays for now. 2746 */ 2747 2748 D1(vswp, "%s: exit", __func__); 2749 2750 return (MDEG_SUCCESS); 2751 } 2752 2753 /* 2754 * Read the initial start-of-day values from the specified MD node.
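 * These are: the name of the physical device, the MAC address of the vswitch itself, and the requested switching mode(s).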
*/ 2756 static void 2757 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) 2758 { 2759 int i; 2760 uint64_t macaddr = 0; 2761 2762 D1(vswp, "%s: enter", __func__); 2763 2764 if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) == 0) { 2765 /* 2766 * Note it is valid for the physname property to 2767 * be NULL so check actual name length to determine 2768 * if we have an actual device name. 2769 */ 2770 if (strlen(vswp->physname) > 0) 2771 vswp->mdprops |= VSW_MD_PHYSNAME; 2772 } else { 2773 cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " 2774 "device from MD", vswp->instance); 2775 return; 2776 } 2777 2778 /* mac address for vswitch device itself */ 2779 if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { 2780 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", 2781 vswp->instance); 2782 2783 /* 2784 * Fall back to using the mac address of the physical 2785 * device. 2786 */ 2787 if (vsw_get_physaddr(vswp) == 0) { 2788 cmn_err(CE_NOTE, "!vsw%d: Using MAC address from " 2789 "physical device (%s)", vswp->instance, 2790 vswp->physname); 2791 } else { 2792 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address " 2793 "from device %s", vswp->instance, vswp->physname); 2794 } 2795 } else { 2796 WRITE_ENTER(&vswp->if_lockrw); 2797 for (i = ETHERADDRL - 1; i >= 0; i--) { 2798 vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; 2799 macaddr >>= 8; 2800 } 2801 RW_EXIT(&vswp->if_lockrw); 2802 vswp->mdprops |= VSW_MD_MACADDR; 2803 } 2804 2805 if (vsw_get_md_smodes(vswp, mdp, node, vswp->smode, &vswp->smode_num)) { 2806 cmn_err(CE_WARN, "vsw%d: Unable to read %s property from " 2807 "MD, defaulting to programmed mode", vswp->instance, 2808 smode_propname); 2809 2810 for (i = 0; i < NUM_SMODES; i++) 2811 vswp->smode[i] = VSW_LAYER2; 2812 2813 vswp->smode_num = NUM_SMODES; 2814 } else { 2815 ASSERT(vswp->smode_num != 0); 2816 vswp->mdprops |= VSW_MD_SMODE; 2817 } 2818 2819 /* 2820 * Unable to set up any switching mode; nothing more 2821 * we can do. 2822 */ 2823 if (vsw_setup_switching(vswp)) 2824 return; 2825 2826 WRITE_ENTER(&vswp->if_lockrw); 2827 vswp->if_state &= ~VSW_IF_UP; 2828 RW_EXIT(&vswp->if_lockrw); 2829 if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) { 2830 if (vsw_mac_register(vswp) != 0) { 2831 /* 2832 * Treat this as a non-fatal error as we may be 2833 * able to operate in some other mode. 2834 */ 2835 cmn_err(CE_WARN, "vsw%d: Unable to register as " 2836 "provider with MAC layer", vswp->instance); 2837 } 2838 } 2839 2840 D1(vswp, "%s: exit", __func__); 2841 } 2842 2843 /* 2844 * Check to see if the relevant properties in the specified node have 2845 * changed, and if so take the appropriate action. 2846 * 2847 * If any of the properties are missing or invalid we don't take 2848 * any action, as this function should only be invoked when modifications 2849 * have been made to what we assume is a working configuration, which 2850 * we leave active. 2851 * 2852 * Note it is legal for this routine to be invoked even if none of the 2853 * properties in the port node within the MD have actually changed.
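 * (For example, an unrelated change elsewhere in the MD can still trigger a match callback for this node.)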
*/ 2855 static void 2856 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) 2857 { 2858 char physname[LIFNAMSIZ]; 2859 char drv[LIFNAMSIZ]; 2860 uint_t ddi_instance; 2861 uint8_t new_smode[NUM_SMODES]; 2862 int i, smode_num = 0; 2863 uint64_t macaddr = 0; 2864 vsw_port_list_t *plist = &vswp->plist; 2865 vsw_port_t *port = NULL; 2866 enum {MD_init = 0x1, 2867 MD_physname = 0x2, 2868 MD_macaddr = 0x4, 2869 MD_smode = 0x8} updated; 2870 2871 updated = MD_init; 2872 2873 D1(vswp, "%s: enter", __func__); 2874 2875 /* 2876 * Check if name of physical device in MD has changed. 2877 */ 2878 if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) { 2879 /* 2880 * Do basic sanity check on new device name/instance, 2881 * if it's non-NULL. It is valid for the device name to 2882 * have changed from a non-NULL to a NULL value, i.e. 2883 * the vsw is being changed to 'routed' mode. 2884 */ 2885 if ((strlen(physname) != 0) && 2886 (ddi_parse(physname, drv, &ddi_instance) != DDI_SUCCESS)) { 2887 cmn_err(CE_WARN, "!vsw%d: new device name %s is not" 2888 " a valid device name/instance", 2889 vswp->instance, physname); 2890 goto fail_reconf; 2891 } 2892 2893 if (strcmp(physname, vswp->physname)) { 2894 D2(vswp, "%s: device name changed from %s to %s", 2895 __func__, vswp->physname, physname); 2896 2897 updated |= MD_physname; 2898 } else { 2899 D2(vswp, "%s: device name unchanged at %s", 2900 __func__, vswp->physname); 2901 } 2902 } else { 2903 cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical " 2904 "device from updated MD.", vswp->instance); 2905 goto fail_reconf; 2906 } 2907 2908 /* 2909 * Check if MAC address has changed. 2910 */ 2911 if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) { 2912 cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD", 2913 vswp->instance); 2914 goto fail_reconf; 2915 } else { 2916 READ_ENTER(&vswp->if_lockrw); 2917 for (i = ETHERADDRL - 1; i >= 0; i--) { 2918 if (vswp->if_addr.ether_addr_octet[i] != 2919 (macaddr & 0xFF)) { 2920 D2(vswp, "%s: octet[%d] 0x%x != 0x%x", 2921 __func__, i, 2922 vswp->if_addr.ether_addr_octet[i], 2923 (macaddr & 0xFF)); 2924 updated |= MD_macaddr; 2925 break; 2926 } 2927 macaddr >>= 8; 2928 } 2929 RW_EXIT(&vswp->if_lockrw); 2930 } 2931 2932 /* 2933 * Check if switching modes have changed. 2934 */ 2935 if (vsw_get_md_smodes(vswp, mdp, node, new_smode, &smode_num)) { 2936 cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD", 2937 vswp->instance, smode_propname); 2938 goto fail_reconf; 2939 } else { 2940 ASSERT(smode_num != 0); 2941 if (smode_num != vswp->smode_num) { 2942 D2(vswp, "%s: number of modes changed from %d to %d", 2943 __func__, vswp->smode_num, smode_num); 2944 } 2945 2946 for (i = 0; i < smode_num; i++) { 2947 if (new_smode[i] != vswp->smode[i]) { 2948 D2(vswp, "%s: mode changed from %d to %d", 2949 __func__, vswp->smode[i], new_smode[i]); 2950 updated |= MD_smode; 2951 break; 2952 } 2953 } 2954 } 2955 2956 /* 2957 * Now make any changes which are needed... 2958 */ 2959 2960 if (updated & (MD_physname | MD_smode)) { 2961 /* 2962 * Disconnect all ports from the current card 2963 */ 2964 WRITE_ENTER(&plist->lockrw); 2965 for (port = plist->head; port != NULL; port = port->p_next) { 2966 /* Remove address if it was programmed into HW.
*/ 2967 mutex_enter(&vswp->hw_lock); 2968 if (vsw_unset_hw(vswp, port, VSW_VNETPORT)) { 2969 mutex_exit(&vswp->hw_lock); 2970 RW_EXIT(&plist->lockrw); 2971 goto fail_update; 2972 } 2973 mutex_exit(&vswp->hw_lock); 2974 } 2975 RW_EXIT(&plist->lockrw); 2976 2977 /* 2978 * Stop, detach the old device.. 2979 */ 2980 vsw_mac_detach(vswp); 2981 2982 /* 2983 * Update phys name. 2984 */ 2985 if (updated & MD_physname) { 2986 cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s", 2987 vswp->instance, vswp->physname, physname); 2988 (void) strncpy(vswp->physname, 2989 physname, strlen(physname) + 1); 2990 2991 if (strlen(vswp->physname) > 0) 2992 vswp->mdprops |= VSW_MD_PHYSNAME; 2993 } 2994 2995 /* 2996 * Update array with the new switch mode values. 2997 */ 2998 if (updated & MD_smode) { 2999 for (i = 0; i < smode_num; i++) 3000 vswp->smode[i] = new_smode[i]; 3001 3002 vswp->smode_num = smode_num; 3003 vswp->smode_idx = 0; 3004 } 3005 3006 /* 3007 * ..and attach, start the new device. 3008 */ 3009 if (vsw_setup_switching(vswp)) 3010 goto fail_update; 3011 3012 /* 3013 * Connect ports to new card. 3014 */ 3015 WRITE_ENTER(&plist->lockrw); 3016 for (port = plist->head; port != NULL; port = port->p_next) { 3017 mutex_enter(&vswp->hw_lock); 3018 if (vsw_set_hw(vswp, port, VSW_VNETPORT)) { 3019 mutex_exit(&vswp->hw_lock); 3020 RW_EXIT(&plist->lockrw); 3021 goto fail_update; 3022 } 3023 mutex_exit(&vswp->hw_lock); 3024 } 3025 RW_EXIT(&plist->lockrw); 3026 } 3027 3028 if (updated & MD_macaddr) { 3029 cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx", 3030 vswp->instance, macaddr); 3031 3032 WRITE_ENTER(&vswp->if_lockrw); 3033 for (i = ETHERADDRL - 1; i >= 0; i--) { 3034 vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; 3035 macaddr >>= 8; 3036 } 3037 RW_EXIT(&vswp->if_lockrw); 3038 3039 /* 3040 * Remove old address from HW (if programmed) and set 3041 * new address. 3042 */ 3043 mutex_enter(&vswp->hw_lock); 3044 (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); 3045 (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV); 3046 mutex_exit(&vswp->hw_lock); 3047 3048 /* 3049 * Notify the MAC layer of the changed address. 3050 */ 3051 mac_unicst_update(vswp->if_mh, (uint8_t *)&vswp->if_addr); 3052 } 3053 3054 return; 3055 3056 fail_reconf: 3057 cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance); 3058 return; 3059 3060 fail_update: 3061 cmn_err(CE_WARN, "!vsw%d: update of configuration failed", 3062 vswp->instance); 3063 } 3064 3065 /* 3066 * Add a new port to the system. 3067 * 3068 * Returns 0 on success, 1 on failure. 3069 */ 3070 int 3071 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) 3072 { 3073 uint64_t ldc_id; 3074 uint8_t *addrp; 3075 int i, addrsz; 3076 int num_nodes = 0, nchan = 0; 3077 int listsz = 0; 3078 mde_cookie_t *listp = NULL; 3079 struct ether_addr ea; 3080 uint64_t macaddr; 3081 uint64_t inst = 0; 3082 vsw_port_t *port; 3083 3084 if (md_get_prop_val(mdp, *node, id_propname, &inst)) { 3085 DWARN(vswp, "%s: prop(%s) not found", __func__, 3086 id_propname); 3087 return (1); 3088 } 3089 3090 /* 3091 * Find the channel endpoint node(s) (which should be under this 3092 * port node) which contain the channel id(s). 
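 * The scan below follows 'fwd' arcs from the port node to its channel-endpoint node(s) and reads the 'id' property from the first one found.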
*/ 3094 if ((num_nodes = md_node_count(mdp)) <= 0) { 3095 DERR(vswp, "%s: invalid number of nodes found (%d)", 3096 __func__, num_nodes); 3097 return (1); 3098 } 3099 3100 D2(vswp, "%s: %d nodes found", __func__, num_nodes); 3101 3102 /* allocate enough space for node list */ 3103 listsz = num_nodes * sizeof (mde_cookie_t); 3104 listp = kmem_zalloc(listsz, KM_SLEEP); 3105 3106 nchan = md_scan_dag(mdp, *node, md_find_name(mdp, chan_propname), 3107 md_find_name(mdp, "fwd"), listp); 3108 3109 if (nchan <= 0) { 3110 DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname); 3111 kmem_free(listp, listsz); 3112 return (1); 3113 } 3114 3115 D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname); 3116 3117 /* use property from first node found */ 3118 if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) { 3119 DWARN(vswp, "%s: prop(%s) not found\n", __func__, 3120 id_propname); 3121 kmem_free(listp, listsz); 3122 return (1); 3123 } 3124 3125 /* don't need list any more */ 3126 kmem_free(listp, listsz); 3127 3128 D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id); 3129 3130 /* read mac-address property */ 3131 if (md_get_prop_data(mdp, *node, remaddr_propname, 3132 &addrp, &addrsz)) { 3133 DWARN(vswp, "%s: prop(%s) not found", 3134 __func__, remaddr_propname); 3135 return (1); 3136 } 3137 3138 if (addrsz < ETHERADDRL) { 3139 DWARN(vswp, "%s: invalid address size", __func__); 3140 return (1); 3141 } 3142 3143 macaddr = *((uint64_t *)addrp); 3144 D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr); 3145 3146 for (i = ETHERADDRL - 1; i >= 0; i--) { 3147 ea.ether_addr_octet[i] = macaddr & 0xFF; 3148 macaddr >>= 8; 3149 } 3150 3151 if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) { 3152 DERR(vswp, "%s: failed to attach port", __func__); 3153 return (1); 3154 } 3155 3156 port = vsw_lookup_port(vswp, (int)inst); 3157 3158 /* just successfully created the port, so it should exist */ 3159 ASSERT(port != NULL); 3160 3161 return (0); 3162 } 3163 3164 /* 3165 * Attach the specified port. 3166 * 3167 * Returns 0 on success, 1 on failure. 3168 */ 3169 static int 3170 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids, 3171 struct ether_addr *macaddr) 3172 { 3173 vsw_port_list_t *plist = &vswp->plist; 3174 vsw_port_t *port, **prev_port; 3175 int i; 3176 3177 D1(vswp, "%s: enter : port %d", __func__, p_instance); 3178 3179 /* port already exists?
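 * If so, fail the attach rather than create a duplicate.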
*/ 3180 READ_ENTER(&plist->lockrw); 3181 for (port = plist->head; port != NULL; port = port->p_next) { 3182 if (port->p_instance == p_instance) { 3183 DWARN(vswp, "%s: port instance %d already attached", 3184 __func__, p_instance); 3185 RW_EXIT(&plist->lockrw); 3186 return (1); 3187 } 3188 } 3189 RW_EXIT(&plist->lockrw); 3190 3191 port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); 3192 port->p_vswp = vswp; 3193 port->p_instance = p_instance; 3194 port->p_ldclist.num_ldcs = 0; 3195 port->p_ldclist.head = NULL; 3196 port->addr_set = VSW_ADDR_UNSET; 3197 3198 rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); 3199 3200 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); 3201 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); 3202 3203 mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); 3204 cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); 3205 3206 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); 3207 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); 3208 port->state = VSW_PORT_INIT; 3209 3210 if (nids > VSW_PORT_MAX_LDCS) { 3211 D2(vswp, "%s: using first of %d ldc ids", __func__, nids); 3212 nids = VSW_PORT_MAX_LDCS; 3213 } 3214 3215 D2(vswp, "%s: %d nids", __func__, nids); 3216 for (i = 0; i < nids; i++) { 3217 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); 3218 if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { 3219 DERR(vswp, "%s: ldc_attach failed", __func__); 3220 3221 rw_destroy(&port->p_ldclist.lockrw); 3222 3223 cv_destroy(&port->ref_cv); 3224 mutex_destroy(&port->ref_lock); 3225 3226 cv_destroy(&port->state_cv); 3227 mutex_destroy(&port->state_lock); 3228 3229 mutex_destroy(&port->tx_lock); 3230 mutex_destroy(&port->mca_lock); 3231 kmem_free(port, sizeof (vsw_port_t)); 3232 return (1); 3233 } 3234 } 3235 3236 ether_copy(macaddr, &port->p_macaddr); 3237 3238 WRITE_ENTER(&plist->lockrw); 3239 3240 /* create the fdb entry for this port/mac address */ 3241 (void) vsw_add_fdb(vswp, port); 3242 3243 mutex_enter(&vswp->hw_lock); 3244 (void) vsw_set_hw(vswp, port, VSW_VNETPORT); 3245 mutex_exit(&vswp->hw_lock); 3246 3247 /* link it into the list of ports for this vsw instance */ 3248 prev_port = (vsw_port_t **)(&plist->head); 3249 port->p_next = *prev_port; 3250 *prev_port = port; 3251 plist->num_ports++; 3252 RW_EXIT(&plist->lockrw); 3253 3254 /* 3255 * Initialise the port and any ldcs under it. 3256 */ 3257 (void) vsw_init_ldcs(port); 3258 3259 D1(vswp, "%s: exit", __func__); 3260 return (0); 3261 } 3262 3263 /* 3264 * Detach the specified port. 3265 * 3266 * Returns 0 on success, 1 on failure. 3267 */ 3268 static int 3269 vsw_port_detach(vsw_t *vswp, int p_instance) 3270 { 3271 vsw_port_t *port = NULL; 3272 vsw_port_list_t *plist = &vswp->plist; 3273 3274 D1(vswp, "%s: enter: port id %d", __func__, p_instance); 3275 3276 WRITE_ENTER(&plist->lockrw); 3277 3278 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { 3279 RW_EXIT(&plist->lockrw); 3280 return (1); 3281 } 3282 3283 if (vsw_plist_del_node(vswp, port)) { 3284 RW_EXIT(&plist->lockrw); 3285 return (1); 3286 } 3287 3288 /* Remove the fdb entry for this port/mac address */ 3289 (void) vsw_del_fdb(vswp, port); 3290 3291 /* Remove any multicast addresses.. */ 3292 vsw_del_mcst_port(port); 3293 3294 /* 3295 * No longer need to hold writer lock on port list now 3296 * that we have unlinked the target port from the list. 3297 */ 3298 RW_EXIT(&plist->lockrw); 3299 3300 /* Remove address if it was programmed into HW.
*/ 3301 mutex_enter(&vswp->hw_lock); 3302 (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); 3303 if (vswp->recfg_reqd) 3304 vsw_reconfig_hw(vswp); 3305 mutex_exit(&vswp->hw_lock); 3306 3307 if (vsw_port_delete(port)) { 3308 return (1); 3309 } 3310 3311 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance); 3312 return (0); 3313 } 3314 3315 /* 3316 * Detach all active ports. 3317 * 3318 * Returns 0 on success, 1 on failure. 3319 */ 3320 static int 3321 vsw_detach_ports(vsw_t *vswp) 3322 { 3323 vsw_port_list_t *plist = &vswp->plist; 3324 vsw_port_t *port = NULL; 3325 3326 D1(vswp, "%s: enter", __func__); 3327 3328 WRITE_ENTER(&plist->lockrw); 3329 3330 while ((port = plist->head) != NULL) { 3331 if (vsw_plist_del_node(vswp, port)) { 3332 DERR(vswp, "%s: Error deleting port %d" 3333 " from port list", __func__, port->p_instance); 3334 RW_EXIT(&plist->lockrw); 3335 return (1); 3336 } 3337 3338 /* Remove address if it was programmed into HW. */ 3339 mutex_enter(&vswp->hw_lock); 3340 (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); 3341 mutex_exit(&vswp->hw_lock); 3342 3343 /* Remove the fdb entry for this port/mac address */ 3344 (void) vsw_del_fdb(vswp, port); 3345 3346 /* Remove any multicast addresses.. */ 3347 vsw_del_mcst_port(port); 3348 3349 /* 3350 * No longer need to hold the lock on the port list 3351 * now that we have unlinked the target port from the 3352 * list. 3353 */ 3354 RW_EXIT(&plist->lockrw); 3355 if (vsw_port_delete(port)) { 3356 DERR(vswp, "%s: Error deleting port %d", 3357 __func__, port->p_instance); 3358 return (1); 3359 } 3360 WRITE_ENTER(&plist->lockrw); 3361 } 3362 RW_EXIT(&plist->lockrw); 3363 3364 D1(vswp, "%s: exit", __func__); 3365 3366 return (0); 3367 } 3368 3369 /* 3370 * Delete the specified port. 3371 * 3372 * Returns 0 on success, 1 on failure. 3373 */ 3374 static int 3375 vsw_port_delete(vsw_port_t *port) 3376 { 3377 vsw_ldc_list_t *ldcl; 3378 vsw_t *vswp = port->p_vswp; 3379 3380 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance); 3381 3382 (void) vsw_uninit_ldcs(port); 3383 3384 /* 3385 * Wait for any pending ctrl msg tasks which reference this 3386 * port to finish. 3387 */ 3388 if (vsw_drain_port_taskq(port)) 3389 return (1); 3390 3391 /* 3392 * Wait for port reference count to hit zero. 3393 */ 3394 mutex_enter(&port->ref_lock); 3395 while (port->ref_cnt != 0) 3396 cv_wait(&port->ref_cv, &port->ref_lock); 3397 mutex_exit(&port->ref_lock); 3398 3399 /* 3400 * Wait for any active callbacks to finish 3401 */ 3402 if (vsw_drain_ldcs(port)) 3403 return (1); 3404 3405 ldcl = &port->p_ldclist; 3406 WRITE_ENTER(&ldcl->lockrw); 3407 while (ldcl->num_ldcs > 0) { 3408 if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) { 3409 cmn_err(CE_WARN, "!vsw%d: unable to detach ldc %ld", 3410 vswp->instance, ldcl->head->ldc_id); 3411 RW_EXIT(&ldcl->lockrw); 3412 return (1); 3413 } 3414 } 3415 RW_EXIT(&ldcl->lockrw); 3416 3417 rw_destroy(&port->p_ldclist.lockrw); 3418 3419 mutex_destroy(&port->mca_lock); 3420 mutex_destroy(&port->tx_lock); 3421 cv_destroy(&port->ref_cv); 3422 mutex_destroy(&port->ref_lock); 3423 3424 cv_destroy(&port->state_cv); 3425 mutex_destroy(&port->state_lock); 3426 3427 kmem_free(port, sizeof (vsw_port_t)); 3428 3429 D1(vswp, "%s: exit", __func__); 3430 3431 return (0); 3432 } 3433 3434 /* 3435 * Attach a logical domain channel (ldc) under a specified port. 3436 * 3437 * Returns 0 on success, 1 on failure.
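 * On failure, any state already set up is unwound using the 'progress' bitmask kept in the function below.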
3438 */ 3439 static int 3440 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) 3441 { 3442 vsw_t *vswp = port->p_vswp; 3443 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3444 vsw_ldc_t *ldcp = NULL; 3445 ldc_attr_t attr; 3446 ldc_status_t istatus; 3447 int status = DDI_FAILURE; 3448 int rv; 3449 enum { PROG_init = 0x0, PROG_mblks = 0x1, 3450 PROG_callback = 0x2} 3451 progress; 3452 3453 progress = PROG_init; 3454 3455 D1(vswp, "%s: enter", __func__); 3456 3457 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); 3458 if (ldcp == NULL) { 3459 DERR(vswp, "%s: kmem_zalloc failed", __func__); 3460 return (1); 3461 } 3462 ldcp->ldc_id = ldc_id; 3463 3464 /* allocate pool of receive mblks */ 3465 rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); 3466 if (rv) { 3467 DWARN(vswp, "%s: unable to create free mblk pool for" 3468 " channel %ld (rv %d)", __func__, ldc_id, rv); 3469 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3470 return (1); 3471 } 3472 3473 progress |= PROG_mblks; 3474 3475 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); 3476 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); 3477 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); 3478 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); 3479 rw_init(&ldcp->lane_in.dlistrw, NULL, RW_DRIVER, NULL); 3480 rw_init(&ldcp->lane_out.dlistrw, NULL, RW_DRIVER, NULL); 3481 3482 /* required for handshake with peer */ 3483 ldcp->local_session = (uint64_t)ddi_get_lbolt(); 3484 ldcp->peer_session = 0; 3485 ldcp->session_status = 0; 3486 3487 mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); 3488 ldcp->hss_id = 1; /* Initial handshake session id */ 3489 3490 /* only set for outbound lane, inbound set by peer */ 3491 mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); 3492 mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); 3493 vsw_set_lane_attr(vswp, &ldcp->lane_out); 3494 3495 attr.devclass = LDC_DEV_NT_SVC; 3496 attr.instance = ddi_get_instance(vswp->dip); 3497 attr.mode = LDC_MODE_UNRELIABLE; 3498 attr.mtu = VSW_LDC_MTU; 3499 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); 3500 if (status != 0) { 3501 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", 3502 __func__, ldc_id, status); 3503 goto ldc_attach_fail; 3504 } 3505 3506 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); 3507 if (status != 0) { 3508 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", 3509 __func__, ldc_id, status); 3510 (void) ldc_fini(ldcp->ldc_handle); 3511 goto ldc_attach_fail; 3512 } 3513 3514 progress |= PROG_callback; 3515 3516 mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL); 3517 3518 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 3519 DERR(vswp, "%s: ldc_status failed", __func__); 3520 mutex_destroy(&ldcp->status_lock); 3521 goto ldc_attach_fail; 3522 } 3523 3524 ldcp->ldc_status = istatus; 3525 ldcp->ldc_port = port; 3526 ldcp->ldc_vswp = vswp; 3527 3528 /* link it into the list of channels for this port */ 3529 WRITE_ENTER(&ldcl->lockrw); 3530 ldcp->ldc_next = ldcl->head; 3531 ldcl->head = ldcp; 3532 ldcl->num_ldcs++; 3533 RW_EXIT(&ldcl->lockrw); 3534 3535 D1(vswp, "%s: exit", __func__); 3536 return (0); 3537 3538 ldc_attach_fail: 3539 mutex_destroy(&ldcp->ldc_txlock); 3540 mutex_destroy(&ldcp->ldc_cblock); 3541 3542 cv_destroy(&ldcp->drain_cv); 3543 3544 rw_destroy(&ldcp->lane_in.dlistrw); 3545 rw_destroy(&ldcp->lane_out.dlistrw); 3546 3547 if (progress & PROG_callback) { 3548 (void) ldc_unreg_callback(ldcp->ldc_handle); 3549 } 3550 3551 if ((progress & PROG_mblks) && 
(ldcp->rxh != NULL)) { 3552 if (vio_destroy_mblks(ldcp->rxh) != 0) { 3553 /* 3554 * Something odd has happened, as the destroy 3555 * will only fail if some mblks have been allocated 3556 * from the pool already (which shouldn't happen) 3557 * and have not been returned. 3558 * 3559 * Add the pool pointer to a list maintained in 3560 * the device instance. Another attempt will be made 3561 * to free the pool when the device itself detaches. 3562 */ 3563 cmn_err(CE_WARN, "!vsw%d: Creation of ldc channel %ld " 3564 "failed and cannot destroy associated mblk " 3565 "pool", vswp->instance, ldc_id); 3566 ldcp->rxh->nextp = vswp->rxh; 3567 vswp->rxh = ldcp->rxh; 3568 } 3569 } 3570 mutex_destroy(&ldcp->drain_cv_lock); 3571 mutex_destroy(&ldcp->hss_lock); 3572 3573 mutex_destroy(&ldcp->lane_in.seq_lock); 3574 mutex_destroy(&ldcp->lane_out.seq_lock); 3575 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3576 3577 return (1); 3578 } 3579 3580 /* 3581 * Detach a logical domain channel (ldc) belonging to a 3582 * particular port. 3583 * 3584 * Returns 0 on success, 1 on failure. 3585 */ 3586 static int 3587 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) 3588 { 3589 vsw_t *vswp = port->p_vswp; 3590 vsw_ldc_t *ldcp, *prev_ldcp; 3591 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3592 int rv; 3593 3594 prev_ldcp = ldcl->head; 3595 for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) { 3596 if (ldcp->ldc_id == ldc_id) { 3597 break; 3598 } 3599 } 3600 3601 /* specified ldc id not found */ 3602 if (ldcp == NULL) { 3603 DERR(vswp, "%s: ldcp = NULL", __func__); 3604 return (1); 3605 } 3606 3607 D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id); 3608 3609 /* 3610 * Before we can close the channel we must release any mapped 3611 * resources (e.g. drings). 3612 */ 3613 vsw_free_lane_resources(ldcp, INBOUND); 3614 vsw_free_lane_resources(ldcp, OUTBOUND); 3615 3616 /* 3617 * If the close fails we are in serious trouble, as we won't 3618 * be able to delete the parent port. 3619 */ 3620 if ((rv = ldc_close(ldcp->ldc_handle)) != 0) { 3621 DERR(vswp, "%s: error %d closing channel %lld", 3622 __func__, rv, ldcp->ldc_id); 3623 return (1); 3624 } 3625 3626 (void) ldc_fini(ldcp->ldc_handle); 3627 3628 ldcp->ldc_status = LDC_INIT; 3629 ldcp->ldc_handle = NULL; 3630 ldcp->ldc_vswp = NULL; 3631 3632 if (ldcp->rxh != NULL) { 3633 if (vio_destroy_mblks(ldcp->rxh)) { 3634 /* 3635 * Most likely some mblks are still in use and 3636 * have not been returned to the pool. Add the pool 3637 * to the list maintained in the device instance. 3638 * Another attempt will be made to destroy the pool 3639 * when the device detaches. 3640 */ 3641 ldcp->rxh->nextp = vswp->rxh; 3642 vswp->rxh = ldcp->rxh; 3643 } 3644 } 3645 3646 /* unlink it from the list */ 3647 prev_ldcp = ldcp->ldc_next; 3648 ldcl->num_ldcs--; 3649 3650 mutex_destroy(&ldcp->ldc_txlock); 3651 mutex_destroy(&ldcp->ldc_cblock); 3652 cv_destroy(&ldcp->drain_cv); 3653 mutex_destroy(&ldcp->drain_cv_lock); 3654 mutex_destroy(&ldcp->hss_lock); 3655 mutex_destroy(&ldcp->lane_in.seq_lock); 3656 mutex_destroy(&ldcp->lane_out.seq_lock); 3657 mutex_destroy(&ldcp->status_lock); 3658 rw_destroy(&ldcp->lane_in.dlistrw); 3659 rw_destroy(&ldcp->lane_out.dlistrw); 3660 3661 kmem_free(ldcp, sizeof (vsw_ldc_t)); 3662 3663 return (0); 3664 } 3665 3666 /* 3667 * Open and attempt to bring up the channel. Note that the channel 3668 * can only be brought up if the peer has also opened the channel. 3669 * 3670 * Returns 0 if we can open and bring up the channel, otherwise 3671 * returns 1.
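 * (A failure here is not treated as fatal by the caller; vsw_init_ldcs() ignores the return value and the connection is retried when the peer comes up.)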
3672 */ 3673 static int 3674 vsw_ldc_init(vsw_ldc_t *ldcp) 3675 { 3676 vsw_t *vswp = ldcp->ldc_vswp; 3677 ldc_status_t istatus = 0; 3678 int rv; 3679 3680 D1(vswp, "%s: enter", __func__); 3681 3682 LDC_ENTER_LOCK(ldcp); 3683 3684 /* don't start at 0 in case clients don't like that */ 3685 ldcp->next_ident = 1; 3686 3687 rv = ldc_open(ldcp->ldc_handle); 3688 if (rv != 0) { 3689 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", 3690 __func__, ldcp->ldc_id, rv); 3691 LDC_EXIT_LOCK(ldcp); 3692 return (1); 3693 } 3694 3695 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 3696 DERR(vswp, "%s: unable to get status", __func__); 3697 LDC_EXIT_LOCK(ldcp); 3698 return (1); 3699 3700 } else if (istatus != LDC_OPEN && istatus != LDC_READY) { 3701 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", 3702 __func__, ldcp->ldc_id, istatus); 3703 LDC_EXIT_LOCK(ldcp); 3704 return (1); 3705 } 3706 3707 mutex_enter(&ldcp->status_lock); 3708 ldcp->ldc_status = istatus; 3709 mutex_exit(&ldcp->status_lock); 3710 3711 rv = ldc_up(ldcp->ldc_handle); 3712 if (rv != 0) { 3713 /* 3714 * Not a fatal error for ldc_up() to fail, as peer 3715 * end point may simply not be ready yet. 3716 */ 3717 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 3718 ldcp->ldc_id, rv); 3719 LDC_EXIT_LOCK(ldcp); 3720 return (1); 3721 } 3722 3723 /* 3724 * ldc_up() call is non-blocking so we need to explicitly 3725 * check the channel status to see if in fact the channel 3726 * is UP. 3727 */ 3728 mutex_enter(&ldcp->status_lock); 3729 if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) { 3730 DERR(vswp, "%s: unable to get status", __func__); 3731 mutex_exit(&ldcp->status_lock); 3732 LDC_EXIT_LOCK(ldcp); 3733 return (1); 3734 3735 } 3736 3737 if (ldcp->ldc_status == LDC_UP) { 3738 D2(vswp, "%s: channel %ld now UP (%ld)", __func__, 3739 ldcp->ldc_id, istatus); 3740 mutex_exit(&ldcp->status_lock); 3741 LDC_EXIT_LOCK(ldcp); 3742 3743 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 3744 return (0); 3745 } 3746 3747 mutex_exit(&ldcp->status_lock); 3748 LDC_EXIT_LOCK(ldcp); 3749 3750 D1(vswp, "%s: exit", __func__); 3751 return (0); 3752 } 3753 3754 /* disable callbacks on the channel */ 3755 static int 3756 vsw_ldc_uninit(vsw_ldc_t *ldcp) 3757 { 3758 vsw_t *vswp = ldcp->ldc_vswp; 3759 int rv; 3760 3761 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); 3762 3763 LDC_ENTER_LOCK(ldcp); 3764 3765 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); 3766 if (rv != 0) { 3767 DERR(vswp, "vsw_ldc_uninit(%lld): error disabling " 3768 "interrupts (rv = %d)\n", ldcp->ldc_id, rv); 3769 LDC_EXIT_LOCK(ldcp); 3770 return (1); 3771 } 3772 3773 mutex_enter(&ldcp->status_lock); 3774 ldcp->ldc_status = LDC_INIT; 3775 mutex_exit(&ldcp->status_lock); 3776 3777 LDC_EXIT_LOCK(ldcp); 3778 3779 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); 3780 3781 return (0); 3782 } 3783 3784 static int 3785 vsw_init_ldcs(vsw_port_t *port) 3786 { 3787 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3788 vsw_ldc_t *ldcp; 3789 3790 READ_ENTER(&ldcl->lockrw); 3791 ldcp = ldcl->head; 3792 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3793 (void) vsw_ldc_init(ldcp); 3794 } 3795 RW_EXIT(&ldcl->lockrw); 3796 3797 return (0); 3798 } 3799 3800 static int 3801 vsw_uninit_ldcs(vsw_port_t *port) 3802 { 3803 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3804 vsw_ldc_t *ldcp; 3805 3806 D1(NULL, "vsw_uninit_ldcs: enter\n"); 3807 3808 READ_ENTER(&ldcl->lockrw); 3809 ldcp = ldcl->head; 3810 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3811 (void) vsw_ldc_uninit(ldcp); 3812 } 3813
RW_EXIT(&ldcl->lockrw); 3814 3815 D1(NULL, "vsw_uninit_ldcs: exit\n"); 3816 3817 return (0); 3818 } 3819 3820 /* 3821 * Wait until the callback(s) associated with the ldcs under the specified 3822 * port have completed. 3823 * 3824 * Prior to this function being invoked each channel under this port 3825 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3826 * 3827 * A short explanation of what we are doing below.. 3828 * 3829 * The simplest approach would be to have a reference counter in 3830 * the ldc structure which is incremented/decremented by the callbacks as 3831 * they use the channel. The drain function could then simply disable any 3832 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately 3833 * there is a tiny window here - before the callback is able to get the lock 3834 * on the channel it is interrupted and this function gets to execute. It 3835 * sees that the ref count is zero and believes it's free to delete the 3836 * associated data structures. 3837 * 3838 * We get around this by taking advantage of the fact that before the ldc 3839 * framework invokes a callback it sets a flag to indicate that there is a 3840 * callback active (or about to become active). If we attempt to 3841 * unregister a callback while this active flag is set then the unregister 3842 * will fail with EWOULDBLOCK. 3843 * 3844 * If the unregister fails we do a cv_timedwait. We will either be signaled 3845 * by the callback as it is exiting (note we have to wait a short period to 3846 * allow the callback to return fully to the ldc framework and it to clear 3847 * the active flag), or by the timer expiring. In either case we again attempt 3848 * the unregister. We repeat this until we can successfully unregister the 3849 * callback. 3850 * 3851 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch 3852 * the case where the callback has finished but the ldc framework has not yet 3853 * cleared the active flag. In this case we would never get a cv_signal. 3854 */ 3855 static int 3856 vsw_drain_ldcs(vsw_port_t *port) 3857 { 3858 vsw_ldc_list_t *ldcl = &port->p_ldclist; 3859 vsw_ldc_t *ldcp; 3860 vsw_t *vswp = port->p_vswp; 3861 3862 D1(vswp, "%s: enter", __func__); 3863 3864 READ_ENTER(&ldcl->lockrw); 3865 3866 ldcp = ldcl->head; 3867 3868 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 3869 /* 3870 * If we can unregister the channel callback then we 3871 * know that there is no callback either running or 3872 * scheduled to run for this channel so move on to next 3873 * channel in the list. 3874 */ 3875 mutex_enter(&ldcp->drain_cv_lock); 3876 3877 /* prompt active callbacks to quit */ 3878 ldcp->drain_state = VSW_LDC_DRAINING; 3879 3880 if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { 3881 D2(vswp, "%s: unreg callback for chan %ld", __func__, 3882 ldcp->ldc_id); 3883 mutex_exit(&ldcp->drain_cv_lock); 3884 continue; 3885 } else { 3886 /* 3887 * If we end up here we know that either 1) a callback 3888 * is currently executing, 2) is about to start (i.e. 3889 * the ldc framework has set the active flag but 3890 * has not actually invoked the callback yet, or 3) 3891 * has finished and has returned to the ldc framework 3892 * but the ldc framework has not yet cleared the 3893 * active bit. 3894 * 3895 * Wait for it to finish.
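 * The loop below retries the unregister roughly once a second (lbolt + hz) until it stops returning EWOULDBLOCK.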
3896 */ 3897 while (ldc_unreg_callback(ldcp->ldc_handle) 3898 == EWOULDBLOCK) 3899 (void) cv_timedwait(&ldcp->drain_cv, 3900 &ldcp->drain_cv_lock, lbolt + hz); 3901 3902 mutex_exit(&ldcp->drain_cv_lock); 3903 D2(vswp, "%s: unreg callback for chan %ld after " 3904 "timeout", __func__, ldcp->ldc_id); 3905 } 3906 } 3907 RW_EXIT(&ldcl->lockrw); 3908 3909 D1(vswp, "%s: exit", __func__); 3910 return (0); 3911 } 3912 3913 /* 3914 * Wait until all tasks which reference this port have completed. 3915 * 3916 * Prior to this function being invoked each channel under this port 3917 * should have been quiesced via ldc_set_cb_mode(DISABLE). 3918 */ 3919 static int 3920 vsw_drain_port_taskq(vsw_port_t *port) 3921 { 3922 vsw_t *vswp = port->p_vswp; 3923 3924 D1(vswp, "%s: enter", __func__); 3925 3926 /* 3927 * Mark the port as in the process of being detached, and 3928 * dispatch a marker task to the queue so we know when all 3929 * relevant tasks have completed. 3930 */ 3931 mutex_enter(&port->state_lock); 3932 port->state = VSW_PORT_DETACHING; 3933 3934 if ((vswp->taskq_p == NULL) || 3935 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, 3936 port, DDI_NOSLEEP) != DDI_SUCCESS)) { 3937 DERR(vswp, "%s: unable to dispatch marker task", 3938 __func__); 3939 mutex_exit(&port->state_lock); 3940 return (1); 3941 } 3942 3943 /* 3944 * Wait for the marker task to finish. 3945 */ 3946 while (port->state != VSW_PORT_DETACHABLE) 3947 cv_wait(&port->state_cv, &port->state_lock); 3948 3949 mutex_exit(&port->state_lock); 3950 3951 D1(vswp, "%s: exit", __func__); 3952 3953 return (0); 3954 } 3955 3956 static void 3957 vsw_marker_task(void *arg) 3958 { 3959 vsw_port_t *port = arg; 3960 vsw_t *vswp = port->p_vswp; 3961 3962 D1(vswp, "%s: enter", __func__); 3963 3964 mutex_enter(&port->state_lock); 3965 3966 /* 3967 * No further tasks should be dispatched which reference 3968 * this port so ok to mark it as safe to detach. 3969 */ 3970 port->state = VSW_PORT_DETACHABLE; 3971 3972 cv_signal(&port->state_cv); 3973 3974 mutex_exit(&port->state_lock); 3975 3976 D1(vswp, "%s: exit", __func__); 3977 } 3978 3979 static vsw_port_t * 3980 vsw_lookup_port(vsw_t *vswp, int p_instance) 3981 { 3982 vsw_port_list_t *plist = &vswp->plist; 3983 vsw_port_t *port; 3984 3985 for (port = plist->head; port != NULL; port = port->p_next) { 3986 if (port->p_instance == p_instance) { 3987 D2(vswp, "vsw_lookup_port: found p_instance\n"); 3988 return (port); 3989 } 3990 } 3991 3992 return (NULL); 3993 } 3994 3995 /* 3996 * Search for and remove the specified port from the port 3997 * list. Returns 0 if able to locate and remove port, otherwise 3998 * returns 1. 3999 */ 4000 static int 4001 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) 4002 { 4003 vsw_port_list_t *plist = &vswp->plist; 4004 vsw_port_t *curr_p, *prev_p; 4005 4006 if (plist->head == NULL) 4007 return (1); 4008 4009 curr_p = prev_p = plist->head; 4010 4011 while (curr_p != NULL) { 4012 if (curr_p == port) { 4013 if (prev_p == curr_p) { 4014 plist->head = curr_p->p_next; 4015 } else { 4016 prev_p->p_next = curr_p->p_next; 4017 } 4018 plist->num_ports--; 4019 break; 4020 } else { 4021 prev_p = curr_p; 4022 curr_p = curr_p->p_next; 4023 } 4024 } 4025 return (0); 4026 } 4027 4028 /* 4029 * Interrupt handler for ldc messages. 
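 * Runs in LDC callback context; on exit it signals drain_cv so that a waiting vsw_drain_ldcs() can make progress.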
4030 */ 4031 static uint_t 4032 vsw_ldc_cb(uint64_t event, caddr_t arg) 4033 { 4034 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 4035 vsw_t *vswp = ldcp->ldc_vswp; 4036 4037 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 4038 4039 mutex_enter(&ldcp->ldc_cblock); 4040 4041 mutex_enter(&ldcp->status_lock); 4042 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { 4043 mutex_exit(&ldcp->status_lock); 4044 mutex_exit(&ldcp->ldc_cblock); 4045 return (LDC_SUCCESS); 4046 } 4047 mutex_exit(&ldcp->status_lock); 4048 4049 if (event & LDC_EVT_UP) { 4050 /* 4051 * Channel has come up. 4052 */ 4053 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", 4054 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 4055 4056 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 4057 4058 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 4059 } 4060 4061 if (event & LDC_EVT_READ) { 4062 /* 4063 * Data available for reading. 4064 */ 4065 D2(vswp, "%s: id(%ld) event(%llx) data READ", 4066 __func__, ldcp->ldc_id, event); 4067 4068 vsw_process_pkt(ldcp); 4069 4070 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 4071 4072 goto vsw_cb_exit; 4073 } 4074 4075 if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) { 4076 D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)", 4077 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 4078 4079 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 4080 } 4081 4082 /* 4083 * Catch either LDC_EVT_WRITE which we don't support or any 4084 * unknown event. 4085 */ 4086 if (event & 4087 ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) { 4088 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", 4089 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 4090 } 4091 4092 vsw_cb_exit: 4093 mutex_exit(&ldcp->ldc_cblock); 4094 4095 /* 4096 * Let the drain function know we are finishing if it 4097 * is waiting. 4098 */ 4099 mutex_enter(&ldcp->drain_cv_lock); 4100 if (ldcp->drain_state == VSW_LDC_DRAINING) 4101 cv_signal(&ldcp->drain_cv); 4102 mutex_exit(&ldcp->drain_cv_lock); 4103 4104 return (LDC_SUCCESS); 4105 } 4106 4107 /* 4108 * Reinitialise data structures associated with the channel. 4109 */ 4110 static void 4111 vsw_ldc_reinit(vsw_ldc_t *ldcp) 4112 { 4113 vsw_t *vswp = ldcp->ldc_vswp; 4114 vsw_port_t *port; 4115 vsw_ldc_list_t *ldcl; 4116 4117 D1(vswp, "%s: enter", __func__); 4118 4119 port = ldcp->ldc_port; 4120 ldcl = &port->p_ldclist; 4121 4122 READ_ENTER(&ldcl->lockrw); 4123 4124 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, 4125 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 4126 4127 vsw_free_lane_resources(ldcp, INBOUND); 4128 vsw_free_lane_resources(ldcp, OUTBOUND); 4129 RW_EXIT(&ldcl->lockrw); 4130 4131 ldcp->lane_in.lstate = 0; 4132 ldcp->lane_out.lstate = 0; 4133 4134 /* 4135 * Remove parent port from any multicast groups 4136 * it may have registered with. Client must resend 4137 * multicast add command after handshake completes. 4138 */ 4139 (void) vsw_del_fdb(vswp, port); 4140 4141 vsw_del_mcst_port(port); 4142 4143 ldcp->peer_session = 0; 4144 ldcp->session_status = 0; 4145 ldcp->hcnt = 0; 4146 ldcp->hphase = VSW_MILESTONE0; 4147 4148 D1(vswp, "%s: exit", __func__); 4149 } 4150 4151 /* 4152 * Process a connection event. 4153 * 4154 * Note - care must be taken to ensure that this function is 4155 * not called with the dlistrw lock held.
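 * Only one reset/restart can be in flight at a time: the event is latched by atomically setting ldcp->reset_active with ldstub() below.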
*/ 4157 static void 4158 vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt) 4159 { 4160 vsw_t *vswp = ldcp->ldc_vswp; 4161 vsw_conn_evt_t *conn = NULL; 4162 4163 D1(vswp, "%s: enter", __func__); 4164 4165 /* 4166 * Check if either a reset or restart event is pending 4167 * or in progress. If so just return. 4168 * 4169 * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT 4170 * being received by the callback handler, or an ECONNRESET error 4171 * code being returned from a ldc_read() or ldc_write() call. 4172 * 4173 * A VSW_CONN_RESTART event occurs when some error checking code 4174 * decides that there is a problem with data from the channel, 4175 * and that the handshake should be restarted. 4176 */ 4177 if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) && 4178 (ldstub((uint8_t *)&ldcp->reset_active))) 4179 return; 4180 4181 /* 4182 * If it is an LDC_UP event we first check the recorded 4183 * state of the channel. If this is UP then we know that 4184 * the channel moving to the UP state has already been dealt 4185 * with and don't need to dispatch a new task. 4186 * 4187 * The reason for this check is that when we do a ldc_up(), 4188 * depending on the state of the peer, we may or may not get 4189 * a LDC_UP event. As we can't depend on getting a LDC_UP evt 4190 * every time we do ldc_up() we explicitly check the channel 4191 * status to see whether it has come up (ldc_up() is asynch and will 4192 * complete at some undefined time), and take the appropriate 4193 * action. 4194 * 4195 * The flip side of this is that we may get a LDC_UP event 4196 * when we have already seen that the channel is up and have 4197 * dealt with that. 4198 */ 4199 mutex_enter(&ldcp->status_lock); 4200 if (evt == VSW_CONN_UP) { 4201 if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) { 4202 mutex_exit(&ldcp->status_lock); 4203 return; 4204 } 4205 } 4206 mutex_exit(&ldcp->status_lock); 4207 4208 /* 4209 * The transaction group id allows us to identify and discard 4210 * any tasks which are still pending on the taskq and refer 4211 * to the handshake session we are about to restart or reset. 4212 * These stale messages no longer have any real meaning. 4213 */ 4214 mutex_enter(&ldcp->hss_lock); 4215 ldcp->hss_id++; 4216 mutex_exit(&ldcp->hss_lock); 4217 4218 ASSERT(vswp->taskq_p != NULL); 4219 4220 if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) { 4221 cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for" 4222 " connection event", vswp->instance); 4223 goto err_exit; 4224 } 4225 4226 conn->evt = evt; 4227 conn->ldcp = ldcp; 4228 4229 if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn, 4230 DDI_NOSLEEP) != DDI_SUCCESS) { 4231 cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task", 4232 vswp->instance); 4233 4234 kmem_free(conn, sizeof (vsw_conn_evt_t)); 4235 goto err_exit; 4236 } 4237 4238 D1(vswp, "%s: exit", __func__); 4239 return; 4240 4241 err_exit: 4242 /* 4243 * We have most likely failed due to a memory shortage. Clear the 4244 * flag so that future requests will at least be attempted and will 4245 * hopefully succeed. 4246 */ 4247 if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) 4248 ldcp->reset_active = 0; 4249 } 4250 4251 /* 4252 * Deal with events relating to a connection. Invoked from a taskq.
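 * The sequence is: optionally ldc_down() to flush the channel (on a restart), vsw_ldc_reinit() to reset the data structures, ldc_up(), and then a fresh handshake if the channel is now UP.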
4253 */ 4254 static void 4255 vsw_conn_task(void *arg) 4256 { 4257 vsw_conn_evt_t *conn = (vsw_conn_evt_t *)arg; 4258 vsw_ldc_t *ldcp = NULL; 4259 vsw_t *vswp = NULL; 4260 uint16_t evt; 4261 ldc_status_t curr_status; 4262 4263 ldcp = conn->ldcp; 4264 evt = conn->evt; 4265 vswp = ldcp->ldc_vswp; 4266 4267 D1(vswp, "%s: enter", __func__); 4268 4269 /* can safely free now that we have copied out the data */ 4270 kmem_free(conn, sizeof (vsw_conn_evt_t)); 4271 4272 mutex_enter(&ldcp->status_lock); 4273 if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { 4274 cmn_err(CE_WARN, "!vsw%d: Unable to read status of " 4275 "channel %ld", vswp->instance, ldcp->ldc_id); 4276 mutex_exit(&ldcp->status_lock); 4277 return; 4278 } 4279 4280 /* 4281 * If we wish to restart the handshake on this channel, then if 4282 * the channel is UP we bring it DOWN to flush the underlying 4283 * ldc queue. 4284 */ 4285 if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP)) 4286 (void) ldc_down(ldcp->ldc_handle); 4287 4288 /* 4289 * re-init all the associated data structures. 4290 */ 4291 vsw_ldc_reinit(ldcp); 4292 4293 /* 4294 * Bring the channel back up (note that it does no harm to 4295 * do this even if the channel is already UP; it just 4296 * becomes effectively a no-op). 4297 */ 4298 (void) ldc_up(ldcp->ldc_handle); 4299 4300 /* 4301 * Check if channel is now UP. This will only happen if 4302 * peer has also done an ldc_up(). 4303 */ 4304 if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) { 4305 cmn_err(CE_WARN, "!vsw%d: Unable to read status of " 4306 "channel %ld", vswp->instance, ldcp->ldc_id); 4307 mutex_exit(&ldcp->status_lock); 4308 return; 4309 } 4310 4311 ldcp->ldc_status = curr_status; 4312 4313 /* channel UP so restart handshake by sending version info */ 4314 if (curr_status == LDC_UP) { 4315 if (ldcp->hcnt++ > vsw_num_handshakes) { 4316 cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted" 4317 " handshake attempts (%d) on channel %ld", 4318 vswp->instance, ldcp->hcnt, ldcp->ldc_id); 4319 mutex_exit(&ldcp->status_lock); 4320 return; 4321 } 4322 4323 if (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp, 4324 DDI_NOSLEEP) != DDI_SUCCESS) { 4325 cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task", 4326 vswp->instance); 4327 4328 /* 4329 * Don't count as valid restart attempt if couldn't 4330 * send version msg. 4331 */ 4332 if (ldcp->hcnt > 0) 4333 ldcp->hcnt--; 4334 } 4335 } 4336 4337 /* 4338 * Mark that the process is complete by clearing the flag. 4339 * 4340 * Note that it is possible the taskq dispatch above has failed, 4341 * most likely due to memory shortage. We still clear the flag so 4342 * that future attempts will at least be made and will hopefully 4343 * succeed. 4344 */ 4345 if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) 4346 ldcp->reset_active = 0; 4347 4348 mutex_exit(&ldcp->status_lock); 4349 4350 D1(vswp, "%s: exit", __func__); 4351 } 4352 4353 /* 4354 * Returns 0 if it was legal for the event signified by the flag to 4355 * have occurred when it did. Otherwise returns 1.
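 *
 * For example, in a normal version exchange: sending a VER_INFO sets
 * VSW_VER_INFO_SENT in the outbound lane state; when the peer's
 * VER_ACK arrives this routine checks that the bit is still set and
 * then consumes (clears) it, so a duplicate VER_ACK would be treated
 * as spurious and restart the handshake via vsw_process_conn_evt().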
4356 */ 4357 int 4358 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) 4359 { 4360 vsw_t *vswp = ldcp->ldc_vswp; 4361 uint64_t state; 4362 uint64_t phase; 4363 4364 if (dir == INBOUND) 4365 state = ldcp->lane_in.lstate; 4366 else 4367 state = ldcp->lane_out.lstate; 4368 4369 phase = ldcp->hphase; 4370 4371 switch (flag) { 4372 case VSW_VER_INFO_RECV: 4373 if (phase > VSW_MILESTONE0) { 4374 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" 4375 " when in state %d\n", ldcp->ldc_id, phase); 4376 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4377 return (1); 4378 } 4379 break; 4380 4381 case VSW_VER_ACK_RECV: 4382 case VSW_VER_NACK_RECV: 4383 if (!(state & VSW_VER_INFO_SENT)) { 4384 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or " 4385 "VER_NACK when in state %d\n", ldcp->ldc_id, phase); 4386 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4387 return (1); 4388 } else 4389 state &= ~VSW_VER_INFO_SENT; 4390 break; 4391 4392 case VSW_ATTR_INFO_RECV: 4393 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { 4394 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" 4395 " when in state %d\n", ldcp->ldc_id, phase); 4396 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4397 return (1); 4398 } 4399 break; 4400 4401 case VSW_ATTR_ACK_RECV: 4402 case VSW_ATTR_NACK_RECV: 4403 if (!(state & VSW_ATTR_INFO_SENT)) { 4404 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" 4405 " or ATTR_NACK when in state %d\n", 4406 ldcp->ldc_id, phase); 4407 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4408 return (1); 4409 } else 4410 state &= ~VSW_ATTR_INFO_SENT; 4411 break; 4412 4413 case VSW_DRING_INFO_RECV: 4414 if (phase < VSW_MILESTONE1) { 4415 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" 4416 " when in state %d\n", ldcp->ldc_id, phase); 4417 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4418 return (1); 4419 } 4420 break; 4421 4422 case VSW_DRING_ACK_RECV: 4423 case VSW_DRING_NACK_RECV: 4424 if (!(state & VSW_DRING_INFO_SENT)) { 4425 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" 4426 " or DRING_NACK when in state %d\n", 4427 ldcp->ldc_id, phase); 4428 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4429 return (1); 4430 } else 4431 state &= ~VSW_DRING_INFO_SENT; 4432 break; 4433 4434 case VSW_RDX_INFO_RECV: 4435 if (phase < VSW_MILESTONE3) { 4436 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" 4437 " when in state %d\n", ldcp->ldc_id, phase); 4438 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4439 return (1); 4440 } 4441 break; 4442 4443 case VSW_RDX_ACK_RECV: 4444 case VSW_RDX_NACK_RECV: 4445 if (!(state & VSW_RDX_INFO_SENT)) { 4446 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or " 4447 "RDX_NACK when in state %d\n", ldcp->ldc_id, phase); 4448 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4449 return (1); 4450 } else 4451 state &= ~VSW_RDX_INFO_SENT; 4452 break; 4453 4454 case VSW_MCST_INFO_RECV: 4455 if (phase < VSW_MILESTONE3) { 4456 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" 4457 " when in state %d\n", ldcp->ldc_id, phase); 4458 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4459 return (1); 4460 } 4461 break; 4462 4463 default: 4464 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", 4465 ldcp->ldc_id, flag); 4466 return (1); 4467 } 4468 4469 if (dir == INBOUND) 4470 ldcp->lane_in.lstate = state; 4471 else 4472 ldcp->lane_out.lstate = state; 4473 4474 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); 4475 4476 return (0); 4477 } 4478 4479 void 4480 vsw_next_milestone(vsw_ldc_t *ldcp) 4481 { 4482 vsw_t *vswp = ldcp->ldc_vswp; 4483 4484 D1(vswp, "%s (chan
%lld): enter (phase %ld)", __func__, 4485 ldcp->ldc_id, ldcp->hphase); 4486 4487 DUMP_FLAGS(ldcp->lane_in.lstate); 4488 DUMP_FLAGS(ldcp->lane_out.lstate); 4489 4490 switch (ldcp->hphase) { 4491 4492 case VSW_MILESTONE0: 4493 /* 4494 * If we haven't started to handshake with our peer, 4495 * start to do so now. 4496 */ 4497 if (ldcp->lane_out.lstate == 0) { 4498 D2(vswp, "%s: (chan %lld) starting handshake " 4499 "with peer", __func__, ldcp->ldc_id); 4500 vsw_process_conn_evt(ldcp, VSW_CONN_UP); 4501 } 4502 4503 /* 4504 * Only way to pass this milestone is to have successfully 4505 * negotiated version info. 4506 */ 4507 if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) && 4508 (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) { 4509 4510 D2(vswp, "%s: (chan %lld) leaving milestone 0", 4511 __func__, ldcp->ldc_id); 4512 4513 /* 4514 * Next milestone is passed when attribute 4515 * information has been successfully exchanged. 4516 */ 4517 ldcp->hphase = VSW_MILESTONE1; 4518 vsw_send_attr(ldcp); 4519 4520 } 4521 break; 4522 4523 case VSW_MILESTONE1: 4524 /* 4525 * Only way to pass this milestone is to have successfully 4526 * negotiated attribute information. 4527 */ 4528 if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) { 4529 4530 ldcp->hphase = VSW_MILESTONE2; 4531 4532 /* 4533 * If the peer device has said it wishes to 4534 * use descriptor rings then we send it our ring 4535 * info, otherwise we just set up a private ring 4536 * which uses an internal buffer. 4537 */ 4538 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) 4539 vsw_send_dring_info(ldcp); 4540 } 4541 break; 4542 4543 case VSW_MILESTONE2: 4544 /* 4545 * If peer has indicated in its attribute message that 4546 * it wishes to use descriptor rings then the only way 4547 * to pass this milestone is for us to have received 4548 * valid dring info. 4549 * 4550 * If peer is not using descriptor rings then just fall 4551 * through. 4552 */ 4553 if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) && 4554 (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))) 4555 break; 4556 4557 D2(vswp, "%s: (chan %lld) leaving milestone 2", 4558 __func__, ldcp->ldc_id); 4559 4560 ldcp->hphase = VSW_MILESTONE3; 4561 vsw_send_rdx(ldcp); 4562 break; 4563 4564 case VSW_MILESTONE3: 4565 /* 4566 * Pass this milestone when all parameters have been 4567 * successfully exchanged and RDX sent in both directions. 4568 * 4569 * Mark outbound lane as available to transmit data. 4570 */ 4571 if ((ldcp->lane_out.lstate & VSW_RDX_ACK_SENT) && 4572 (ldcp->lane_in.lstate & VSW_RDX_ACK_RECV)) { 4573 4574 D2(vswp, "%s: (chan %lld) leaving milestone 3", 4575 __func__, ldcp->ldc_id); 4576 D2(vswp, "%s: ** handshake complete (0x%llx : " 4577 "0x%llx) **", __func__, ldcp->lane_in.lstate, 4578 ldcp->lane_out.lstate); 4579 ldcp->lane_out.lstate |= VSW_LANE_ACTIVE; 4580 ldcp->hphase = VSW_MILESTONE4; 4581 ldcp->hcnt = 0; 4582 DISPLAY_STATE(); 4583 } else { 4584 D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)", 4585 __func__, ldcp->lane_in.lstate, 4586 ldcp->lane_out.lstate); 4587 } 4588 break; 4589 4590 case VSW_MILESTONE4: 4591 D2(vswp, "%s: (chan %lld) in milestone 4", __func__, 4592 ldcp->ldc_id); 4593 break; 4594 4595 default: 4596 DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__, 4597 ldcp->ldc_id, ldcp->hphase); 4598 } 4599 4600 D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id, 4601 ldcp->hphase); 4602 } 4603 4604 /* 4605 * Check if major version is supported. 4606 * 4607 * Returns 0 if it finds a supported major number, and if necessary 4608 * adjusts the minor field.
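 *
 * For example (assuming, purely for illustration, that vsw_versions[]
 * held the single entry {1, 0}): a peer proposing 1.5 would be
 * accepted with the minor adjusted down to 1.0 (returns 0), while a
 * peer proposing 0.9 would match nothing, have both fields zeroed,
 * and be NACKed (returns 1).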
4609 * 4610 * Returns 1 if it can't match the major number exactly. Sets major/minor 4611 * to the next lowest supported values, or to zero if no other values are possible. 4612 */ 4613 static int 4614 vsw_supported_version(vio_ver_msg_t *vp) 4615 { 4616 int i; 4617 4618 D1(NULL, "vsw_supported_version: enter"); 4619 4620 for (i = 0; i < VSW_NUM_VER; i++) { 4621 if (vsw_versions[i].ver_major == vp->ver_major) { 4622 /* 4623 * Matching or lower major version found. Update 4624 * minor number if necessary. 4625 */ 4626 if (vp->ver_minor > vsw_versions[i].ver_minor) { 4627 D2(NULL, "%s: adjusting minor value from %d " 4628 "to %d", __func__, vp->ver_minor, 4629 vsw_versions[i].ver_minor); 4630 vp->ver_minor = vsw_versions[i].ver_minor; 4631 } 4632 4633 return (0); 4634 } 4635 4636 if (vsw_versions[i].ver_major < vp->ver_major) { 4637 if (vp->ver_minor > vsw_versions[i].ver_minor) { 4638 D2(NULL, "%s: adjusting minor value from %d " 4639 "to %d", __func__, vp->ver_minor, 4640 vsw_versions[i].ver_minor); 4641 vp->ver_minor = vsw_versions[i].ver_minor; 4642 } 4643 return (1); 4644 } 4645 } 4646 4647 /* No match was possible, zero out fields */ 4648 vp->ver_major = 0; 4649 vp->ver_minor = 0; 4650 4651 D1(NULL, "vsw_supported_version: exit"); 4652 4653 return (1); 4654 } 4655 4656 /* 4657 * Main routine for processing messages received over LDC. 4658 */ 4659 static void 4660 vsw_process_pkt(void *arg) 4661 { 4662 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 4663 vsw_t *vswp = ldcp->ldc_vswp; 4664 size_t msglen; 4665 vio_msg_tag_t tag; 4666 def_msg_t dmsg; 4667 int rv = 0; 4668 4669 4670 D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 4671 4672 /* 4673 * If channel is up read messages until channel is empty. 4674 */ 4675 do { 4676 msglen = sizeof (dmsg); 4677 rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen); 4678 4679 if (rv != 0) { 4680 DERR(vswp, "%s: ldc_read err id(%lld) rv(%d) len(%d)\n", 4681 __func__, ldcp->ldc_id, rv, msglen); 4682 } 4683 4684 /* channel has been reset */ 4685 if (rv == ECONNRESET) { 4686 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 4687 break; 4688 } 4689 4690 if (msglen == 0) { 4691 D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__, 4692 ldcp->ldc_id); 4693 break; 4694 } 4695 4696 D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__, 4697 ldcp->ldc_id, msglen); 4698 4699 /* 4700 * Figure out what sort of packet we have gotten by 4701 * examining the msg tag, and then switch it appropriately. 4702 */ 4703 bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t)); 4704 4705 switch (tag.vio_msgtype) { 4706 case VIO_TYPE_CTRL: 4707 vsw_dispatch_ctrl_task(ldcp, &dmsg, tag); 4708 break; 4709 case VIO_TYPE_DATA: 4710 vsw_process_data_pkt(ldcp, &dmsg, tag); 4711 break; 4712 case VIO_TYPE_ERR: 4713 vsw_process_err_pkt(ldcp, &dmsg, tag); 4714 break; 4715 default: 4716 DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n", __func__, 4717 tag.vio_msgtype, ldcp->ldc_id); 4718 break; 4719 } 4720 } while (msglen); 4721 4722 D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id); 4723 } 4724 4725 /* 4726 * Dispatch a task to process a VIO control message. 4727 */ 4728 static void 4729 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag) 4730 { 4731 vsw_ctrl_task_t *ctaskp = NULL; 4732 vsw_port_t *port = ldcp->ldc_port; 4733 vsw_t *vswp = port->p_vswp; 4734 4735 D1(vswp, "%s: enter", __func__); 4736 4737 /* 4738 * We need to handle RDX ACK messages in-band as once they 4739 * are exchanged it is possible that we will get an 4740 * immediate (legitimate) data packet.
4741 */ 4742 if ((tag.vio_subtype_env == VIO_RDX) && 4743 (tag.vio_subtype == VIO_SUBTYPE_ACK)) { 4744 4745 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV)) 4746 return; 4747 4748 ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV; 4749 D2(vswp, "%s (%ld) handling RDX_ACK in place " 4750 "(ostate 0x%llx : hphase %d)", __func__, 4751 ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase); 4752 vsw_next_milestone(ldcp); 4753 return; 4754 } 4755 4756 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); 4757 4758 if (ctaskp == NULL) { 4759 DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__); 4760 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4761 return; 4762 } 4763 4764 ctaskp->ldcp = ldcp; 4765 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t)); 4766 mutex_enter(&ldcp->hss_lock); 4767 ctaskp->hss_id = ldcp->hss_id; 4768 mutex_exit(&ldcp->hss_lock); 4769 4770 /* 4771 * Dispatch task to processing taskq if port is not in 4772 * the process of being detached. 4773 */ 4774 mutex_enter(&port->state_lock); 4775 if (port->state == VSW_PORT_INIT) { 4776 if ((vswp->taskq_p == NULL) || 4777 (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt, 4778 ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) { 4779 DERR(vswp, "%s: unable to dispatch task to taskq", 4780 __func__); 4781 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4782 mutex_exit(&port->state_lock); 4783 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4784 return; 4785 } 4786 } else { 4787 DWARN(vswp, "%s: port %d detaching, not dispatching " 4788 "task", __func__, port->p_instance); 4789 /* not dispatched, so free the task struct here */ kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); } 4790 4791 mutex_exit(&port->state_lock); 4792 4793 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, 4794 ldcp->ldc_id); 4795 D1(vswp, "%s: exit", __func__); 4796 } 4797 4798 /* 4799 * Process a VIO ctrl message. Invoked from taskq. 4800 */ 4801 static void 4802 vsw_process_ctrl_pkt(void *arg) 4803 { 4804 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; 4805 vsw_ldc_t *ldcp = ctaskp->ldcp; 4806 vsw_t *vswp = ldcp->ldc_vswp; 4807 vio_msg_tag_t tag; 4808 uint16_t env; 4809 4810 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4811 4812 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); 4813 env = tag.vio_subtype_env; 4814 4815 /* stale pkt check */ 4816 mutex_enter(&ldcp->hss_lock); 4817 if (ctaskp->hss_id < ldcp->hss_id) { 4818 DWARN(vswp, "%s: discarding stale packet belonging to earlier" 4819 " (%ld) handshake session", __func__, ctaskp->hss_id); 4820 mutex_exit(&ldcp->hss_lock); 4821 /* free the discarded task struct */ kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); return; 4822 } 4823 mutex_exit(&ldcp->hss_lock); 4824 4825 /* session id check */ 4826 if (ldcp->session_status & VSW_PEER_SESSION) { 4827 if (ldcp->peer_session != tag.vio_sid) { 4828 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 4829 __func__, ldcp->ldc_id, tag.vio_sid); 4830 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4831 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 4832 return; 4833 } 4834 } 4835 4836 /* 4837 * Switch on vio_subtype envelope, then let lower routines 4838 * decide if it's an INFO, ACK or NACK packet.
4839 */ 4840 switch (env) { 4841 case VIO_VER_INFO: 4842 vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp); 4843 break; 4844 case VIO_DRING_REG: 4845 vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp); 4846 break; 4847 case VIO_DRING_UNREG: 4848 vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp); 4849 break; 4850 case VIO_ATTR_INFO: 4851 vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp); 4852 break; 4853 case VNET_MCAST_INFO: 4854 vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp); 4855 break; 4856 case VIO_RDX: 4857 vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp); 4858 break; 4859 default: 4860 DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env); 4861 } 4862 4863 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 4864 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4865 } 4866 4867 /* 4868 * Version negotiation. We can end up here either because our peer 4869 * has responded to a handshake message we have sent it, or our peer 4870 * has initiated a handshake with us. If it's the former then it can only 4871 * be an ACK or NACK; if it's the latter it can only be an INFO. 4872 * 4873 * If it's an ACK we move to the next stage of the handshake, namely 4874 * attribute exchange. If it's a NACK we see if we can specify another 4875 * version; if we can't we stop. 4876 * 4877 * If it is an INFO we reset all params associated with communication 4878 * in that direction over this channel (remember connection is 4879 * essentially 2 independent simplex channels). 4880 */ 4881 void 4882 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt) 4883 { 4884 vio_ver_msg_t *ver_pkt; 4885 vsw_t *vswp = ldcp->ldc_vswp; 4886 4887 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4888 4889 /* 4890 * We know this is a ctrl/version packet so 4891 * cast it into the correct structure. 4892 */ 4893 ver_pkt = (vio_ver_msg_t *)pkt; 4894 4895 switch (ver_pkt->tag.vio_subtype) { 4896 case VIO_SUBTYPE_INFO: 4897 D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n"); 4898 4899 /* 4900 * Record the session id, which we will use from now 4901 * until we see another VER_INFO msg. Even then the 4902 * session id in most cases will be unchanged, except 4903 * if channel was reset. 4904 */ 4905 if ((ldcp->session_status & VSW_PEER_SESSION) && 4906 (ldcp->peer_session != ver_pkt->tag.vio_sid)) { 4907 DERR(vswp, "%s: updating session id for chan %lld " 4908 "from %llx to %llx", __func__, ldcp->ldc_id, 4909 ldcp->peer_session, ver_pkt->tag.vio_sid); 4910 } 4911 4912 ldcp->peer_session = ver_pkt->tag.vio_sid; 4913 ldcp->session_status |= VSW_PEER_SESSION; 4914 4915 /* Legal message at this time? */ 4916 if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV)) 4917 return; 4918 4919 /* 4920 * First check the device class. Currently only expect 4921 * to be talking to a network device. In the future may 4922 * also talk to another switch. 4923 */ 4924 if (ver_pkt->dev_class != VDEV_NETWORK) { 4925 DERR(vswp, "%s: illegal device class %d", __func__, 4926 ver_pkt->dev_class); 4927 4928 ver_pkt->tag.vio_sid = ldcp->local_session; 4929 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4930 4931 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 4932 4933 (void) vsw_send_msg(ldcp, (void *)ver_pkt, 4934 sizeof (vio_ver_msg_t), B_TRUE); 4935 4936 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 4937 vsw_next_milestone(ldcp); 4938 return; 4939 } else { 4940 ldcp->dev_class = ver_pkt->dev_class; 4941 } 4942 4943 /* 4944 * Now check the version.
4945 */ 4946 if (vsw_supported_version(ver_pkt) == 0) { 4947 /* 4948 * We support this major version and possibly an 4949 * adjusted minor version. 4950 */ 4951 4952 D2(vswp, "%s: accepted ver %d:%d", __func__, 4953 ver_pkt->ver_major, ver_pkt->ver_minor); 4954 4955 /* Store accepted values */ 4956 ldcp->lane_in.ver_major = ver_pkt->ver_major; 4957 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 4958 4959 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4960 4961 ldcp->lane_in.lstate |= VSW_VER_ACK_SENT; 4962 } else { 4963 /* 4964 * NACK back with the next lower major/minor 4965 * pairing we support (if we don't support any more 4966 * versions then they will have been set to zero). 4967 */ 4968 4969 D2(vswp, "%s: replying with ver %d:%d", __func__, 4970 ver_pkt->ver_major, ver_pkt->ver_minor); 4971 4972 /* Store updated values */ 4973 ldcp->lane_in.ver_major = ver_pkt->ver_major; 4974 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 4975 4976 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4977 4978 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 4979 } 4980 4981 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 4982 ver_pkt->tag.vio_sid = ldcp->local_session; 4983 (void) vsw_send_msg(ldcp, (void *)ver_pkt, 4984 sizeof (vio_ver_msg_t), B_TRUE); 4985 4986 vsw_next_milestone(ldcp); 4987 break; 4988 4989 case VIO_SUBTYPE_ACK: 4990 D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__); 4991 4992 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV)) 4993 return; 4994 4995 /* Store the accepted values for our outbound lane */ 4996 ldcp->lane_out.ver_major = ver_pkt->ver_major; 4997 ldcp->lane_out.ver_minor = ver_pkt->ver_minor; 4998 4999 ldcp->lane_out.lstate |= VSW_VER_ACK_RECV; 5000 vsw_next_milestone(ldcp); 5001 5002 break; 5003 5004 case VIO_SUBTYPE_NACK: 5005 D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__); 5006 5007 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV)) 5008 return; 5009 5010 /* 5011 * If our peer sent us a NACK with the ver fields set to 5012 * zero then there is nothing more we can do. Otherwise see 5013 * if we support either the version suggested, or a lesser 5014 * one. 5015 */ 5016 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 5017 DERR(vswp, "%s: peer unable to negotiate any " 5018 "further.", __func__); 5019 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 5020 vsw_next_milestone(ldcp); 5021 return; 5022 } 5023 5024 /* 5025 * Check to see if we support this major version or 5026 * a lower one. If we don't then maj/min will be set 5027 * to zero.
5028 */ 5029 (void) vsw_supported_version(ver_pkt); 5030 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 5031 /* Nothing more we can do */ 5032 DERR(vswp, "%s: version negotiation failed.\n", 5033 __func__); 5034 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 5035 vsw_next_milestone(ldcp); 5036 } else { 5037 /* found a supported major version */ 5038 ldcp->lane_out.ver_major = ver_pkt->ver_major; 5039 ldcp->lane_out.ver_minor = ver_pkt->ver_minor; 5040 5041 D2(vswp, "%s: resending with updated values (%x, %x)", 5042 __func__, ver_pkt->ver_major, ver_pkt->ver_minor); 5043 5044 ldcp->lane_out.lstate |= VSW_VER_INFO_SENT; 5045 ver_pkt->tag.vio_sid = ldcp->local_session; 5046 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 5047 5048 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 5049 5050 (void) vsw_send_msg(ldcp, (void *)ver_pkt, 5051 sizeof (vio_ver_msg_t), B_TRUE); 5052 5053 vsw_next_milestone(ldcp); 5054 5055 } 5056 break; 5057 5058 default: 5059 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 5060 ver_pkt->tag.vio_subtype); 5061 } 5062 5063 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 5064 } 5065 5066 /* 5067 * Process an attribute packet. We can end up here either because our peer 5068 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our 5069 * peer has sent us an attribute INFO message. 5070 * 5071 * If it's an ACK we then move to the next stage of the handshake which 5072 * is to send our descriptor ring info to our peer. If it's a NACK then 5073 * there is nothing more we can (currently) do. 5074 * 5075 * If we get a valid/acceptable INFO packet (and we have already negotiated 5076 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we 5077 * NACK back and reset channel state to INACTIVE. 5078 * 5079 * FUTURE: in time we will probably negotiate over attributes, but for 5080 * the moment unacceptable attributes are regarded as a fatal error. 5081 * 5082 */ 5083 void 5084 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) 5085 { 5086 vnet_attr_msg_t *attr_pkt; 5087 vsw_t *vswp = ldcp->ldc_vswp; 5088 vsw_port_t *port = ldcp->ldc_port; 5089 uint64_t macaddr = 0; 5090 int i; 5091 5092 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 5093 5094 /* 5095 * We know this is a ctrl/attr packet so 5096 * cast it into the correct structure. 5097 */ 5098 attr_pkt = (vnet_attr_msg_t *)pkt; 5099 5100 switch (attr_pkt->tag.vio_subtype) { 5101 case VIO_SUBTYPE_INFO: 5102 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5103 5104 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) 5105 return; 5106 5107 /* 5108 * If the attributes are unacceptable then we NACK back. 5109 */ 5110 if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) { 5111 5112 DERR(vswp, "%s (chan %d): invalid attributes", 5113 __func__, ldcp->ldc_id); 5114 5115 vsw_free_lane_resources(ldcp, INBOUND); 5116 5117 attr_pkt->tag.vio_sid = ldcp->local_session; 5118 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5119 5120 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 5121 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; 5122 (void) vsw_send_msg(ldcp, (void *)attr_pkt, 5123 sizeof (vnet_attr_msg_t), B_TRUE); 5124 5125 vsw_next_milestone(ldcp); 5126 return; 5127 } 5128 5129 /* 5130 * Otherwise store attributes for this lane and update 5131 * lane state.
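 *
 * The peer's 64-bit addr field is unpacked below into p_macaddr most
 * significant byte first; e.g. an (illustrative) addr of
 * 0x00144fa8b2c1 becomes 00:14:4f:a8:b2:c1.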
5132 */ 5133 ldcp->lane_in.mtu = attr_pkt->mtu; 5134 ldcp->lane_in.addr = attr_pkt->addr; 5135 ldcp->lane_in.addr_type = attr_pkt->addr_type; 5136 ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode; 5137 ldcp->lane_in.ack_freq = attr_pkt->ack_freq; 5138 5139 macaddr = ldcp->lane_in.addr; 5140 for (i = ETHERADDRL - 1; i >= 0; i--) { 5141 port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF; 5142 macaddr >>= 8; 5143 } 5144 5145 /* create the fdb entry for this port/mac address */ 5146 (void) vsw_add_fdb(vswp, port); 5147 5148 /* set up device specific xmit routines */ 5149 mutex_enter(&port->tx_lock); 5150 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) { 5151 D2(vswp, "%s: mode = VIO_DRING_MODE", __func__); 5152 port->transmit = vsw_dringsend; 5153 } else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) { 5154 D2(vswp, "%s: mode = VIO_DESC_MODE", __func__); 5155 vsw_create_privring(ldcp); 5156 port->transmit = vsw_descrsend; 5157 } 5158 mutex_exit(&port->tx_lock); 5159 5160 attr_pkt->tag.vio_sid = ldcp->local_session; 5161 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5162 5163 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 5164 5165 ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT; 5166 5167 (void) vsw_send_msg(ldcp, (void *)attr_pkt, 5168 sizeof (vnet_attr_msg_t), B_TRUE); 5169 5170 vsw_next_milestone(ldcp); 5171 break; 5172 5173 case VIO_SUBTYPE_ACK: 5174 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5175 5176 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) 5177 return; 5178 5179 ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV; 5180 vsw_next_milestone(ldcp); 5181 break; 5182 5183 case VIO_SUBTYPE_NACK: 5184 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5185 5186 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV)) 5187 return; 5188 5189 ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV; 5190 vsw_next_milestone(ldcp); 5191 break; 5192 5193 default: 5194 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 5195 attr_pkt->tag.vio_subtype); 5196 } 5197 5198 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5199 } 5200 5201 /* 5202 * Process a dring info packet. We can end up here either because our peer 5203 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our 5204 * peer has sent us a dring INFO message. 5205 * 5206 * If we get a valid/acceptable INFO packet (and we have already negotiated 5207 * a version) we ACK back and update the lane state, otherwise we NACK back. 5208 * 5209 * FUTURE: nothing to stop client from sending us info on multiple drings, 5210 * but for the moment we will just use the first one we are given. 5211 * 5212 */ 5213 void 5214 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt) 5215 { 5216 vio_dring_reg_msg_t *dring_pkt; 5217 vsw_t *vswp = ldcp->ldc_vswp; 5218 ldc_mem_info_t minfo; 5219 dring_info_t *dp, *dbp; 5220 int dring_found = 0; 5221 5222 /* 5223 * We know this is a ctrl/dring packet so 5224 * cast it into the correct structure. 5225 */ 5226 dring_pkt = (vio_dring_reg_msg_t *)pkt; 5227 5228 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 5229 5230 switch (dring_pkt->tag.vio_subtype) { 5231 case VIO_SUBTYPE_INFO: 5232 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5233 5234 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 5235 return; 5236 5237 /* 5238 * If the dring params are unacceptable then we NACK back.
5239 */ 5240 if (vsw_check_dring_info(dring_pkt)) { 5241 5242 DERR(vswp, "%s (%lld): invalid dring info", 5243 __func__, ldcp->ldc_id); 5244 5245 vsw_free_lane_resources(ldcp, INBOUND); 5246 5247 dring_pkt->tag.vio_sid = ldcp->local_session; 5248 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5249 5250 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5251 5252 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5253 5254 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5255 sizeof (vio_dring_reg_msg_t), B_TRUE); 5256 5257 vsw_next_milestone(ldcp); 5258 return; 5259 } 5260 5261 /* 5262 * Otherwise, attempt to map in the dring using the 5263 * cookie. If that succeeds we send back a unique dring 5264 * identifier that the sending side will use in future 5265 * to refer to this descriptor ring. 5266 */ 5267 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 5268 5269 dp->num_descriptors = dring_pkt->num_descriptors; 5270 dp->descriptor_size = dring_pkt->descriptor_size; 5271 dp->options = dring_pkt->options; 5272 dp->ncookies = dring_pkt->ncookies; 5273 5274 /* 5275 * Note: we should only get one cookie. Enforced in 5276 * the ldc layer. 5277 */ 5278 bcopy(&dring_pkt->cookie[0], &dp->cookie[0], 5279 sizeof (ldc_mem_cookie_t)); 5280 5281 D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__, 5282 dp->num_descriptors, dp->descriptor_size); 5283 D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__, 5284 dp->options, dp->ncookies); 5285 5286 if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0], 5287 dp->ncookies, dp->num_descriptors, dp->descriptor_size, 5288 LDC_SHADOW_MAP, &(dp->handle))) != 0) { 5289 5290 DERR(vswp, "%s: dring_map failed\n", __func__); 5291 5292 kmem_free(dp, sizeof (dring_info_t)); 5293 vsw_free_lane_resources(ldcp, INBOUND); 5294 5295 dring_pkt->tag.vio_sid = ldcp->local_session; 5296 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5297 5298 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5299 5300 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5301 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5302 sizeof (vio_dring_reg_msg_t), B_TRUE); 5303 5304 vsw_next_milestone(ldcp); 5305 return; 5306 } 5307 5308 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 5309 5310 DERR(vswp, "%s: dring_info failed\n", __func__); 5311 5312 kmem_free(dp, sizeof (dring_info_t)); 5313 vsw_free_lane_resources(ldcp, INBOUND); 5314 5315 dring_pkt->tag.vio_sid = ldcp->local_session; 5316 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 5317 5318 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 5319 5320 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 5321 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5322 sizeof (vio_dring_reg_msg_t), B_TRUE); 5323 5324 vsw_next_milestone(ldcp); 5325 return; 5326 } else { 5327 /* store the address of the pub part of ring */ 5328 dp->pub_addr = minfo.vaddr; 5329 } 5330 5331 /* no private section as we are importing */ 5332 dp->priv_addr = NULL; 5333 5334 /* 5335 * Using a simple monotonically increasing integer for 5336 * the ident at the moment. 5337 */ 5338 dp->ident = ldcp->next_ident; 5339 ldcp->next_ident++; 5340 5341 dp->end_idx = 0; 5342 dp->next = NULL; 5343 5344 /* 5345 * Link it onto the end of the list of drings 5346 * for this lane.
5347 */ 5348 if (ldcp->lane_in.dringp == NULL) { 5349 D2(vswp, "%s: adding first INBOUND dring", __func__); 5350 ldcp->lane_in.dringp = dp; 5351 } else { 5352 dbp = ldcp->lane_in.dringp; 5353 5354 while (dbp->next != NULL) 5355 dbp = dbp->next; 5356 5357 dbp->next = dp; 5358 } 5359 5360 /* acknowledge it */ 5361 dring_pkt->tag.vio_sid = ldcp->local_session; 5362 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5363 dring_pkt->dring_ident = dp->ident; 5364 5365 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 5366 sizeof (vio_dring_reg_msg_t), B_TRUE); 5367 5368 ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT; 5369 vsw_next_milestone(ldcp); 5370 break; 5371 5372 case VIO_SUBTYPE_ACK: 5373 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5374 5375 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) 5376 return; 5377 5378 /* 5379 * Peer is acknowledging our dring info and will have 5380 * sent us a dring identifier which we will use to 5381 * refer to this ring w.r.t. our peer. 5382 */ 5383 dp = ldcp->lane_out.dringp; 5384 if (dp != NULL) { 5385 /* 5386 * Find the ring this ident should be associated 5387 * with. 5388 */ 5389 while (dp != NULL) { 5390 if (vsw_dring_match(dp, dring_pkt)) { 5391 dring_found = 1; 5392 break; 5393 } 5394 dp = dp->next; 5395 } 5396 5397 5398 5399 5400 if (dring_found == 0) { 5401 DERR(NULL, "%s: unrecognised ring cookie", 5402 __func__); 5403 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5404 return; 5405 } 5406 5407 } else { 5408 DERR(vswp, "%s: DRING ACK received but no drings " 5409 "allocated", __func__); 5410 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5411 return; 5412 } 5413 5414 /* store ident */ 5415 dp->ident = dring_pkt->dring_ident; 5416 ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV; 5417 vsw_next_milestone(ldcp); 5418 break; 5419 5420 case VIO_SUBTYPE_NACK: 5421 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5422 5423 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV)) 5424 return; 5425 5426 ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV; 5427 vsw_next_milestone(ldcp); 5428 break; 5429 5430 default: 5431 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5432 dring_pkt->tag.vio_subtype); 5433 } 5434 5435 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5436 } 5437 5438 /* 5439 * Process a request from peer to unregister a dring. 5440 * 5441 * For the moment we just restart the handshake if our 5442 * peer endpoint attempts to unregister a dring. 5443 */ 5444 void 5445 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt) 5446 { 5447 vsw_t *vswp = ldcp->ldc_vswp; 5448 vio_dring_unreg_msg_t *dring_pkt; 5449 5450 /* 5451 * We know this is a ctrl/dring packet so 5452 * cast it into the correct structure.
5453 */ 5454 dring_pkt = (vio_dring_unreg_msg_t *)pkt; 5455 5456 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5457 5458 switch (dring_pkt->tag.vio_subtype) { 5459 case VIO_SUBTYPE_INFO: 5460 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5461 5462 DWARN(vswp, "%s: restarting handshake..", __func__); 5463 break; 5464 5465 case VIO_SUBTYPE_ACK: 5466 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5467 5468 DWARN(vswp, "%s: restarting handshake..", __func__); 5469 break; 5470 5471 case VIO_SUBTYPE_NACK: 5472 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5473 5474 DWARN(vswp, "%s: restarting handshake..", __func__); 5475 break; 5476 5477 default: 5478 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5479 dring_pkt->tag.vio_subtype); 5480 } 5481 5482 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5483 5484 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5485 } 5486 5487 #define SND_MCST_NACK(ldcp, pkt) \ 5488 do { pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5489 pkt->tag.vio_sid = ldcp->local_session; \ 5490 (void) vsw_send_msg(ldcp, (void *)pkt, \ 5491 sizeof (vnet_mcast_msg_t), B_TRUE); } while (0) 5492 5493 /* 5494 * Process a multicast request from a vnet. 5495 * 5496 * Vnets specify a multicast address that they are interested in. This 5497 * address is used as a key into the hash table which forms the multicast 5498 * forwarding database (mFDB). 5499 * 5500 * The table keys are the multicast addresses, while the table entries 5501 * are pointers to lists of ports which wish to receive packets for the 5502 * specified multicast address. 5503 * 5504 * When a multicast packet is being switched we use the address as a key 5505 * into the hash table, and then walk the appropriate port list forwarding 5506 * the pkt to each port in turn. 5507 * 5508 * If a vnet is no longer interested in a particular multicast grouping 5509 * we simply find the correct location in the hash table and then delete 5510 * the relevant port from the port list. 5511 * 5512 * To deal with the case whereby a port is being deleted without first 5513 * removing itself from the lists in the hash table, we maintain a list 5514 * of multicast addresses the port has registered an interest in, within 5515 * the port structure itself. We then simply walk that list of addresses 5516 * using them as keys into the hash table and remove the port from the 5517 * appropriate lists. 5518 */ 5519 static void 5520 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) 5521 { 5522 vnet_mcast_msg_t *mcst_pkt; 5523 vsw_port_t *port = ldcp->ldc_port; 5524 vsw_t *vswp = ldcp->ldc_vswp; 5525 int i; 5526 5527 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5528 5529 /* 5530 * We know this is a ctrl/mcast packet so 5531 * cast it into the correct structure. 5532 */ 5533 mcst_pkt = (vnet_mcast_msg_t *)pkt; 5534 5535 switch (mcst_pkt->tag.vio_subtype) { 5536 case VIO_SUBTYPE_INFO: 5537 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5538 5539 /* 5540 * Check if in correct state to receive a multicast 5541 * message (i.e. handshake complete). If not reset 5542 * the handshake. 5543 */ 5544 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) 5545 return; 5546 5547 /* 5548 * Before attempting to add or remove addresses, check 5549 * that they are valid multicast addresses. 5550 * If not, then NACK back.
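 *
 * The validity test below relies on the low-order bit of the first
 * octet of an Ethernet address being the group (multicast) bit:
 * e.g. 01:00:5e:00:00:01 is accepted, while a unicast address such
 * as 00:14:4f:00:00:01 is rejected and NACKed back.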
5551 */ 5552 for (i = 0; i < mcst_pkt->count; i++) { 5553 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { 5554 DERR(vswp, "%s: invalid multicast address", 5555 __func__); 5556 SND_MCST_NACK(ldcp, mcst_pkt); 5557 return; 5558 } 5559 } 5560 5561 /* 5562 * Now add/remove the addresses. If this fails we 5563 * NACK back. 5564 */ 5565 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { 5566 SND_MCST_NACK(ldcp, mcst_pkt); 5567 return; 5568 } 5569 5570 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5571 mcst_pkt->tag.vio_sid = ldcp->local_session; 5572 5573 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); 5574 5575 (void) vsw_send_msg(ldcp, (void *)mcst_pkt, 5576 sizeof (vnet_mcast_msg_t), B_TRUE); 5577 break; 5578 5579 case VIO_SUBTYPE_ACK: 5580 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5581 5582 /* 5583 * We shouldn't ever get a multicast ACK message as 5584 * at the moment we never request multicast addresses 5585 * to be set on some other device. This may change in 5586 * the future if we have cascading switches. 5587 */ 5588 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) 5589 return; 5590 5591 /* Do nothing */ 5592 break; 5593 5594 case VIO_SUBTYPE_NACK: 5595 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5596 5597 /* 5598 * We shouldn't get a multicast NACK packet for the 5599 * same reasons as we shouldn't get an ACK packet. 5600 */ 5601 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) 5602 return; 5603 5604 /* Do nothing */ 5605 break; 5606 5607 default: 5608 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 5609 mcst_pkt->tag.vio_subtype); 5610 } 5611 5612 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5613 } 5614 5615 static void 5616 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) 5617 { 5618 vio_rdx_msg_t *rdx_pkt; 5619 vsw_t *vswp = ldcp->ldc_vswp; 5620 5621 /* 5622 * We know this is a ctrl/rdx packet so 5623 * cast it into the correct structure. 5624 */ 5625 rdx_pkt = (vio_rdx_msg_t *)pkt; 5626 5627 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 5628 5629 switch (rdx_pkt->tag.vio_subtype) { 5630 case VIO_SUBTYPE_INFO: 5631 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5632 5633 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV)) 5634 return; 5635 5636 rdx_pkt->tag.vio_sid = ldcp->local_session; 5637 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5638 5639 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); 5640 5641 ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT; 5642 5643 (void) vsw_send_msg(ldcp, (void *)rdx_pkt, 5644 sizeof (vio_rdx_msg_t), B_TRUE); 5645 5646 vsw_next_milestone(ldcp); 5647 break; 5648 5649 case VIO_SUBTYPE_ACK: 5650 /* 5651 * Should be handled in-band by callback handler.
5652 */ 5653 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); 5654 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5655 break; 5656 5657 case VIO_SUBTYPE_NACK: 5658 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5659 5660 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV)) 5661 return; 5662 5663 ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV; 5664 vsw_next_milestone(ldcp); 5665 break; 5666 5667 default: 5668 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 5669 rdx_pkt->tag.vio_subtype); 5670 } 5671 5672 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5673 } 5674 5675 static void 5676 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag) 5677 { 5678 uint16_t env = tag.vio_subtype_env; 5679 vsw_t *vswp = ldcp->ldc_vswp; 5680 5681 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5682 5683 /* session id check */ 5684 if (ldcp->session_status & VSW_PEER_SESSION) { 5685 if (ldcp->peer_session != tag.vio_sid) { 5686 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 5687 __func__, ldcp->ldc_id, tag.vio_sid); 5688 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5689 return; 5690 } 5691 } 5692 5693 /* 5694 * It is an error for us to be getting data packets 5695 * before the handshake has completed. 5696 */ 5697 if (ldcp->hphase != VSW_MILESTONE4) { 5698 DERR(vswp, "%s: got data packet before handshake complete " 5699 "hphase %d (%x: %x)", __func__, ldcp->hphase, 5700 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 5701 DUMP_FLAGS(ldcp->lane_in.lstate); 5702 DUMP_FLAGS(ldcp->lane_out.lstate); 5703 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 5704 return; 5705 } 5706 5707 /* 5708 * Switch on vio_subtype envelope, then let lower routines 5709 * decide if it's an INFO, ACK or NACK packet. 5710 */ 5711 if (env == VIO_DRING_DATA) { 5712 vsw_process_data_dring_pkt(ldcp, dpkt); 5713 } else if (env == VIO_PKT_DATA) { 5714 vsw_process_data_raw_pkt(ldcp, dpkt); 5715 } else if (env == VIO_DESC_DATA) { 5716 vsw_process_data_ibnd_pkt(ldcp, dpkt); 5717 } else { 5718 DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env); 5719 } 5720 5721 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 5722 } 5723 5724 #define SND_DRING_NACK(ldcp, pkt) \ 5725 do { pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5726 pkt->tag.vio_sid = ldcp->local_session; \ 5727 (void) vsw_send_msg(ldcp, (void *)pkt, \ 5728 sizeof (vio_dring_msg_t), B_TRUE); } while (0) 5729 5730 static void 5731 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) 5732 { 5733 vio_dring_msg_t *dring_pkt; 5734 vnet_public_desc_t *pub_addr = NULL; 5735 vsw_private_desc_t *priv_addr = NULL; 5736 dring_info_t *dp = NULL; 5737 vsw_t *vswp = ldcp->ldc_vswp; 5738 mblk_t *mp = NULL; 5739 mblk_t *bp = NULL; 5740 mblk_t *bpt = NULL; 5741 size_t nbytes = 0; 5742 size_t off = 0; 5743 uint64_t ncookies = 0; 5744 uint64_t chain = 0; 5745 uint64_t j, len; 5746 uint32_t pos, start, datalen; 5747 uint32_t range_start, range_end; 5748 int32_t end, num, cnt = 0; 5749 int i, rv, msg_rv = 0; 5750 boolean_t ack_needed = B_FALSE; 5751 boolean_t prev_desc_ack = B_FALSE; 5752 int read_attempts = 0; 5753 5754 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5755 5756 /* 5757 * We know this is a data/dring packet so 5758 * cast it into the correct structure. 5759 */ 5760 dring_pkt = (vio_dring_msg_t *)dpkt; 5761 5762 /* 5763 * Switch on the vio_subtype. If it's an INFO then we need to 5764 * process the data. If it's an ACK we need to make sure 5765 * it makes sense (i.e. did we send an earlier data/info), 5766 * and if it's a NACK then we may attempt a retry.
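 *
 * For the INFO case the descriptor count is computed modulo the ring
 * size; e.g. with a 64 entry ring, start_idx 62 and end_idx 1 give
 * (64 - 62 + 1) + 1 = 4 descriptors to process (62, 63, 0, 1), while
 * an end_idx of -1 means no bound, i.e. keep going until a descriptor
 * is found that is not READY.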
5767 */ 5768 switch (dring_pkt->tag.vio_subtype) { 5769 case VIO_SUBTYPE_INFO: 5770 D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id); 5771 5772 READ_ENTER(&ldcp->lane_in.dlistrw); 5773 if ((dp = vsw_ident2dring(&ldcp->lane_in, 5774 dring_pkt->dring_ident)) == NULL) { 5775 RW_EXIT(&ldcp->lane_in.dlistrw); 5776 5777 DERR(vswp, "%s(%lld): unable to find dring from " 5778 "ident 0x%llx", __func__, ldcp->ldc_id, 5779 dring_pkt->dring_ident); 5780 5781 SND_DRING_NACK(ldcp, dring_pkt); 5782 return; 5783 } 5784 5785 start = pos = dring_pkt->start_idx; 5786 end = dring_pkt->end_idx; 5787 len = dp->num_descriptors; 5788 5789 range_start = range_end = pos; 5790 5791 D2(vswp, "%s(%lld): start index %ld : end %ld\n", 5792 __func__, ldcp->ldc_id, start, end); 5793 5794 if (end == -1) { 5795 num = -1; 5796 } else if (end >= 0) { 5797 num = end >= pos ? end - pos + 1 : (len - pos + 1) + end; 5798 5799 /* basic sanity check */ 5800 if (end > len) { 5801 RW_EXIT(&ldcp->lane_in.dlistrw); 5802 DERR(vswp, "%s(%lld): endpoint %lld outside " 5803 "ring length %lld", __func__, 5804 ldcp->ldc_id, end, len); 5805 5806 SND_DRING_NACK(ldcp, dring_pkt); 5807 return; 5808 } 5809 } else { 5810 RW_EXIT(&ldcp->lane_in.dlistrw); 5811 DERR(vswp, "%s(%lld): invalid endpoint %lld", 5812 __func__, ldcp->ldc_id, end); 5813 SND_DRING_NACK(ldcp, dring_pkt); 5814 return; 5815 } 5816 5817 while (cnt != num) { 5818 vsw_recheck_desc: 5819 if ((rv = ldc_mem_dring_acquire(dp->handle, 5820 pos, pos)) != 0) { 5821 RW_EXIT(&ldcp->lane_in.dlistrw); 5822 DERR(vswp, "%s(%lld): unable to acquire " 5823 "descriptor at pos %d: err %d", 5824 __func__, ldcp->ldc_id, pos, rv); 5825 SND_DRING_NACK(ldcp, dring_pkt); 5826 return; 5827 } 5828 5829 pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos; 5830 5831 /* 5832 * When given a bounded range of descriptors 5833 * to process, it's an error to hit a descriptor 5834 * which is not ready. In the non-bounded case 5835 * (end_idx == -1) this simply indicates we have 5836 * reached the end of the current active range. 5837 */ 5838 if (pub_addr->hdr.dstate != VIO_DESC_READY) { 5839 /* unbound - no error */ 5840 if (end == -1) { 5841 if (read_attempts == vsw_read_attempts) 5842 break; 5843 5844 delay(drv_usectohz(vsw_desc_delay)); 5845 read_attempts++; 5846 goto vsw_recheck_desc; 5847 } 5848 5849 /* bounded - error - so NACK back */ 5850 RW_EXIT(&ldcp->lane_in.dlistrw); 5851 DERR(vswp, "%s(%lld): descriptor not READY " 5852 "(%d)", __func__, ldcp->ldc_id, 5853 pub_addr->hdr.dstate); 5854 SND_DRING_NACK(ldcp, dring_pkt); 5855 return; 5856 } 5857 5858 DTRACE_PROBE1(read_attempts, int, read_attempts); 5859 5860 range_end = pos; 5861 5862 /* 5863 * If we ACK'd the previous descriptor then now 5864 * record the new range start position for later 5865 * ACK's. 5866 */ 5867 if (prev_desc_ack) { 5868 range_start = pos; 5869 5870 D2(vswp, "%s(%lld): updating range start to be " 5871 "%d", __func__, ldcp->ldc_id, range_start); 5872 5873 prev_desc_ack = B_FALSE; 5874 } 5875 5876 /* 5877 * Data is padded to align on 8 byte boundary, 5878 * datalen is actual data length, i.e. minus that 5879 * padding. 5880 */ 5881 datalen = pub_addr->nbytes; 5882 5883 /* 5884 * Does peer wish us to ACK when we have finished 5885 * with this descriptor?
5886 */ 5887 if (pub_addr->hdr.ack) 5888 ack_needed = B_TRUE; 5889 5890 D2(vswp, "%s(%lld): processing desc %lld at addr" 5891 " 0x%llx : dstate 0x%lx : datalen 0x%lx", 5892 __func__, ldcp->ldc_id, pos, pub_addr, 5893 pub_addr->hdr.dstate, datalen); 5894 5895 /* 5896 * Mark that we are starting to process descriptor. 5897 */ 5898 pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; 5899 5900 mp = vio_allocb(ldcp->rxh); 5901 if (mp == NULL) { 5902 /* 5903 * No free receive buffers available, so 5904 * fallback onto allocb(9F). Make sure that 5905 * we get a data buffer which is a multiple 5906 * of 8 as this is required by ldc_mem_copy. 5907 */ 5908 DTRACE_PROBE(allocb); 5909 if ((mp = allocb(datalen + VNET_IPALIGN + 8, 5910 BPRI_MED)) == NULL) { 5911 DERR(vswp, "%s(%ld): allocb failed", 5912 __func__, ldcp->ldc_id); 5913 pub_addr->hdr.dstate = VIO_DESC_DONE; 5914 (void) ldc_mem_dring_release(dp->handle, 5915 pos, pos); 5916 break; 5917 } 5918 } 5919 5920 /* 5921 * Ensure that we ask ldc for an aligned 5922 * number of bytes. 5923 */ 5924 nbytes = datalen + VNET_IPALIGN; 5925 if (nbytes & 0x7) { 5926 off = 8 - (nbytes & 0x7); 5927 nbytes += off; 5928 } 5929 5930 ncookies = pub_addr->ncookies; 5931 rv = ldc_mem_copy(ldcp->ldc_handle, 5932 (caddr_t)mp->b_rptr, 0, &nbytes, 5933 pub_addr->memcookie, ncookies, LDC_COPY_IN); 5934 5935 if (rv != 0) { 5936 DERR(vswp, "%s(%d): unable to copy in data " 5937 "from %d cookies in desc %d (rv %d)", 5938 __func__, ldcp->ldc_id, ncookies, pos, rv); 5939 freemsg(mp); 5940 5941 pub_addr->hdr.dstate = VIO_DESC_DONE; 5942 (void) ldc_mem_dring_release(dp->handle, 5943 pos, pos); 5944 break; 5945 } else { 5946 D2(vswp, "%s(%d): copied in %ld bytes" 5947 " using %d cookies", __func__, 5948 ldcp->ldc_id, nbytes, ncookies); 5949 } 5950 5951 /* adjust the read pointer to skip over the padding */ 5952 mp->b_rptr += VNET_IPALIGN; 5953 5954 /* point to the actual end of data */ 5955 mp->b_wptr = mp->b_rptr + datalen; 5956 5957 /* build a chain of received packets */ 5958 if (bp == NULL) { 5959 /* first pkt */ 5960 bp = mp; 5961 bp->b_next = bp->b_prev = NULL; 5962 bpt = bp; 5963 chain = 1; 5964 } else { 5965 mp->b_next = NULL; 5966 mp->b_prev = bpt; 5967 bpt->b_next = mp; 5968 bpt = mp; 5969 chain++; 5970 } 5971 5972 /* mark we are finished with this descriptor */ 5973 pub_addr->hdr.dstate = VIO_DESC_DONE; 5974 5975 (void) ldc_mem_dring_release(dp->handle, pos, pos); 5976 5977 /* 5978 * Send an ACK back to peer if requested. 5979 */ 5980 if (ack_needed) { 5981 ack_needed = B_FALSE; 5982 5983 dring_pkt->start_idx = range_start; 5984 dring_pkt->end_idx = range_end; 5985 5986 DERR(vswp, "%s(%lld): processed %d %d, ACK" 5987 " requested", __func__, ldcp->ldc_id, 5988 dring_pkt->start_idx, dring_pkt->end_idx); 5989 5990 dring_pkt->dring_process_state = VIO_DP_ACTIVE; 5991 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 5992 dring_pkt->tag.vio_sid = ldcp->local_session; 5993 5994 msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, 5995 sizeof (vio_dring_msg_t), B_FALSE); 5996 5997 /* 5998 * Check if ACK was successfully sent. If not 5999 * we break and deal with that below. 6000 */ 6001 if (msg_rv != 0) 6002 break; 6003 6004 prev_desc_ack = B_TRUE; 6005 range_start = pos; 6006 } 6007 6008 /* next descriptor */ 6009 pos = (pos + 1) % len; 6010 cnt++; 6011 6012 /* 6013 * Break out of loop here and stop processing to 6014 * allow some other network device (or disk) to 6015 * get access to the cpu.
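 *
 * Once we break out (or drain the ring) we ACK back with
 * dring_process_state set to VIO_DP_STOPPED (below), which tells our
 * peer it must prompt us with a fresh INFO message before we will
 * process any further descriptors.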
6016 */ 6017 if (chain > vsw_chain_len) { 6018 D3(vswp, "%s(%lld): switching chain of %d " 6019 "msgs", __func__, ldcp->ldc_id, chain); 6020 break; 6021 } 6022 } 6023 RW_EXIT(&ldcp->lane_in.dlistrw); 6024 6025 /* 6026 * If when we attempted to send the ACK we found that the 6027 * channel had been reset then now handle this. We deal with 6028 * it here as we cannot reset the channel while holding the 6029 * dlistrw lock, and we don't want to acquire/release it 6030 * continuously in the above loop, as a channel reset should 6031 * be a rare event. 6032 */ 6033 if (msg_rv == ECONNRESET) { 6034 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 6035 break; 6036 } 6037 6038 /* send the chain of packets to be switched */ 6039 if (bp != NULL) { 6040 D3(vswp, "%s(%lld): switching chain of %d msgs", 6041 __func__, ldcp->ldc_id, chain); 6042 vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT, 6043 ldcp->ldc_port, NULL); 6044 } 6045 6046 DTRACE_PROBE1(msg_cnt, int, cnt); 6047 6048 /* 6049 * We are now finished so ACK back with the state 6050 * set to STOPPING so our peer knows we are finished 6051 */ 6052 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 6053 dring_pkt->tag.vio_sid = ldcp->local_session; 6054 6055 dring_pkt->dring_process_state = VIO_DP_STOPPED; 6056 6057 DTRACE_PROBE(stop_process_sent); 6058 6059 /* 6060 * We have not processed any more descriptors beyond 6061 * the last one we ACK'd. 6062 */ 6063 if (prev_desc_ack) 6064 range_start = range_end; 6065 6066 dring_pkt->start_idx = range_start; 6067 dring_pkt->end_idx = range_end; 6068 6069 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 6070 __func__, ldcp->ldc_id, dring_pkt->start_idx, 6071 dring_pkt->end_idx); 6072 6073 (void) vsw_send_msg(ldcp, (void *)dring_pkt, 6074 sizeof (vio_dring_msg_t), B_TRUE); 6075 break; 6076 6077 case VIO_SUBTYPE_ACK: 6078 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 6079 /* 6080 * Verify that the relevant descriptors are all 6081 * marked as DONE 6082 */ 6083 READ_ENTER(&ldcp->lane_out.dlistrw); 6084 if ((dp = vsw_ident2dring(&ldcp->lane_out, 6085 dring_pkt->dring_ident)) == NULL) { 6086 RW_EXIT(&ldcp->lane_out.dlistrw); 6087 DERR(vswp, "%s: unknown ident in ACK", __func__); 6088 return; 6089 } 6090 6091 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 6092 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 6093 6094 start = end = 0; 6095 start = dring_pkt->start_idx; 6096 end = dring_pkt->end_idx; 6097 len = dp->num_descriptors; 6098 6099 j = num = 0; 6100 /* calculate # descriptors taking into a/c wrap around */ 6101 num = end >= start ? end - start + 1: (len - start + 1) + end; 6102 6103 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 6104 __func__, ldcp->ldc_id, start, end, num); 6105 6106 mutex_enter(&dp->dlock); 6107 dp->last_ack_recv = end; 6108 mutex_exit(&dp->dlock); 6109 6110 for (i = start; j < num; i = (i + 1) % len, j++) { 6111 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 6112 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6113 6114 /* 6115 * If the last descriptor in a range has the ACK 6116 * bit set then we will get two messages from our 6117 * peer relating to it. The normal ACK msg and then 6118 * a subsequent STOP msg. The first message will have 6119 * resulted in the descriptor being reclaimed and 6120 * its state set to FREE so when we encounter a non 6121 * DONE descriptor we need to check to see if its 6122 * because we have just reclaimed it. 
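 *
 * For example: if descriptor N was posted with its ACK bit set, the
 * peer first sends a normal ACK covering N (on which we reclaim N
 * and mark it FREE) and then a STOPPED message whose range also
 * covers N; finding N no longer DONE on that second pass is
 * expected, so only a non-DONE descriptor in a non-STOPPED ACK is
 * treated as an error.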
6123 */ 6124 mutex_enter(&priv_addr->dstate_lock); 6125 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 6126 /* clear all the fields */ 6127 bzero(priv_addr->datap, priv_addr->datalen); 6128 priv_addr->datalen = 0; 6129 6130 pub_addr->hdr.dstate = VIO_DESC_FREE; 6131 pub_addr->hdr.ack = 0; 6132 6133 priv_addr->dstate = VIO_DESC_FREE; 6134 mutex_exit(&priv_addr->dstate_lock); 6135 6136 D3(vswp, "clearing descp %d : pub state " 6137 "0x%llx : priv state 0x%llx", i, 6138 pub_addr->hdr.dstate, priv_addr->dstate); 6139 6140 } else { 6141 mutex_exit(&priv_addr->dstate_lock); 6142 6143 if (dring_pkt->dring_process_state != 6144 VIO_DP_STOPPED) { 6145 DERR(vswp, "%s: descriptor %lld at pos " 6146 "0x%llx not DONE (0x%lx)\n", 6147 __func__, i, pub_addr, 6148 pub_addr->hdr.dstate); 6149 RW_EXIT(&ldcp->lane_out.dlistrw); 6150 return; 6151 } 6152 } 6153 } 6154 6155 /* 6156 * If our peer is stopping processing descriptors then 6157 * we check to make sure it has processed all the descriptors 6158 * we have updated. If not then we send it a new message 6159 * to prompt it to restart. 6160 */ 6161 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 6162 DTRACE_PROBE(stop_process_recv); 6163 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 6164 __func__, ldcp->ldc_id, dring_pkt->start_idx, 6165 dring_pkt->end_idx); 6166 6167 /* 6168 * Check next descriptor in public section of ring. 6169 * If it's marked as READY then we need to prompt our 6170 * peer to start processing the ring again. 6171 */ 6172 i = (end + 1) % len; 6173 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 6174 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6175 6176 /* 6177 * Hold the restart lock across all of this to 6178 * make sure that it's not possible for us to 6179 * decide that a msg needs to be sent while the 6180 * sending code, having already checked, is 6181 * about to exit. 6182 */ 6183 mutex_enter(&dp->restart_lock); 6184 mutex_enter(&priv_addr->dstate_lock); 6185 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 6186 6187 mutex_exit(&priv_addr->dstate_lock); 6188 6189 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 6190 dring_pkt->tag.vio_sid = ldcp->local_session; 6191 6192 mutex_enter(&ldcp->lane_out.seq_lock); 6193 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 6194 mutex_exit(&ldcp->lane_out.seq_lock); 6195 6196 dring_pkt->start_idx = (end + 1) % len; 6197 dring_pkt->end_idx = -1; 6198 6199 D2(vswp, "%s(%lld) : sending restart msg:" 6200 " %d : %d", __func__, ldcp->ldc_id, 6201 dring_pkt->start_idx, dring_pkt->end_idx); 6202 6203 msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt, 6204 sizeof (vio_dring_msg_t), B_FALSE); 6205 6206 } else { 6207 mutex_exit(&priv_addr->dstate_lock); 6208 dp->restart_reqd = B_TRUE; 6209 } 6210 mutex_exit(&dp->restart_lock); 6211 } 6212 RW_EXIT(&ldcp->lane_out.dlistrw); 6213 6214 /* only do channel reset after dropping dlistrw lock */ 6215 if (msg_rv == ECONNRESET) 6216 vsw_process_conn_evt(ldcp, VSW_CONN_RESET); 6217 6218 break; 6219 6220 case VIO_SUBTYPE_NACK: 6221 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 6222 __func__, ldcp->ldc_id); 6223 /* 6224 * Something is badly wrong if we are getting NACK's 6225 * for our data pkts. So reset the channel.
6226 */ 6227 vsw_process_conn_evt(ldcp, VSW_CONN_RESTART); 6228 6229 break; 6230 6231 default: 6232 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 6233 ldcp->ldc_id, dring_pkt->tag.vio_subtype); 6234 } 6235 6236 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 6237 } 6238 6239 /* 6240 * VIO_PKT_DATA (a.k.a raw data mode ) 6241 * 6242 * Note - currently not supported. Do nothing. 6243 */ 6244 static void 6245 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt) 6246 { 6247 _NOTE(ARGUNUSED(dpkt)) 6248 6249 D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 6250 DERR(NULL, "%s (%lld): currently unsupported", __func__, ldcp->ldc_id); 6251 D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 6252 } 6253 6254 /* 6255 * Process an in-band descriptor message (most likely from 6256 * OBP). 6257 */ 6258 static void 6259 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) 6260 { 6261 vnet_ibnd_desc_t *ibnd_desc; 6262 dring_info_t *dp = NULL; 6263 vsw_private_desc_t *priv_addr = NULL; 6264 vsw_t *vswp = ldcp->ldc_vswp; 6265 mblk_t *mp = NULL; 6266 size_t nbytes = 0; 6267 size_t off = 0; 6268 uint64_t idx = 0; 6269 uint32_t num = 1, len, datalen = 0; 6270 uint64_t ncookies = 0; 6271 int i, rv; 6272 int j = 0; 6273 6274 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6275 6276 ibnd_desc = (vnet_ibnd_desc_t *)pkt; 6277 6278 switch (ibnd_desc->hdr.tag.vio_subtype) { 6279 case VIO_SUBTYPE_INFO: 6280 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 6281 6282 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 6283 return; 6284 6285 /* 6286 * Data is padded to align on a 8 byte boundary, 6287 * nbytes is actual data length, i.e. minus that 6288 * padding. 6289 */ 6290 datalen = ibnd_desc->nbytes; 6291 6292 D2(vswp, "%s(%lld): processing inband desc : " 6293 ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); 6294 6295 ncookies = ibnd_desc->ncookies; 6296 6297 /* 6298 * allocb(9F) returns an aligned data block. We 6299 * need to ensure that we ask ldc for an aligned 6300 * number of bytes also. 
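	 * For example, a 61 byte payload is rounded up below to 64
	 * bytes (off = 3), so that ldc_mem_copy() is always asked to
	 * transfer a multiple of 8 bytes.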
6301	 */
6302	nbytes = datalen;
6303	if (nbytes & 0x7) {
6304		off = 8 - (nbytes & 0x7);
6305		nbytes += off;
6306	}
6307
6308	mp = allocb(nbytes, BPRI_MED);	/* large enough for the aligned copy */
6309	if (mp == NULL) {
6310		DERR(vswp, "%s(%lld): allocb failed",
6311		    __func__, ldcp->ldc_id);
6312		return;
6313	}
6314
6315	rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
6316	    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
6317	    LDC_COPY_IN);
6318
6319	if (rv != 0) {
6320		DERR(vswp, "%s(%d): unable to copy in data from "
6321		    "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
6322		freemsg(mp);
6323		return;
6324	}
6325
6326	D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
6327	    __func__, ldcp->ldc_id, nbytes, ncookies);
6328
6329	/* point to the actual end of data */
6330	mp->b_wptr = mp->b_rptr + datalen;
6331
6332	/*
6333	 * We ACK back every in-band descriptor message we process
6334	 */
6335	ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
6336	ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
6337	(void) vsw_send_msg(ldcp, (void *)ibnd_desc,
6338	    sizeof (vnet_ibnd_desc_t), B_TRUE);
6339
6340	/* send the packet to be switched */
6341	vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
6342	    ldcp->ldc_port, NULL);
6343
6344	break;
6345
6346	case VIO_SUBTYPE_ACK:
6347	D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
6348
6349	/* Verify the ACK is valid */
6350	idx = ibnd_desc->hdr.desc_handle;
6351
6352	if (idx >= VSW_RING_NUM_EL) {
6353		cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
6354		    "(idx %ld)", vswp->instance, idx);
6355		return;
6356	}
6357
6358	if ((dp = ldcp->lane_out.dringp) == NULL) {
6359		DERR(vswp, "%s: no dring found", __func__);
6360		return;
6361	}
6362
6363	len = dp->num_descriptors;
6364	/*
6365	 * If the descriptor we are being ACK'ed for is not the
6366	 * one we expected, then pkts were lost somewhere, either
6367	 * when we tried to send a msg, or a previous ACK msg from
6368	 * our peer. In either case we now reclaim the descriptors
6369	 * in the range from the last ACK we received up to the
6370	 * current ACK.
6371	 */
6372	if (idx != dp->last_ack_recv) {
6373		DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
6374		    __func__, dp->last_ack_recv, idx);
6375		num = idx >= dp->last_ack_recv ?
6376		    idx - dp->last_ack_recv + 1:
6377		    (len - dp->last_ack_recv + 1) + idx;
6378	}
6379
6380	/*
6381	 * When we sent the in-band message to our peer we
6382	 * marked the copy in our private ring as READY. We now
6383	 * check that the descriptor we are being ACK'ed for is in
6384	 * fact READY, i.e. it is one we have shared with our peer.
6385	 *
6386	 * If it's not we flag an error, but still reset the descriptor
6387	 * back to FREE.
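	 * Resetting it regardless keeps the private ring consistent,
	 * so a single bad ACK cannot permanently leak a descriptor.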
6388 */ 6389 for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) { 6390 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 6391 mutex_enter(&priv_addr->dstate_lock); 6392 if (priv_addr->dstate != VIO_DESC_READY) { 6393 DERR(vswp, "%s: (%ld) desc at index %ld not " 6394 "READY (0x%lx)", __func__, 6395 ldcp->ldc_id, idx, priv_addr->dstate); 6396 DERR(vswp, "%s: bound %d: ncookies %ld : " 6397 "datalen %ld", __func__, 6398 priv_addr->bound, priv_addr->ncookies, 6399 priv_addr->datalen); 6400 } 6401 D2(vswp, "%s: (%lld) freeing descp at %lld", __func__, 6402 ldcp->ldc_id, idx); 6403 /* release resources associated with sent msg */ 6404 bzero(priv_addr->datap, priv_addr->datalen); 6405 priv_addr->datalen = 0; 6406 priv_addr->dstate = VIO_DESC_FREE; 6407 mutex_exit(&priv_addr->dstate_lock); 6408 } 6409 /* update to next expected value */ 6410 dp->last_ack_recv = (idx + 1) % dp->num_descriptors; 6411 6412 break; 6413 6414 case VIO_SUBTYPE_NACK: 6415 DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 6416 6417 /* 6418 * We should only get a NACK if our peer doesn't like 6419 * something about a message we have sent it. If this 6420 * happens we just release the resources associated with 6421 * the message. (We are relying on higher layers to decide 6422 * whether or not to resend. 6423 */ 6424 6425 /* limit check */ 6426 idx = ibnd_desc->hdr.desc_handle; 6427 6428 if (idx >= VSW_RING_NUM_EL) { 6429 DERR(vswp, "%s: corrupted NACK received (idx %lld)", 6430 __func__, idx); 6431 return; 6432 } 6433 6434 if ((dp = ldcp->lane_out.dringp) == NULL) { 6435 DERR(vswp, "%s: no dring found", __func__); 6436 return; 6437 } 6438 6439 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 6440 6441 /* move to correct location in ring */ 6442 priv_addr += idx; 6443 6444 /* release resources associated with sent msg */ 6445 mutex_enter(&priv_addr->dstate_lock); 6446 bzero(priv_addr->datap, priv_addr->datalen); 6447 priv_addr->datalen = 0; 6448 priv_addr->dstate = VIO_DESC_FREE; 6449 mutex_exit(&priv_addr->dstate_lock); 6450 6451 break; 6452 6453 default: 6454 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 6455 ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype); 6456 } 6457 6458 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 6459 } 6460 6461 static void 6462 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag) 6463 { 6464 _NOTE(ARGUNUSED(epkt)) 6465 6466 vsw_t *vswp = ldcp->ldc_vswp; 6467 uint16_t env = tag.vio_subtype_env; 6468 6469 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 6470 6471 /* 6472 * Error vio_subtypes have yet to be defined. So for 6473 * the moment we can't do anything. 6474 */ 6475 D2(vswp, "%s: (%x) vio_subtype env", __func__, env); 6476 6477 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 6478 } 6479 6480 /* 6481 * Switch the given ethernet frame when operating in layer 2 mode. 6482 * 6483 * vswp: pointer to the vsw instance 6484 * mp: pointer to chain of ethernet frame(s) to be switched 6485 * caller: identifies the source of this frame as: 6486 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 6487 * 2. VSW_PHYSDEV - the physical ethernet device 6488 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 6489 * arg: argument provided by the caller. 6490 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 6491 * 2. for PHYSDEV - NULL 6492 * 3. 
for LOCALDEV - pointer to this vsw_t (self)
6493 */
6494 void
6495 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
6496	vsw_port_t *arg, mac_resource_handle_t mrh)
6497 {
6498	struct ether_header *ehp;
6499	vsw_port_t *port = NULL;
6500	mblk_t *bp, *ret_m;
6501	mblk_t *nmp = NULL;
6502	vsw_port_list_t *plist = &vswp->plist;
6503
6504	D1(vswp, "%s: enter (caller %d)", __func__, caller);
6505
6506	/*
6507	 * PERF: rather than breaking up the chain here, scan it
6508	 * to find all mblks heading to same destination and then
6509	 * pass that sub-chain to the lower transmit functions.
6510	 */
6511
6512	/* process the chain of packets */
6513	bp = mp;
6514	while (bp) {
6515		mp = bp;
6516		bp = bp->b_next;
6517		mp->b_next = mp->b_prev = NULL;
6518		ehp = (struct ether_header *)mp->b_rptr;
6519
6520		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
6521		    __func__, MBLKSIZE(mp), MBLKL(mp));
6522
6523		READ_ENTER(&vswp->if_lockrw);
6524		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
6525			/*
6526			 * If destination is VSW_LOCALDEV (vsw as an eth
6527			 * interface) and if the device is up & running,
6528			 * send the packet up the stack on this host.
6529			 * If the virtual interface is down, drop the packet.
6530			 */
6531			if (caller != VSW_LOCALDEV) {
6532				if (vswp->if_state & VSW_IF_UP) {
6533					RW_EXIT(&vswp->if_lockrw);
6534					mac_rx(vswp->if_mh, mrh, mp);
6535				} else {
6536					RW_EXIT(&vswp->if_lockrw);
6537					/* Interface down, drop pkt */
6538					freemsg(mp);
6539				}
6540			} else {
6541				RW_EXIT(&vswp->if_lockrw);
6542				freemsg(mp);
6543			}
6544			continue;
6545		}
6546		RW_EXIT(&vswp->if_lockrw);
6547
6548		READ_ENTER(&plist->lockrw);
6549		port = vsw_lookup_fdb(vswp, ehp);
6550		if (port) {
6551			/*
6552			 * Mark the port as in-use.
6553			 */
6554			mutex_enter(&port->ref_lock);
6555			port->ref_cnt++;
6556			mutex_exit(&port->ref_lock);
6557			RW_EXIT(&plist->lockrw);
6558
6559			/*
6560			 * If plumbed and in promisc mode then copy msg
6561			 * and send up the stack.
6562			 */
6563			READ_ENTER(&vswp->if_lockrw);
6564			if (VSW_U_P(vswp->if_state)) {
6565				RW_EXIT(&vswp->if_lockrw);
6566				nmp = copymsg(mp);
6567				if (nmp)
6568					mac_rx(vswp->if_mh, mrh, nmp);
6569			} else {
6570				RW_EXIT(&vswp->if_lockrw);
6571			}
6572
6573			/*
6574			 * If the destination is in FDB, the packet
6575			 * should be forwarded to the corresponding
6576			 * vsw_port (connected to a vnet device -
6577			 * VSW_VNETPORT)
6578			 */
6579			(void) vsw_portsend(port, mp);
6580
6581			/*
6582			 * Decrement use count in port and check if
6583			 * should wake delete thread.
6584			 */
6585			mutex_enter(&port->ref_lock);
6586			port->ref_cnt--;
6587			if (port->ref_cnt == 0)
6588				cv_signal(&port->ref_cv);
6589			mutex_exit(&port->ref_lock);
6590		} else {
6591			RW_EXIT(&plist->lockrw);
6592			/*
6593			 * Destination not in FDB.
6594			 *
6595			 * If the destination is broadcast or
6596			 * multicast forward the packet to all
6597			 * (VNETPORTs, PHYSDEV, LOCALDEV),
6598			 * except the caller.
6599			 */
6600			if (IS_BROADCAST(ehp)) {
6601				D3(vswp, "%s: BROADCAST pkt", __func__);
6602				(void) vsw_forward_all(vswp, mp, caller, arg);
6603			} else if (IS_MULTICAST(ehp)) {
6604				D3(vswp, "%s: MULTICAST pkt", __func__);
6605				(void) vsw_forward_grp(vswp, mp, caller, arg);
6606			} else {
6607				/*
6608				 * If the destination is unicast, and came
6609				 * from either a logical network device or
6610				 * the switch itself when it is plumbed, then
6611				 * send it out on the physical device and also
6612				 * up the stack if the logical interface is
6613				 * in promiscuous mode.
6614 * 6615 * NOTE: The assumption here is that if we 6616 * cannot find the destination in our fdb, its 6617 * a unicast address, and came from either a 6618 * vnet or down the stack (when plumbed) it 6619 * must be destinded for an ethernet device 6620 * outside our ldoms. 6621 */ 6622 if (caller == VSW_VNETPORT) { 6623 READ_ENTER(&vswp->if_lockrw); 6624 if (VSW_U_P(vswp->if_state)) { 6625 RW_EXIT(&vswp->if_lockrw); 6626 nmp = copymsg(mp); 6627 if (nmp) 6628 mac_rx(vswp->if_mh, 6629 mrh, nmp); 6630 } else { 6631 RW_EXIT(&vswp->if_lockrw); 6632 } 6633 if ((ret_m = vsw_tx_msg(vswp, mp)) 6634 != NULL) { 6635 DERR(vswp, "%s: drop mblks to " 6636 "phys dev", __func__); 6637 freemsg(ret_m); 6638 } 6639 6640 } else if (caller == VSW_PHYSDEV) { 6641 /* 6642 * Pkt seen because card in promisc 6643 * mode. Send up stack if plumbed in 6644 * promisc mode, else drop it. 6645 */ 6646 READ_ENTER(&vswp->if_lockrw); 6647 if (VSW_U_P(vswp->if_state)) { 6648 RW_EXIT(&vswp->if_lockrw); 6649 mac_rx(vswp->if_mh, mrh, mp); 6650 } else { 6651 RW_EXIT(&vswp->if_lockrw); 6652 freemsg(mp); 6653 } 6654 6655 } else if (caller == VSW_LOCALDEV) { 6656 /* 6657 * Pkt came down the stack, send out 6658 * over physical device. 6659 */ 6660 if ((ret_m = vsw_tx_msg(vswp, mp)) 6661 != NULL) { 6662 DERR(vswp, "%s: drop mblks to " 6663 "phys dev", __func__); 6664 freemsg(ret_m); 6665 } 6666 } 6667 } 6668 } 6669 } 6670 D1(vswp, "%s: exit\n", __func__); 6671 } 6672 6673 /* 6674 * Switch ethernet frame when in layer 3 mode (i.e. using IP 6675 * layer to do the routing). 6676 * 6677 * There is a large amount of overlap between this function and 6678 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 6679 * both these functions. 6680 */ 6681 void 6682 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 6683 vsw_port_t *arg, mac_resource_handle_t mrh) 6684 { 6685 struct ether_header *ehp; 6686 vsw_port_t *port = NULL; 6687 mblk_t *bp = NULL; 6688 vsw_port_list_t *plist = &vswp->plist; 6689 6690 D1(vswp, "%s: enter (caller %d)", __func__, caller); 6691 6692 /* 6693 * In layer 3 mode should only ever be switching packets 6694 * between IP layer and vnet devices. So make sure thats 6695 * who is invoking us. 6696 */ 6697 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 6698 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 6699 freemsgchain(mp); 6700 return; 6701 } 6702 6703 /* process the chain of packets */ 6704 bp = mp; 6705 while (bp) { 6706 mp = bp; 6707 bp = bp->b_next; 6708 mp->b_next = mp->b_prev = NULL; 6709 ehp = (struct ether_header *)mp->b_rptr; 6710 6711 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 6712 __func__, MBLKSIZE(mp), MBLKL(mp)); 6713 6714 READ_ENTER(&plist->lockrw); 6715 port = vsw_lookup_fdb(vswp, ehp); 6716 if (port) { 6717 /* 6718 * Mark port as in-use. 6719 */ 6720 mutex_enter(&port->ref_lock); 6721 port->ref_cnt++; 6722 mutex_exit(&port->ref_lock); 6723 RW_EXIT(&plist->lockrw); 6724 6725 D2(vswp, "%s: sending to target port", __func__); 6726 (void) vsw_portsend(port, mp); 6727 6728 /* 6729 * Finished with port so decrement ref count and 6730 * check if should wake delete thread. 
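	 * (The delete path blocks on ref_cv until ref_cnt drops to
	 * zero, so the cv_signal() below is what allows a pending
	 * port delete to make progress.)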
6731 */ 6732 mutex_enter(&port->ref_lock); 6733 port->ref_cnt--; 6734 if (port->ref_cnt == 0) 6735 cv_signal(&port->ref_cv); 6736 mutex_exit(&port->ref_lock); 6737 } else { 6738 RW_EXIT(&plist->lockrw); 6739 /* 6740 * Destination not in FDB 6741 * 6742 * If the destination is broadcast or 6743 * multicast forward the packet to all 6744 * (VNETPORTs, PHYSDEV, LOCALDEV), 6745 * except the caller. 6746 */ 6747 if (IS_BROADCAST(ehp)) { 6748 D2(vswp, "%s: BROADCAST pkt", __func__); 6749 (void) vsw_forward_all(vswp, mp, caller, arg); 6750 } else if (IS_MULTICAST(ehp)) { 6751 D2(vswp, "%s: MULTICAST pkt", __func__); 6752 (void) vsw_forward_grp(vswp, mp, caller, arg); 6753 } else { 6754 /* 6755 * Unicast pkt from vnet that we don't have 6756 * an FDB entry for, so must be destinded for 6757 * the outside world. Attempt to send up to the 6758 * IP layer to allow it to deal with it. 6759 */ 6760 if (caller == VSW_VNETPORT) { 6761 READ_ENTER(&vswp->if_lockrw); 6762 if (vswp->if_state & VSW_IF_UP) { 6763 RW_EXIT(&vswp->if_lockrw); 6764 D2(vswp, "%s: sending up", 6765 __func__); 6766 mac_rx(vswp->if_mh, mrh, mp); 6767 } else { 6768 RW_EXIT(&vswp->if_lockrw); 6769 /* Interface down, drop pkt */ 6770 D2(vswp, "%s I/F down", 6771 __func__); 6772 freemsg(mp); 6773 } 6774 } 6775 } 6776 } 6777 } 6778 6779 D1(vswp, "%s: exit", __func__); 6780 } 6781 6782 /* 6783 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 6784 * except the caller (port on which frame arrived). 6785 */ 6786 static int 6787 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6788 { 6789 vsw_port_list_t *plist = &vswp->plist; 6790 vsw_port_t *portp; 6791 mblk_t *nmp = NULL; 6792 mblk_t *ret_m = NULL; 6793 int skip_port = 0; 6794 6795 D1(vswp, "vsw_forward_all: enter\n"); 6796 6797 /* 6798 * Broadcast message from inside ldoms so send to outside 6799 * world if in either of layer 2 modes. 6800 */ 6801 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6802 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6803 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 6804 6805 nmp = dupmsg(mp); 6806 if (nmp) { 6807 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6808 DERR(vswp, "%s: dropping pkt(s) " 6809 "consisting of %ld bytes of data for" 6810 " physical device", __func__, MBLKL(ret_m)); 6811 freemsg(ret_m); 6812 } 6813 } 6814 } 6815 6816 if (caller == VSW_VNETPORT) 6817 skip_port = 1; 6818 6819 /* 6820 * Broadcast message from other vnet (layer 2 or 3) or outside 6821 * world (layer 2 only), send up stack if plumbed. 6822 */ 6823 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 6824 READ_ENTER(&vswp->if_lockrw); 6825 if (vswp->if_state & VSW_IF_UP) { 6826 RW_EXIT(&vswp->if_lockrw); 6827 nmp = copymsg(mp); 6828 if (nmp) 6829 mac_rx(vswp->if_mh, NULL, nmp); 6830 } else { 6831 RW_EXIT(&vswp->if_lockrw); 6832 } 6833 } 6834 6835 /* send it to all VNETPORTs */ 6836 READ_ENTER(&plist->lockrw); 6837 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 6838 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 6839 /* 6840 * Caution ! - don't reorder these two checks as arg 6841 * will be NULL if the caller is PHYSDEV. skip_port is 6842 * only set if caller is VNETPORT. 
6843 */ 6844 if ((skip_port) && (portp == arg)) 6845 continue; 6846 else { 6847 nmp = dupmsg(mp); 6848 if (nmp) { 6849 (void) vsw_portsend(portp, nmp); 6850 } else { 6851 DERR(vswp, "vsw_forward_all: nmp NULL"); 6852 } 6853 } 6854 } 6855 RW_EXIT(&plist->lockrw); 6856 6857 freemsg(mp); 6858 6859 D1(vswp, "vsw_forward_all: exit\n"); 6860 return (0); 6861 } 6862 6863 /* 6864 * Forward pkts to any devices or interfaces which have registered 6865 * an interest in them (i.e. multicast groups). 6866 */ 6867 static int 6868 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 6869 { 6870 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 6871 mfdb_ent_t *entp = NULL; 6872 mfdb_ent_t *tpp = NULL; 6873 vsw_port_t *port; 6874 uint64_t key = 0; 6875 mblk_t *nmp = NULL; 6876 mblk_t *ret_m = NULL; 6877 boolean_t check_if = B_TRUE; 6878 6879 /* 6880 * Convert address to hash table key 6881 */ 6882 KEY_HASH(key, ehp->ether_dhost); 6883 6884 D1(vswp, "%s: key 0x%llx", __func__, key); 6885 6886 /* 6887 * If pkt came from either a vnet or down the stack (if we are 6888 * plumbed) and we are in layer 2 mode, then we send the pkt out 6889 * over the physical adapter, and then check to see if any other 6890 * vnets are interested in it. 6891 */ 6892 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 6893 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 6894 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 6895 nmp = dupmsg(mp); 6896 if (nmp) { 6897 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 6898 DERR(vswp, "%s: dropping pkt(s) consisting of " 6899 "%ld bytes of data for physical device", 6900 __func__, MBLKL(ret_m)); 6901 freemsg(ret_m); 6902 } 6903 } 6904 } 6905 6906 READ_ENTER(&vswp->mfdbrw); 6907 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 6908 (mod_hash_val_t *)&entp) != 0) { 6909 D3(vswp, "%s: no table entry found for addr 0x%llx", 6910 __func__, key); 6911 } else { 6912 /* 6913 * Send to list of devices associated with this address... 6914 */ 6915 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 6916 6917 /* dont send to ourselves */ 6918 if ((caller == VSW_VNETPORT) && 6919 (tpp->d_addr == (void *)arg)) { 6920 port = (vsw_port_t *)tpp->d_addr; 6921 D3(vswp, "%s: not sending to ourselves" 6922 " : port %d", __func__, port->p_instance); 6923 continue; 6924 6925 } else if ((caller == VSW_LOCALDEV) && 6926 (tpp->d_type == VSW_LOCALDEV)) { 6927 D3(vswp, "%s: not sending back up stack", 6928 __func__); 6929 continue; 6930 } 6931 6932 if (tpp->d_type == VSW_VNETPORT) { 6933 port = (vsw_port_t *)tpp->d_addr; 6934 D3(vswp, "%s: sending to port %ld for addr " 6935 "0x%llx", __func__, port->p_instance, key); 6936 6937 nmp = dupmsg(mp); 6938 if (nmp) 6939 (void) vsw_portsend(port, nmp); 6940 } else { 6941 if (vswp->if_state & VSW_IF_UP) { 6942 nmp = copymsg(mp); 6943 if (nmp) 6944 mac_rx(vswp->if_mh, NULL, nmp); 6945 check_if = B_FALSE; 6946 D3(vswp, "%s: sending up stack" 6947 " for addr 0x%llx", __func__, key); 6948 } 6949 } 6950 } 6951 } 6952 6953 RW_EXIT(&vswp->mfdbrw); 6954 6955 /* 6956 * If the pkt came from either a vnet or from physical device, 6957 * and if we havent already sent the pkt up the stack then we 6958 * check now if we can/should (i.e. the interface is plumbed 6959 * and in promisc mode). 
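	 * (The VSW_U_P() macro used below encapsulates the "plumbed
	 * and in promiscuous mode" test on if_state.)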
6960 */ 6961 if ((check_if) && 6962 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 6963 READ_ENTER(&vswp->if_lockrw); 6964 if (VSW_U_P(vswp->if_state)) { 6965 RW_EXIT(&vswp->if_lockrw); 6966 D3(vswp, "%s: (caller %d) finally sending up stack" 6967 " for addr 0x%llx", __func__, caller, key); 6968 nmp = copymsg(mp); 6969 if (nmp) 6970 mac_rx(vswp->if_mh, NULL, nmp); 6971 } else { 6972 RW_EXIT(&vswp->if_lockrw); 6973 } 6974 } 6975 6976 freemsg(mp); 6977 6978 D1(vswp, "%s: exit", __func__); 6979 6980 return (0); 6981 } 6982 6983 /* transmit the packet over the given port */ 6984 static int 6985 vsw_portsend(vsw_port_t *port, mblk_t *mp) 6986 { 6987 vsw_ldc_list_t *ldcl = &port->p_ldclist; 6988 vsw_ldc_t *ldcp; 6989 int status = 0; 6990 6991 6992 READ_ENTER(&ldcl->lockrw); 6993 /* 6994 * Note for now, we have a single channel. 6995 */ 6996 ldcp = ldcl->head; 6997 if (ldcp == NULL) { 6998 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 6999 freemsg(mp); 7000 RW_EXIT(&ldcl->lockrw); 7001 return (1); 7002 } 7003 7004 /* 7005 * Send the message out using the appropriate 7006 * transmit function which will free mblock when it 7007 * is finished with it. 7008 */ 7009 mutex_enter(&port->tx_lock); 7010 if (port->transmit != NULL) 7011 status = (*port->transmit)(ldcp, mp); 7012 else { 7013 freemsg(mp); 7014 } 7015 mutex_exit(&port->tx_lock); 7016 7017 RW_EXIT(&ldcl->lockrw); 7018 7019 return (status); 7020 } 7021 7022 /* 7023 * Send packet out via descriptor ring to a logical device. 7024 */ 7025 static int 7026 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 7027 { 7028 vio_dring_msg_t dring_pkt; 7029 dring_info_t *dp = NULL; 7030 vsw_private_desc_t *priv_desc = NULL; 7031 vnet_public_desc_t *pub = NULL; 7032 vsw_t *vswp = ldcp->ldc_vswp; 7033 mblk_t *bp; 7034 size_t n, size; 7035 caddr_t bufp; 7036 int idx; 7037 int status = LDC_TX_SUCCESS; 7038 7039 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 7040 7041 /* TODO: make test a macro */ 7042 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 7043 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 7044 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 7045 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 7046 ldcp->lane_out.lstate); 7047 freemsg(mp); 7048 return (LDC_TX_FAILURE); 7049 } 7050 7051 /* 7052 * Note - using first ring only, this may change 7053 * in the future. 7054 */ 7055 READ_ENTER(&ldcp->lane_out.dlistrw); 7056 if ((dp = ldcp->lane_out.dringp) == NULL) { 7057 RW_EXIT(&ldcp->lane_out.dlistrw); 7058 DERR(vswp, "%s(%lld): no dring for outbound lane on" 7059 " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); 7060 freemsg(mp); 7061 return (LDC_TX_FAILURE); 7062 } 7063 7064 size = msgsize(mp); 7065 if (size > (size_t)ETHERMAX) { 7066 RW_EXIT(&ldcp->lane_out.dlistrw); 7067 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 7068 ldcp->ldc_id, size); 7069 freemsg(mp); 7070 return (LDC_TX_FAILURE); 7071 } 7072 7073 /* 7074 * Find a free descriptor 7075 * 7076 * Note: for the moment we are assuming that we will only 7077 * have one dring going from the switch to each of its 7078 * peers. This may change in the future. 
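	 * vsw_dring_find_free_desc() resumes its search from where the
	 * previous one finished (dp->end_idx) and, on success, marks
	 * the descriptor VIO_DESC_READY under its dstate_lock.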
7079 */ 7080 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 7081 D2(vswp, "%s(%lld): no descriptor available for ring " 7082 "at 0x%llx", __func__, ldcp->ldc_id, dp); 7083 7084 /* nothing more we can do */ 7085 status = LDC_TX_NORESOURCES; 7086 goto vsw_dringsend_free_exit; 7087 } else { 7088 D2(vswp, "%s(%lld): free private descriptor found at pos %ld " 7089 "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc); 7090 } 7091 7092 /* copy data into the descriptor */ 7093 bufp = priv_desc->datap; 7094 bufp += VNET_IPALIGN; 7095 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 7096 n = MBLKL(bp); 7097 bcopy(bp->b_rptr, bufp, n); 7098 bufp += n; 7099 } 7100 7101 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 7102 7103 pub = priv_desc->descp; 7104 pub->nbytes = priv_desc->datalen; 7105 7106 mutex_enter(&priv_desc->dstate_lock); 7107 pub->hdr.dstate = VIO_DESC_READY; 7108 mutex_exit(&priv_desc->dstate_lock); 7109 7110 /* 7111 * Determine whether or not we need to send a message to our 7112 * peer prompting them to read our newly updated descriptor(s). 7113 */ 7114 mutex_enter(&dp->restart_lock); 7115 if (dp->restart_reqd) { 7116 dp->restart_reqd = B_FALSE; 7117 mutex_exit(&dp->restart_lock); 7118 7119 /* 7120 * Send a vio_dring_msg to peer to prompt them to read 7121 * the updated descriptor ring. 7122 */ 7123 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 7124 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 7125 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 7126 dring_pkt.tag.vio_sid = ldcp->local_session; 7127 7128 /* Note - for now using first ring */ 7129 dring_pkt.dring_ident = dp->ident; 7130 7131 mutex_enter(&ldcp->lane_out.seq_lock); 7132 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 7133 mutex_exit(&ldcp->lane_out.seq_lock); 7134 7135 /* 7136 * If last_ack_recv is -1 then we know we've not 7137 * received any ack's yet, so this must be the first 7138 * msg sent, so set the start to the begining of the ring. 7139 */ 7140 mutex_enter(&dp->dlock); 7141 if (dp->last_ack_recv == -1) { 7142 dring_pkt.start_idx = 0; 7143 } else { 7144 dring_pkt.start_idx = 7145 (dp->last_ack_recv + 1) % dp->num_descriptors; 7146 } 7147 dring_pkt.end_idx = -1; 7148 mutex_exit(&dp->dlock); 7149 7150 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 7151 ldcp->ldc_id, dp, dring_pkt.dring_ident); 7152 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 7153 __func__, ldcp->ldc_id, dring_pkt.start_idx, 7154 dring_pkt.end_idx, dring_pkt.seq_num); 7155 7156 RW_EXIT(&ldcp->lane_out.dlistrw); 7157 7158 (void) vsw_send_msg(ldcp, (void *)&dring_pkt, 7159 sizeof (vio_dring_msg_t), B_TRUE); 7160 7161 /* free the message block */ 7162 freemsg(mp); 7163 return (status); 7164 7165 } else { 7166 mutex_exit(&dp->restart_lock); 7167 D2(vswp, "%s(%lld): updating descp %d", __func__, 7168 ldcp->ldc_id, idx); 7169 } 7170 7171 vsw_dringsend_free_exit: 7172 7173 RW_EXIT(&ldcp->lane_out.dlistrw); 7174 7175 /* free the message block */ 7176 freemsg(mp); 7177 7178 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 7179 return (status); 7180 } 7181 7182 /* 7183 * Send an in-band descriptor message over ldc. 
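	 * In this mode the LDC memory cookies describing the data are
	 * carried within the message itself, rather than in a shared
	 * descriptor ring; the peer pulls the data across with
	 * ldc_mem_copy() (see vsw_process_data_ibnd_pkt() above).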
7184 */ 7185 static int 7186 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 7187 { 7188 vsw_t *vswp = ldcp->ldc_vswp; 7189 vnet_ibnd_desc_t ibnd_msg; 7190 vsw_private_desc_t *priv_desc = NULL; 7191 dring_info_t *dp = NULL; 7192 size_t n, size = 0; 7193 caddr_t bufp; 7194 mblk_t *bp; 7195 int idx, i; 7196 int status = LDC_TX_SUCCESS; 7197 static int warn_msg = 1; 7198 7199 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 7200 7201 ASSERT(mp != NULL); 7202 7203 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 7204 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 7205 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 7206 __func__, ldcp->ldc_id, ldcp->ldc_status, 7207 ldcp->lane_out.lstate); 7208 freemsg(mp); 7209 return (LDC_TX_FAILURE); 7210 } 7211 7212 /* 7213 * only expect single dring to exist, which we use 7214 * as an internal buffer, rather than a transfer channel. 7215 */ 7216 READ_ENTER(&ldcp->lane_out.dlistrw); 7217 if ((dp = ldcp->lane_out.dringp) == NULL) { 7218 DERR(vswp, "%s(%lld): no dring for outbound lane", 7219 __func__, ldcp->ldc_id); 7220 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__, 7221 ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate); 7222 RW_EXIT(&ldcp->lane_out.dlistrw); 7223 freemsg(mp); 7224 return (LDC_TX_FAILURE); 7225 } 7226 7227 size = msgsize(mp); 7228 if (size > (size_t)ETHERMAX) { 7229 RW_EXIT(&ldcp->lane_out.dlistrw); 7230 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 7231 ldcp->ldc_id, size); 7232 freemsg(mp); 7233 return (LDC_TX_FAILURE); 7234 } 7235 7236 /* 7237 * Find a free descriptor in our buffer ring 7238 */ 7239 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 7240 RW_EXIT(&ldcp->lane_out.dlistrw); 7241 if (warn_msg) { 7242 DERR(vswp, "%s(%lld): no descriptor available for ring " 7243 "at 0x%llx", __func__, ldcp->ldc_id, dp); 7244 warn_msg = 0; 7245 } 7246 7247 /* nothing more we can do */ 7248 status = LDC_TX_NORESOURCES; 7249 goto vsw_descrsend_free_exit; 7250 } else { 7251 D2(vswp, "%s(%lld): free private descriptor found at pos " 7252 "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc); 7253 warn_msg = 1; 7254 } 7255 7256 /* copy data into the descriptor */ 7257 bufp = priv_desc->datap; 7258 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 7259 n = MBLKL(bp); 7260 bcopy(bp->b_rptr, bufp, n); 7261 bufp += n; 7262 } 7263 7264 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 7265 7266 /* create and send the in-band descp msg */ 7267 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 7268 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 7269 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 7270 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 7271 7272 mutex_enter(&ldcp->lane_out.seq_lock); 7273 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 7274 mutex_exit(&ldcp->lane_out.seq_lock); 7275 7276 /* 7277 * Copy the mem cookies describing the data from the 7278 * private region of the descriptor ring into the inband 7279 * descriptor. 
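	 * (priv_desc->ncookies was checked against VSW_MAX_COOKIES when
	 * the ring was set up in vsw_setup_ring(), which bounds this
	 * copy.)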
7280	 */
7281	for (i = 0; i < priv_desc->ncookies; i++) {
7282		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
7283		    sizeof (ldc_mem_cookie_t));
7284	}
7285
7286	ibnd_msg.hdr.desc_handle = idx;
7287	ibnd_msg.ncookies = priv_desc->ncookies;
7288	ibnd_msg.nbytes = size;
7289
7290	RW_EXIT(&ldcp->lane_out.dlistrw);
7291
7292	(void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
7293	    sizeof (vnet_ibnd_desc_t), B_TRUE);
7294
7295 vsw_descrsend_free_exit:
7296
7297	/* free the allocated message blocks */
7298	freemsg(mp);
7299
7300	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
7301	return (status);
7302 }
7303
7304 static void
7305 vsw_send_ver(void *arg)
7306 {
7307	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
7308	vsw_t		*vswp = ldcp->ldc_vswp;
7309	lane_t		*lp = &ldcp->lane_out;
7310	vio_ver_msg_t	ver_msg;
7311
7312	D1(vswp, "%s enter", __func__);
7313
7314	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7315	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7316	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
7317	ver_msg.tag.vio_sid = ldcp->local_session;
7318
7319	ver_msg.ver_major = vsw_versions[0].ver_major;
7320	ver_msg.ver_minor = vsw_versions[0].ver_minor;
7321	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
7322
7323	lp->lstate |= VSW_VER_INFO_SENT;
7324	lp->ver_major = ver_msg.ver_major;
7325	lp->ver_minor = ver_msg.ver_minor;
7326
7327	DUMP_TAG(ver_msg.tag);
7328
7329	(void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);
7330
7331	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
7332 }
7333
7334 static void
7335 vsw_send_attr(vsw_ldc_t *ldcp)
7336 {
7337	vsw_t		*vswp = ldcp->ldc_vswp;
7338	lane_t		*lp = &ldcp->lane_out;
7339	vnet_attr_msg_t	attr_msg;
7340
7341	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7342
7343	/*
7344	 * Subtype is set to INFO by default
7345	 */
7346	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7347	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7348	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
7349	attr_msg.tag.vio_sid = ldcp->local_session;
7350
7351	/* payload copied from default settings for lane */
7352	attr_msg.mtu = lp->mtu;
7353	attr_msg.addr_type = lp->addr_type;
7354	attr_msg.xfer_mode = lp->xfer_mode;
7355	attr_msg.ack_freq = lp->ack_freq;
7356
7357	READ_ENTER(&vswp->if_lockrw);
7358	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
7359	RW_EXIT(&vswp->if_lockrw);
7360
7361	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
7362
7363	DUMP_TAG(attr_msg.tag);
7364
7365	(void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);
7366
7367	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7368 }
7369
7370 /*
7371 * Create dring info msg (which also results in the creation of
7372 * a dring).
7373 */
7374 static vio_dring_reg_msg_t *
7375 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
7376 {
7377	vio_dring_reg_msg_t	*mp;
7378	dring_info_t		*dp;
7379	vsw_t			*vswp = ldcp->ldc_vswp;
7380
7381	D1(vswp, "vsw_create_dring_info_pkt enter\n");
7382
7383	/*
7384	 * If we can't create a dring, obviously no point sending
7385	 * a message.
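	 * On success the caller owns the returned message and is
	 * responsible for freeing it with kmem_free() (as
	 * vsw_send_dring_info() does below).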
7386	 */
7387	if ((dp = vsw_create_dring(ldcp)) == NULL)
7388		return (NULL);
7389
7390	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
7391
7392	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
7393	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
7394	mp->tag.vio_subtype_env = VIO_DRING_REG;
7395	mp->tag.vio_sid = ldcp->local_session;
7396
7397	/* payload */
7398	mp->num_descriptors = dp->num_descriptors;
7399	mp->descriptor_size = dp->descriptor_size;
7400	mp->options = dp->options;
7401	mp->ncookies = dp->ncookies;
7402	bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));
7403
7404	mp->dring_ident = 0;
7405
7406	D1(vswp, "vsw_create_dring_info_pkt exit\n");
7407
7408	return (mp);
7409 }
7410
7411 static void
7412 vsw_send_dring_info(vsw_ldc_t *ldcp)
7413 {
7414	vio_dring_reg_msg_t	*dring_msg;
7415	vsw_t			*vswp = ldcp->ldc_vswp;
7416
7417	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
7418
7419	dring_msg = vsw_create_dring_info_pkt(ldcp);
7420	if (dring_msg == NULL) {
7421		cmn_err(CE_WARN, "!vsw%d: %s: error creating msg",
7422		    vswp->instance, __func__);
7423		return;
7424	}
7425
7426	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
7427
7428	DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg);
7429
7430	(void) vsw_send_msg(ldcp, dring_msg,
7431	    sizeof (vio_dring_reg_msg_t), B_TRUE);
7432
7433	kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t));
7434
7435	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
7436 }
7437
7438 static void
7439 vsw_send_rdx(vsw_ldc_t *ldcp)
7440 {
7441	vsw_t		*vswp = ldcp->ldc_vswp;
7442	vio_rdx_msg_t	rdx_msg;
7443
7444	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
7445
7446	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
7447	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
7448	rdx_msg.tag.vio_subtype_env = VIO_RDX;
7449	rdx_msg.tag.vio_sid = ldcp->local_session;
7450
7451	ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;
7452
7453	DUMP_TAG(rdx_msg.tag);
7454
7455	(void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);
7456
7457	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
7458 }
7459
7460 /*
7461 * Generic routine to send message out over ldc channel.
7462 *
7463 * It is possible that when we attempt to write over the ldc channel
7464 * that we get notified that it has been reset. Depending on the value
7465 * of the handle_reset flag we either handle that event here or simply
7466 * notify the caller that the channel was reset.
7467 */
7468 static int
7469 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
7470 {
7471	int		rv, retries = vsw_wretries; /* local retry count */
7472	size_t		msglen = size;
7473	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
7474	vsw_t		*vswp = ldcp->ldc_vswp;
7475
7476	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
7477	    ldcp->ldc_id, size);
7478
7479	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
7480	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
7481	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
7482
7483	mutex_enter(&ldcp->ldc_txlock);
7484	do {
7485		msglen = size;
7486		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
7487	} while (rv == EWOULDBLOCK && --retries > 0);
7488
7489	if ((rv != 0) || (msglen != size)) {
7490		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
7491		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
7492	}
7493	mutex_exit(&ldcp->ldc_txlock);
7494
7495	/*
7496	 * If channel has been reset we either handle it here or
7497	 * simply report back that it has been reset and let caller
7498	 * decide what to do.
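	 * Callers which hold the dlistrw lock must therefore pass
	 * handle_reset as B_FALSE and deal with any ECONNRESET return
	 * themselves once they have dropped the lock (see the dring
	 * restart path above for an example).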
7499	 */
7500	if (rv == ECONNRESET) {
7501		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);
7502
7503		/*
7504		 * N.B - must never be holding the dlistrw lock when
7505		 * we do a reset of the channel.
7506		 */
7507		if (handle_reset) {
7508			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
7509		}
7510	}
7511
7512	return (rv);
7513 }
7514
7515 /*
7516 * Add an entry into FDB, for the given mac address and port_id.
7517 * Returns 0 on success, 1 on failure.
7518 *
7519 * Lock protecting FDB must be held by calling process.
7520 */
7521 static int
7522 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
7523 {
7524	uint64_t	addr = 0;
7525
7526	D1(vswp, "%s: enter", __func__);
7527
7528	KEY_HASH(addr, port->p_macaddr);
7529
7530	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7531
7532	/*
7533	 * Note: duplicate keys will be rejected by mod_hash.
7534	 */
7535	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
7536	    (mod_hash_val_t)port) != 0) {
7537		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
7538		return (1);
7539	}
7540
7541	D1(vswp, "%s: exit", __func__);
7542	return (0);
7543 }
7544
7545 /*
7546 * Remove an entry from FDB.
7547 * Always returns 0; removing a non-existent entry is not treated as an error.
7548 */
7549 static int
7550 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
7551 {
7552	uint64_t	addr = 0;
7553
7554	D1(vswp, "%s: enter", __func__);
7555
7556	KEY_HASH(addr, port->p_macaddr);
7557
7558	D2(vswp, "%s: key = 0x%llx", __func__, addr);
7559
7560	(void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr);
7561
7562	D1(vswp, "%s: exit", __func__);
7563
7564	return (0);
7565 }
7566
7567 /*
7568 * Search fdb for a given mac address.
7569 * Returns pointer to the entry if found, else returns NULL.
7570 */
7571 static vsw_port_t *
7572 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
7573 {
7574	uint64_t	key = 0;
7575	vsw_port_t	*port = NULL;
7576
7577	D1(vswp, "%s: enter", __func__);
7578
7579	KEY_HASH(key, ehp->ether_dhost);
7580
7581	D2(vswp, "%s: key = 0x%llx", __func__, key);
7582
7583	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
7584	    (mod_hash_val_t *)&port) != 0) {
7585		D2(vswp, "%s: no port found", __func__);
7586		return (NULL);
7587	}
7588
7589	D1(vswp, "%s: exit", __func__);
7590
7591	return (port);
7592 }
7593
7594 /*
7595 * Add or remove multicast address(es).
7596 *
7597 * Returns 0 on success, 1 on failure.
7598 */
7599 static int
7600 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
7601 {
7602	mcst_addr_t	*mcst_p = NULL;
7603	vsw_t		*vswp = port->p_vswp;
7604	uint64_t	addr = 0x0;
7605	int		i;
7606
7607	D1(vswp, "%s: enter", __func__);
7608
7609	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
7610
7611	mutex_enter(&vswp->mac_lock);
7612	if (vswp->mh == NULL) {
7613		mutex_exit(&vswp->mac_lock);
7614		return (1);
7615	}
7616	mutex_exit(&vswp->mac_lock);
7617
7618	for (i = 0; i < mcst_pkt->count; i++) {
7619		/*
7620		 * Convert address into form that can be used
7621		 * as hash table key.
7622		 */
7623		KEY_HASH(addr, mcst_pkt->mca[i]);
7624
7625		/*
7626		 * Add or delete the specified address/port combination.
7627		 */
7628		if (mcst_pkt->set == 0x1) {
7629			D3(vswp, "%s: adding multicast address 0x%llx for "
7630			    "port %ld", __func__, addr, port->p_instance);
7631			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
7632				/*
7633				 * Update the list of multicast
7634				 * addresses contained within the
7635				 * port structure to include this new
7636				 * one.
7637 */ 7638 mcst_p = kmem_alloc( 7639 sizeof (mcst_addr_t), KM_NOSLEEP); 7640 if (mcst_p == NULL) { 7641 DERR(vswp, "%s: unable to alloc mem", 7642 __func__); 7643 return (1); 7644 } 7645 7646 mcst_p->nextp = NULL; 7647 mcst_p->addr = addr; 7648 7649 mutex_enter(&port->mca_lock); 7650 mcst_p->nextp = port->mcap; 7651 port->mcap = mcst_p; 7652 mutex_exit(&port->mca_lock); 7653 7654 /* 7655 * Program the address into HW. If the addr 7656 * has already been programmed then the MAC 7657 * just increments a ref counter (which is 7658 * used when the address is being deleted) 7659 */ 7660 mutex_enter(&vswp->mac_lock); 7661 if ((vswp->mh == NULL) || 7662 mac_multicst_add(vswp->mh, 7663 (uchar_t *)&mcst_pkt->mca[i])) { 7664 mutex_exit(&vswp->mac_lock); 7665 cmn_err(CE_WARN, "!vsw%d: unable to " 7666 "add multicast address", 7667 vswp->instance); 7668 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7669 addr, port); 7670 vsw_del_addr(VSW_VNETPORT, port, addr); 7671 return (1); 7672 } 7673 mutex_exit(&vswp->mac_lock); 7674 7675 } else { 7676 DERR(vswp, "%s: error adding multicast " 7677 "address 0x%llx for port %ld", 7678 __func__, addr, port->p_instance); 7679 return (1); 7680 } 7681 } else { 7682 /* 7683 * Delete an entry from the multicast hash 7684 * table and update the address list 7685 * appropriately. 7686 */ 7687 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 7688 D3(vswp, "%s: deleting multicast address " 7689 "0x%llx for port %ld", __func__, addr, 7690 port->p_instance); 7691 7692 vsw_del_addr(VSW_VNETPORT, port, addr); 7693 7694 /* 7695 * Remove the address from HW. The address 7696 * will actually only be removed once the ref 7697 * count within the MAC layer has dropped to 7698 * zero. I.e. we can safely call this fn even 7699 * if other ports are interested in this 7700 * address. 7701 */ 7702 mutex_enter(&vswp->mac_lock); 7703 if ((vswp->mh == NULL) || 7704 mac_multicst_remove(vswp->mh, 7705 (uchar_t *)&mcst_pkt->mca[i])) { 7706 mutex_exit(&vswp->mac_lock); 7707 cmn_err(CE_WARN, "!vsw%d: unable to " 7708 "remove multicast address", 7709 vswp->instance); 7710 return (1); 7711 } 7712 mutex_exit(&vswp->mac_lock); 7713 7714 } else { 7715 DERR(vswp, "%s: error deleting multicast " 7716 "addr 0x%llx for port %ld", 7717 __func__, addr, port->p_instance); 7718 return (1); 7719 } 7720 } 7721 } 7722 D1(vswp, "%s: exit", __func__); 7723 return (0); 7724 } 7725 7726 /* 7727 * Add a new multicast entry. 7728 * 7729 * Search hash table based on address. If match found then 7730 * update associated val (which is chain of ports), otherwise 7731 * create new key/val (addr/port) pair and insert into table. 7732 */ 7733 static int 7734 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7735 { 7736 int dup = 0; 7737 int rv = 0; 7738 mfdb_ent_t *ment = NULL; 7739 mfdb_ent_t *tmp_ent = NULL; 7740 mfdb_ent_t *new_ent = NULL; 7741 void *tgt = NULL; 7742 7743 if (devtype == VSW_VNETPORT) { 7744 /* 7745 * Being invoked from a vnet. 7746 */ 7747 ASSERT(arg != NULL); 7748 tgt = arg; 7749 D2(NULL, "%s: port %d : address 0x%llx", __func__, 7750 ((vsw_port_t *)arg)->p_instance, addr); 7751 } else { 7752 /* 7753 * We are being invoked via the m_multicst mac entry 7754 * point. 
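	 * In that case the target recorded in the new mFDB entry is the
	 * vsw instance itself, and the entry's d_type will be the
	 * caller-supplied devtype (VSW_LOCALDEV).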
7755 */ 7756 D2(NULL, "%s: address 0x%llx", __func__, addr); 7757 tgt = (void *)vswp; 7758 } 7759 7760 WRITE_ENTER(&vswp->mfdbrw); 7761 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7762 (mod_hash_val_t *)&ment) != 0) { 7763 7764 /* address not currently in table */ 7765 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7766 ment->d_addr = (void *)tgt; 7767 ment->d_type = devtype; 7768 ment->nextp = NULL; 7769 7770 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 7771 (mod_hash_val_t)ment) != 0) { 7772 DERR(vswp, "%s: hash table insertion failed", __func__); 7773 kmem_free(ment, sizeof (mfdb_ent_t)); 7774 rv = 1; 7775 } else { 7776 D2(vswp, "%s: added initial entry for 0x%llx to " 7777 "table", __func__, addr); 7778 } 7779 } else { 7780 /* 7781 * Address in table. Check to see if specified port 7782 * is already associated with the address. If not add 7783 * it now. 7784 */ 7785 tmp_ent = ment; 7786 while (tmp_ent != NULL) { 7787 if (tmp_ent->d_addr == (void *)tgt) { 7788 if (devtype == VSW_VNETPORT) { 7789 DERR(vswp, "%s: duplicate port entry " 7790 "found for portid %ld and key " 7791 "0x%llx", __func__, 7792 ((vsw_port_t *)arg)->p_instance, 7793 addr); 7794 } else { 7795 DERR(vswp, "%s: duplicate entry found" 7796 "for key 0x%llx", __func__, addr); 7797 } 7798 rv = 1; 7799 dup = 1; 7800 break; 7801 } 7802 tmp_ent = tmp_ent->nextp; 7803 } 7804 7805 /* 7806 * Port not on list so add it to end now. 7807 */ 7808 if (0 == dup) { 7809 D2(vswp, "%s: added entry for 0x%llx to table", 7810 __func__, addr); 7811 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 7812 new_ent->d_addr = (void *)tgt; 7813 new_ent->d_type = devtype; 7814 new_ent->nextp = NULL; 7815 7816 tmp_ent = ment; 7817 while (tmp_ent->nextp != NULL) 7818 tmp_ent = tmp_ent->nextp; 7819 7820 tmp_ent->nextp = new_ent; 7821 } 7822 } 7823 7824 RW_EXIT(&vswp->mfdbrw); 7825 return (rv); 7826 } 7827 7828 /* 7829 * Remove a multicast entry from the hashtable. 7830 * 7831 * Search hash table based on address. If match found, scan 7832 * list of ports associated with address. If specified port 7833 * found remove it from list. 7834 */ 7835 static int 7836 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 7837 { 7838 mfdb_ent_t *ment = NULL; 7839 mfdb_ent_t *curr_p, *prev_p; 7840 void *tgt = NULL; 7841 7842 D1(vswp, "%s: enter", __func__); 7843 7844 if (devtype == VSW_VNETPORT) { 7845 tgt = (vsw_port_t *)arg; 7846 D2(vswp, "%s: removing port %d from mFDB for address" 7847 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr); 7848 } else { 7849 D2(vswp, "%s: removing entry", __func__); 7850 tgt = (void *)vswp; 7851 } 7852 7853 WRITE_ENTER(&vswp->mfdbrw); 7854 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 7855 (mod_hash_val_t *)&ment) != 0) { 7856 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 7857 RW_EXIT(&vswp->mfdbrw); 7858 return (1); 7859 } 7860 7861 prev_p = curr_p = ment; 7862 7863 while (curr_p != NULL) { 7864 if (curr_p->d_addr == (void *)tgt) { 7865 if (devtype == VSW_VNETPORT) { 7866 D2(vswp, "%s: port %d found", __func__, 7867 ((vsw_port_t *)tgt)->p_instance); 7868 } else { 7869 D2(vswp, "%s: instance found", __func__); 7870 } 7871 7872 if (prev_p == curr_p) { 7873 /* 7874 * head of list, if no other element is in 7875 * list then destroy this entry, otherwise 7876 * just replace it with updated value. 
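	 * (mod_hash_replace() keeps the key present in the table while
	 * swapping its value for the shortened list.)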
7877 */ 7878 ment = curr_p->nextp; 7879 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7880 if (ment == NULL) { 7881 (void) mod_hash_destroy(vswp->mfdb, 7882 (mod_hash_val_t)addr); 7883 } else { 7884 (void) mod_hash_replace(vswp->mfdb, 7885 (mod_hash_key_t)addr, 7886 (mod_hash_val_t)ment); 7887 } 7888 } else { 7889 /* 7890 * Not head of list, no need to do 7891 * replacement, just adjust list pointers. 7892 */ 7893 prev_p->nextp = curr_p->nextp; 7894 kmem_free(curr_p, sizeof (mfdb_ent_t)); 7895 } 7896 break; 7897 } 7898 7899 prev_p = curr_p; 7900 curr_p = curr_p->nextp; 7901 } 7902 7903 RW_EXIT(&vswp->mfdbrw); 7904 7905 D1(vswp, "%s: exit", __func__); 7906 7907 return (0); 7908 } 7909 7910 /* 7911 * Port is being deleted, but has registered an interest in one 7912 * or more multicast groups. Using the list of addresses maintained 7913 * within the port structure find the appropriate entry in the hash 7914 * table and remove this port from the list of interested ports. 7915 */ 7916 static void 7917 vsw_del_mcst_port(vsw_port_t *port) 7918 { 7919 mcst_addr_t *mcst_p = NULL; 7920 vsw_t *vswp = port->p_vswp; 7921 7922 D1(vswp, "%s: enter", __func__); 7923 7924 mutex_enter(&port->mca_lock); 7925 while (port->mcap != NULL) { 7926 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 7927 port->mcap->addr, port); 7928 7929 mcst_p = port->mcap->nextp; 7930 kmem_free(port->mcap, sizeof (mcst_addr_t)); 7931 port->mcap = mcst_p; 7932 } 7933 mutex_exit(&port->mca_lock); 7934 7935 D1(vswp, "%s: exit", __func__); 7936 } 7937 7938 /* 7939 * This vsw instance is detaching, but has registered an interest in one 7940 * or more multicast groups. Using the list of addresses maintained 7941 * within the vsw structure find the appropriate entry in the hash 7942 * table and remove this instance from the list of interested ports. 7943 */ 7944 static void 7945 vsw_del_mcst_vsw(vsw_t *vswp) 7946 { 7947 mcst_addr_t *next_p = NULL; 7948 7949 D1(vswp, "%s: enter", __func__); 7950 7951 mutex_enter(&vswp->mca_lock); 7952 7953 while (vswp->mcap != NULL) { 7954 DERR(vswp, "%s: deleting addr 0x%llx", 7955 __func__, vswp->mcap->addr); 7956 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL); 7957 7958 next_p = vswp->mcap->nextp; 7959 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 7960 vswp->mcap = next_p; 7961 } 7962 7963 vswp->mcap = NULL; 7964 mutex_exit(&vswp->mca_lock); 7965 7966 D1(vswp, "%s: exit", __func__); 7967 } 7968 7969 7970 /* 7971 * Remove the specified address from the list of address maintained 7972 * in this port node. 
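	 * (When devtype is not VSW_VNETPORT, arg is taken to be the
	 * vsw_t itself and the address is removed from the instance's
	 * own mcap list instead.)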
7973 */ 7974 static void 7975 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 7976 { 7977 vsw_t *vswp = NULL; 7978 vsw_port_t *port = NULL; 7979 mcst_addr_t *prev_p = NULL; 7980 mcst_addr_t *curr_p = NULL; 7981 7982 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 7983 __func__, devtype, addr); 7984 7985 if (devtype == VSW_VNETPORT) { 7986 port = (vsw_port_t *)arg; 7987 mutex_enter(&port->mca_lock); 7988 prev_p = curr_p = port->mcap; 7989 } else { 7990 vswp = (vsw_t *)arg; 7991 mutex_enter(&vswp->mca_lock); 7992 prev_p = curr_p = vswp->mcap; 7993 } 7994 7995 while (curr_p != NULL) { 7996 if (curr_p->addr == addr) { 7997 D2(NULL, "%s: address found", __func__); 7998 /* match found */ 7999 if (prev_p == curr_p) { 8000 /* list head */ 8001 if (devtype == VSW_VNETPORT) 8002 port->mcap = curr_p->nextp; 8003 else 8004 vswp->mcap = curr_p->nextp; 8005 } else { 8006 prev_p->nextp = curr_p->nextp; 8007 } 8008 kmem_free(curr_p, sizeof (mcst_addr_t)); 8009 break; 8010 } else { 8011 prev_p = curr_p; 8012 curr_p = curr_p->nextp; 8013 } 8014 } 8015 8016 if (devtype == VSW_VNETPORT) 8017 mutex_exit(&port->mca_lock); 8018 else 8019 mutex_exit(&vswp->mca_lock); 8020 8021 D1(NULL, "%s: exit", __func__); 8022 } 8023 8024 /* 8025 * Creates a descriptor ring (dring) and links it into the 8026 * link of outbound drings for this channel. 8027 * 8028 * Returns NULL if creation failed. 8029 */ 8030 static dring_info_t * 8031 vsw_create_dring(vsw_ldc_t *ldcp) 8032 { 8033 vsw_private_desc_t *priv_addr = NULL; 8034 vsw_t *vswp = ldcp->ldc_vswp; 8035 ldc_mem_info_t minfo; 8036 dring_info_t *dp, *tp; 8037 int i; 8038 8039 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 8040 8041 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 8042 8043 /* create public section of ring */ 8044 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 8045 VSW_PUB_SIZE, &dp->handle)) != 0) { 8046 8047 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 8048 "failed", ldcp->ldc_id); 8049 goto create_fail_exit; 8050 } 8051 8052 ASSERT(dp->handle != NULL); 8053 8054 /* 8055 * Get the base address of the public section of the ring. 8056 */ 8057 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 8058 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 8059 ldcp->ldc_id); 8060 goto dring_fail_exit; 8061 } else { 8062 ASSERT(minfo.vaddr != 0); 8063 dp->pub_addr = minfo.vaddr; 8064 } 8065 8066 dp->num_descriptors = VSW_RING_NUM_EL; 8067 dp->descriptor_size = VSW_PUB_SIZE; 8068 dp->options = VIO_TX_DRING; 8069 dp->ncookies = 1; /* guaranteed by ldc */ 8070 8071 /* 8072 * create private portion of ring 8073 */ 8074 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 8075 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 8076 8077 if (vsw_setup_ring(ldcp, dp)) { 8078 DERR(vswp, "%s: unable to setup ring", __func__); 8079 goto dring_fail_exit; 8080 } 8081 8082 /* haven't used any descriptors yet */ 8083 dp->end_idx = 0; 8084 dp->last_ack_recv = -1; 8085 8086 /* bind dring to the channel */ 8087 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 8088 LDC_SHADOW_MAP, LDC_MEM_RW, 8089 &dp->cookie[0], &dp->ncookies)) != 0) { 8090 DERR(vswp, "vsw_create_dring: unable to bind to channel " 8091 "%lld", ldcp->ldc_id); 8092 goto dring_fail_exit; 8093 } 8094 8095 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 8096 dp->restart_reqd = B_TRUE; 8097 8098 /* 8099 * Only ever create rings for outgoing lane. Link it onto 8100 * end of list. 
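	 * (In practice only a single outbound dring is created per
	 * channel at present; see the "first ring only" notes in
	 * vsw_dringsend().)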
8101 */ 8102 WRITE_ENTER(&ldcp->lane_out.dlistrw); 8103 if (ldcp->lane_out.dringp == NULL) { 8104 D2(vswp, "vsw_create_dring: adding first outbound ring"); 8105 ldcp->lane_out.dringp = dp; 8106 } else { 8107 tp = ldcp->lane_out.dringp; 8108 while (tp->next != NULL) 8109 tp = tp->next; 8110 8111 tp->next = dp; 8112 } 8113 RW_EXIT(&ldcp->lane_out.dlistrw); 8114 8115 return (dp); 8116 8117 dring_fail_exit: 8118 (void) ldc_mem_dring_destroy(dp->handle); 8119 8120 create_fail_exit: 8121 if (dp->priv_addr != NULL) { 8122 priv_addr = dp->priv_addr; 8123 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8124 if (priv_addr->memhandle != NULL) 8125 (void) ldc_mem_free_handle( 8126 priv_addr->memhandle); 8127 priv_addr++; 8128 } 8129 kmem_free(dp->priv_addr, 8130 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 8131 } 8132 mutex_destroy(&dp->dlock); 8133 8134 kmem_free(dp, sizeof (dring_info_t)); 8135 return (NULL); 8136 } 8137 8138 /* 8139 * Create a ring consisting of just a private portion and link 8140 * it into the list of rings for the outbound lane. 8141 * 8142 * These type of rings are used primarily for temporary data 8143 * storage (i.e. as data buffers). 8144 */ 8145 void 8146 vsw_create_privring(vsw_ldc_t *ldcp) 8147 { 8148 dring_info_t *dp, *tp; 8149 vsw_t *vswp = ldcp->ldc_vswp; 8150 8151 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 8152 8153 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 8154 8155 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 8156 8157 /* no public section */ 8158 dp->pub_addr = NULL; 8159 8160 dp->priv_addr = kmem_zalloc( 8161 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 8162 8163 dp->num_descriptors = VSW_RING_NUM_EL; 8164 8165 if (vsw_setup_ring(ldcp, dp)) { 8166 DERR(vswp, "%s: setup of ring failed", __func__); 8167 kmem_free(dp->priv_addr, 8168 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 8169 mutex_destroy(&dp->dlock); 8170 kmem_free(dp, sizeof (dring_info_t)); 8171 return; 8172 } 8173 8174 /* haven't used any descriptors yet */ 8175 dp->end_idx = 0; 8176 8177 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 8178 dp->restart_reqd = B_TRUE; 8179 8180 /* 8181 * Only ever create rings for outgoing lane. Link it onto 8182 * end of list. 8183 */ 8184 WRITE_ENTER(&ldcp->lane_out.dlistrw); 8185 if (ldcp->lane_out.dringp == NULL) { 8186 D2(vswp, "%s: adding first outbound privring", __func__); 8187 ldcp->lane_out.dringp = dp; 8188 } else { 8189 tp = ldcp->lane_out.dringp; 8190 while (tp->next != NULL) 8191 tp = tp->next; 8192 8193 tp->next = dp; 8194 } 8195 RW_EXIT(&ldcp->lane_out.dlistrw); 8196 8197 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 8198 } 8199 8200 /* 8201 * Setup the descriptors in the dring. Returns 0 on success, 1 on 8202 * failure. 8203 */ 8204 int 8205 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 8206 { 8207 vnet_public_desc_t *pub_addr = NULL; 8208 vsw_private_desc_t *priv_addr = NULL; 8209 vsw_t *vswp = ldcp->ldc_vswp; 8210 uint64_t *tmpp; 8211 uint64_t offset = 0; 8212 uint32_t ncookies = 0; 8213 static char *name = "vsw_setup_ring"; 8214 int i, j, nc, rv; 8215 8216 priv_addr = dp->priv_addr; 8217 pub_addr = dp->pub_addr; 8218 8219 /* public section may be null but private should never be */ 8220 ASSERT(priv_addr != NULL); 8221 8222 /* 8223 * Allocate the region of memory which will be used to hold 8224 * the data the descriptors will refer to. 
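	 * The region is carved into VSW_RING_NUM_EL fixed-size slots of
	 * VSW_RING_EL_DATA_SZ bytes each; descriptor i's datap points
	 * at slot i.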
8225 */ 8226 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 8227 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 8228 8229 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 8230 dp->data_sz, dp->data_addr); 8231 8232 tmpp = (uint64_t *)dp->data_addr; 8233 offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp); 8234 8235 /* 8236 * Initialise some of the private and public (if they exist) 8237 * descriptor fields. 8238 */ 8239 for (i = 0; i < VSW_RING_NUM_EL; i++) { 8240 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 8241 8242 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 8243 &priv_addr->memhandle)) != 0) { 8244 DERR(vswp, "%s: alloc mem handle failed", name); 8245 goto setup_ring_cleanup; 8246 } 8247 8248 priv_addr->datap = (void *)tmpp; 8249 8250 rv = ldc_mem_bind_handle(priv_addr->memhandle, 8251 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 8252 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 8253 &(priv_addr->memcookie[0]), &ncookies); 8254 if (rv != 0) { 8255 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 8256 "(rv %d)", name, ldcp->ldc_id, rv); 8257 goto setup_ring_cleanup; 8258 } 8259 priv_addr->bound = 1; 8260 8261 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 8262 name, i, priv_addr->memcookie[0].addr, 8263 priv_addr->memcookie[0].size); 8264 8265 if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) { 8266 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 8267 "invalid num of cookies (%d) for size 0x%llx", 8268 name, ldcp->ldc_id, ncookies, VSW_RING_EL_DATA_SZ); 8269 8270 goto setup_ring_cleanup; 8271 } else { 8272 for (j = 1; j < ncookies; j++) { 8273 rv = ldc_mem_nextcookie(priv_addr->memhandle, 8274 &(priv_addr->memcookie[j])); 8275 if (rv != 0) { 8276 DERR(vswp, "%s: ldc_mem_nextcookie " 8277 "failed rv (%d)", name, rv); 8278 goto setup_ring_cleanup; 8279 } 8280 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 8281 "size 0x%llx", name, j, 8282 priv_addr->memcookie[j].addr, 8283 priv_addr->memcookie[j].size); 8284 } 8285 8286 } 8287 priv_addr->ncookies = ncookies; 8288 priv_addr->dstate = VIO_DESC_FREE; 8289 8290 if (pub_addr != NULL) { 8291 8292 /* link pub and private sides */ 8293 priv_addr->descp = pub_addr; 8294 8295 pub_addr->ncookies = priv_addr->ncookies; 8296 8297 for (nc = 0; nc < pub_addr->ncookies; nc++) { 8298 bcopy(&priv_addr->memcookie[nc], 8299 &pub_addr->memcookie[nc], 8300 sizeof (ldc_mem_cookie_t)); 8301 } 8302 8303 pub_addr->hdr.dstate = VIO_DESC_FREE; 8304 pub_addr++; 8305 } 8306 8307 /* 8308 * move to next element in the dring and the next 8309 * position in the data buffer. 8310 */ 8311 priv_addr++; 8312 tmpp += offset; 8313 } 8314 8315 return (0); 8316 8317 setup_ring_cleanup: 8318 priv_addr = dp->priv_addr; 8319 8320 for (j = 0; j < i; j++) { 8321 (void) ldc_mem_unbind_handle(priv_addr->memhandle); 8322 (void) ldc_mem_free_handle(priv_addr->memhandle); 8323 8324 mutex_destroy(&priv_addr->dstate_lock); 8325 8326 priv_addr++; 8327 } 8328 kmem_free(dp->data_addr, dp->data_sz); 8329 8330 return (1); 8331 } 8332 8333 /* 8334 * Searches the private section of a ring for a free descriptor, 8335 * starting at the location of the last free descriptor found 8336 * previously. 8337 * 8338 * Returns 0 if free descriptor is available, and updates state 8339 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 8340 * 8341 * FUTURE: might need to return contiguous range of descriptors 8342 * as dring info msg assumes all will be contiguous. 
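	 * For now exactly one descriptor is claimed per call, and
	 * dringp->end_idx records where the next search should begin.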
/*
 * Searches the private section of a ring for a free descriptor,
 * starting at the location of the last free descriptor found
 * previously.
 *
 * Returns 0 if a free descriptor is available (updating the state
 * of the private descriptor to VIO_DESC_READY); otherwise returns 1.
 *
 * FUTURE: might need to return contiguous range of descriptors
 * as dring info msg assumes all will be contiguous.
 */
static int
vsw_dring_find_free_desc(dring_info_t *dringp,
	vsw_private_desc_t **priv_p, int *idx)
{
	vsw_private_desc_t	*addr = NULL;
	int			num = VSW_RING_NUM_EL;
	int			ret = 1;

	D1(NULL, "%s enter\n", __func__);

	ASSERT(dringp->priv_addr != NULL);

	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
	    __func__, dringp, dringp->end_idx);

	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;

	mutex_enter(&addr->dstate_lock);
	if (addr->dstate == VIO_DESC_FREE) {
		addr->dstate = VIO_DESC_READY;
		*priv_p = addr;
		*idx = dringp->end_idx;
		dringp->end_idx = (dringp->end_idx + 1) % num;
		ret = 0;
	}
	mutex_exit(&addr->dstate_lock);

	/* ring full */
	if (ret == 1) {
		D2(NULL, "%s: no descriptors free: started at %d", __func__,
		    dringp->end_idx);
	}

	D1(NULL, "%s: exit\n", __func__);

	return (ret);
}
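/*
 * Illustrative usage sketch (the caller shape is assumed; the actual
 * dring transmit path lives elsewhere in this file): reserve a
 * descriptor, note its buffer, and leave it VIO_DESC_READY for the
 * peer. The function name and the VSW_EXAMPLES guard are hypothetical.
 */
#ifdef VSW_EXAMPLES
static int
vsw_example_reserve_desc(dring_info_t *dp)
{
	vsw_private_desc_t	*priv = NULL;
	int			idx;

	if (vsw_dring_find_free_desc(dp, &priv, &idx) != 0)
		return (1);	/* ring full; caller must retry or drop */

	/*
	 * priv->datap now points at this descriptor's data buffer and
	 * priv->dstate is already VIO_DESC_READY; a real caller would
	 * copy the frame in and hand idx to the peer in a dring-data
	 * message.
	 */
	return (0);
}
#endif	/* VSW_EXAMPLES */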
/*
 * Map from a dring identifier to the ring itself. Returns
 * pointer to ring or NULL if no match found.
 *
 * Should be called with dlistrw rwlock held as reader.
 */
static dring_info_t *
vsw_ident2dring(lane_t *lane, uint64_t ident)
{
	dring_info_t	*dp;

	for (dp = lane->dringp; dp != NULL; dp = dp->next) {
		if (dp->ident == ident)
			break;
	}

	return (dp);
}

/*
 * Set the default lane attributes. These are copied into
 * the attr msg we send to our peer. If they are not acceptable
 * then (currently) the handshake ends.
 */
static void
vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
{
	bzero(lp, sizeof (lane_t));

	READ_ENTER(&vswp->if_lockrw);
	ether_copy(&(vswp->if_addr), &(lp->addr));
	RW_EXIT(&vswp->if_lockrw);

	lp->mtu = VSW_MTU;
	lp->addr_type = ADDR_TYPE_MAC;
	lp->xfer_mode = VIO_DRING_MODE;
	lp->ack_freq = 0;	/* for shared mode */

	mutex_enter(&lp->seq_lock);
	lp->seq_num = VNET_ISS;
	mutex_exit(&lp->seq_lock);
}

/*
 * Verify that the attributes are acceptable.
 *
 * FUTURE: If some attributes are not acceptable, change them
 * to our desired values.
 */
static int
vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
{
	int	ret = 0;

	D1(NULL, "vsw_check_attr enter\n");

	/*
	 * Note we currently only support in-band descriptors
	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE).
	 */
	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
	    (pkt->xfer_mode != VIO_DRING_MODE)) {
		D2(NULL, "vsw_check_attr: unknown mode %x\n", pkt->xfer_mode);
		ret = 1;
	}

	/* Only support MAC addresses at the moment. */
	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
		    "or address 0x%llx\n", pkt->addr_type, pkt->addr);
		ret = 1;
	}

	/*
	 * MAC address supplied by device should match that stored
	 * in the vsw-port OBP node. Need to decide what to do if they
	 * don't match; for the moment just warn but don't fail.
	 */
	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
		DERR(NULL, "vsw_check_attr: device supplied address "
		    "0x%llx doesn't match node address 0x%llx\n",
		    pkt->addr, port->p_macaddr);
	}

	/*
	 * Ack freq only makes sense in pkt mode; in shared
	 * mode the ring descriptors say whether or not to
	 * send back an ACK.
	 */
	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
	    (pkt->ack_freq > 0)) {
		D2(NULL, "vsw_check_attr: non zero ack freq "
		    "in SHM mode\n");
		ret = 1;
	}

	/*
	 * Note: for the moment we only support ETHER
	 * frames. This may change in the future.
	 */
	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
		    pkt->mtu);
		ret = 1;
	}

	D1(NULL, "vsw_check_attr exit\n");

	return (ret);
}

/*
 * Returns 1 if there is a problem, 0 otherwise.
 */
static int
vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
{
	int	ret = 0;

	D1(NULL, "vsw_check_dring_info enter\n");

	if ((pkt->num_descriptors == 0) ||
	    (pkt->descriptor_size == 0) ||
	    (pkt->ncookies != 1)) {
		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
		ret = 1;
	}

	D1(NULL, "vsw_check_dring_info exit\n");

	return (ret);
}

/*
 * Returns 1 if two memory cookies match. Otherwise returns 0.
 */
static int
vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
{
	if ((m1->addr != m2->addr) ||
	    (m1->size != m2->size)) {
		return (0);
	} else {
		return (1);
	}
}

/*
 * Returns 1 if the ring described in the reg message matches that
 * described by the dring_info structure. Otherwise returns 0.
 */
static int
vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
{
	if ((msg->descriptor_size != dp->descriptor_size) ||
	    (msg->num_descriptors != dp->num_descriptors) ||
	    (msg->ncookies != dp->ncookies) ||
	    !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
		return (0);
	} else {
		return (1);
	}
}

static caddr_t
vsw_print_ethaddr(uint8_t *a, char *ebuf)
{
	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
	    a[0], a[1], a[2], a[3], a[4], a[5]);
	return (ebuf);
}
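/*
 * Note that the "%x" conversions above print single-digit bytes
 * without zero padding (e.g. "0:14:4f:..."). A zero-padded variant
 * is sketched here, assuming the caller supplies a buffer of at
 * least 18 bytes; the function name and the VSW_EXAMPLES guard are
 * hypothetical and not used by the driver.
 */
#ifdef VSW_EXAMPLES
static caddr_t
vsw_example_print_ethaddr(uint8_t *a, char *ebuf)
{
	/* 6 two-digit hex bytes, 5 colons, 1 NUL = 18 bytes */
	(void) snprintf(ebuf, 18, "%02x:%02x:%02x:%02x:%02x:%02x",
	    a[0], a[1], a[2], a[3], a[4], a[5]);
	return (ebuf);
}
#endif	/* VSW_EXAMPLES */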
/*
 * Reset and free all the resources associated with
 * the channel.
 */
static void
vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
{
	dring_info_t	*dp, *dpp;
	lane_t		*lp = NULL;
	int		rv = 0;

	ASSERT(ldcp != NULL);

	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);

	if (dir == INBOUND) {
		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_in;
	} else {
		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
		    " of channel %lld", __func__, ldcp->ldc_id);
		lp = &ldcp->lane_out;
	}

	lp->lstate = VSW_LANE_INACTIV;
	mutex_enter(&lp->seq_lock);
	lp->seq_num = VNET_ISS;
	mutex_exit(&lp->seq_lock);
	if (lp->dringp) {
		if (dir == INBOUND) {
			WRITE_ENTER(&lp->dlistrw);
			dp = lp->dringp;
			while (dp != NULL) {
				dpp = dp->next;
				if (dp->handle != NULL)
					(void) ldc_mem_dring_unmap(dp->handle);
				kmem_free(dp, sizeof (dring_info_t));
				dp = dpp;
			}
			RW_EXIT(&lp->dlistrw);
		} else {
			/*
			 * unbind, destroy exported dring, free dring struct
			 */
			WRITE_ENTER(&lp->dlistrw);
			dp = lp->dringp;
			rv = vsw_free_ring(dp);
			RW_EXIT(&lp->dlistrw);
		}
		if (rv == 0) {
			lp->dringp = NULL;
		}
	}

	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
}
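/*
 * Illustrative usage sketch (assumed; the actual channel-reset path
 * lives elsewhere in this file): on teardown both lanes are freed.
 * Inbound drings were mapped from the peer, so they are unmapped;
 * outbound drings were created and exported locally, so they are
 * unbound and destroyed via vsw_free_ring(). The function name and
 * the VSW_EXAMPLES guard are hypothetical.
 */
#ifdef VSW_EXAMPLES
static void
vsw_example_reset_lanes(vsw_ldc_t *ldcp)
{
	vsw_free_lane_resources(ldcp, INBOUND);
	vsw_free_lane_resources(ldcp, OUTBOUND);
}
#endif	/* VSW_EXAMPLES */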
/*
 * Free ring and all associated resources.
 *
 * Should be called with dlistrw rwlock held as writer.
 */
static int
vsw_free_ring(dring_info_t *dp)
{
	vsw_private_desc_t	*paddr = NULL;
	dring_info_t		*dpp;
	int			i, rv = 1;

	while (dp != NULL) {
		mutex_enter(&dp->dlock);
		dpp = dp->next;
		if (dp->priv_addr != NULL) {
			/*
			 * First unbind and free the memory handles
			 * stored in each descriptor within the ring.
			 */
			for (i = 0; i < VSW_RING_NUM_EL; i++) {
				paddr = (vsw_private_desc_t *)
				    dp->priv_addr + i;
				if (paddr->memhandle != NULL) {
					if (paddr->bound == 1) {
						rv = ldc_mem_unbind_handle(
						    paddr->memhandle);

						if (rv != 0) {
							DERR(NULL, "error "
							"unbinding handle for "
							"ring 0x%llx at pos %d",
							    dp, i);
							mutex_exit(&dp->dlock);
							return (rv);
						}
						paddr->bound = 0;
					}

					rv = ldc_mem_free_handle(
					    paddr->memhandle);
					if (rv != 0) {
						DERR(NULL, "error freeing "
						    "handle for ring 0x%llx "
						    "at pos %d", dp, i);
						mutex_exit(&dp->dlock);
						return (rv);
					}
					paddr->memhandle = NULL;
				}
				mutex_destroy(&paddr->dstate_lock);
			}
			kmem_free(dp->priv_addr,
			    (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
		}

		/*
		 * Now unbind and destroy the ring itself.
		 */
		if (dp->handle != NULL) {
			(void) ldc_mem_dring_unbind(dp->handle);
			(void) ldc_mem_dring_destroy(dp->handle);
		}

		if (dp->data_addr != NULL) {
			kmem_free(dp->data_addr, dp->data_sz);
		}

		mutex_exit(&dp->dlock);
		mutex_destroy(&dp->dlock);
		mutex_destroy(&dp->restart_lock);
		kmem_free(dp, sizeof (dring_info_t));

		dp = dpp;
	}
	return (0);
}

/*
 * Debugging routines
 */
static void
display_state(void)
{
	vsw_t		*vswp;
	vsw_port_list_t	*plist;
	vsw_port_t	*port;
	vsw_ldc_list_t	*ldcl;
	vsw_ldc_t	*ldcp;

	cmn_err(CE_NOTE, "***** system state *****");

	for (vswp = vsw_head; vswp; vswp = vswp->next) {
		plist = &vswp->plist;
		READ_ENTER(&plist->lockrw);
		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
		    vswp->instance, plist->num_ports);

		for (port = plist->head; port != NULL; port = port->p_next) {
			ldcl = &port->p_ldclist;
			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
			    port->p_instance, ldcl->num_ldcs);
			READ_ENTER(&ldcl->lockrw);
			ldcp = ldcl->head;
			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
				cmn_err(CE_CONT, "chan %lu : dev %d : "
				    "status %d : phase %u\n",
				    ldcp->ldc_id, ldcp->dev_class,
				    ldcp->ldc_status, ldcp->hphase);
				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
				    "psession %lu\n", ldcp->ldc_id,
				    ldcp->local_session, ldcp->peer_session);

				cmn_err(CE_CONT, "Inbound lane:\n");
				display_lane(&ldcp->lane_in);
				cmn_err(CE_CONT, "Outbound lane:\n");
				display_lane(&ldcp->lane_out);
			}
			RW_EXIT(&ldcl->lockrw);
		}
		RW_EXIT(&plist->lockrw);
	}
	cmn_err(CE_NOTE, "***** system state *****");
}

static void
display_lane(lane_t *lp)
{
	dring_info_t	*drp;

	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
	    lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
	    lp->addr_type, lp->addr, lp->xfer_mode);
	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);

	cmn_err(CE_CONT, "Dring info:\n");
	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
		    drp->num_descriptors, drp->descriptor_size);
		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
		    (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
		    drp->ident, drp->end_idx);
		display_ring(drp);
	}
}

static void
display_ring(dring_info_t *dringp)
{
	uint64_t		i;
	uint64_t		priv_count = 0;
	uint64_t		pub_count = 0;
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;

	for (i = 0; i < VSW_RING_NUM_EL; i++) {
		if (dringp->pub_addr != NULL) {
			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;

			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
				pub_count++;
		}

		if (dringp->priv_addr != NULL) {
			priv_addr =
			    (vsw_private_desc_t *)dringp->priv_addr + i;

			if (priv_addr->dstate == VIO_DESC_FREE)
				priv_count++;
		}
	}
	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
	    i, priv_count, pub_count);
}
static void
dump_flags(uint64_t state)
{
	int	i;

	typedef struct flag_name {
		int	flag_val;
		char	*flag_name;
	} flag_name_t;

	flag_name_t	flags[] = {
		{ VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT" },
		{ VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV" },
		{ VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV" },
		{ VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT" },
		{ VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV" },
		{ VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT" },
		{ VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT" },
		{ VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV" },
		{ VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT" },
		{ VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV" },
		{ VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT" },
		{ VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV" },
		{ VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT" },
		{ VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV" },
		{ VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT" },
		{ VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV" },
		{ VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT" },
		{ VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV" },
		{ VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT" },
		{ VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV" },
		{ VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT" },
		{ VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV" },
		{ VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT" },
		{ VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV" },
		{ VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT" },
		{ VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV" },
		{ VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT" },
		{ VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV" },
		{ VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT" },
		{ VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV" },
		{ VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE" }
	};

	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
		if (state & flags[i].flag_val)
			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
	}
}
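/*
 * Illustrative usage sketch (assumed): decode a lane's handshake
 * state word with the table-driven dump above. The function name and
 * the VSW_EXAMPLES guard are hypothetical.
 */
#ifdef VSW_EXAMPLES
static void
vsw_example_dump_lane_state(vsw_ldc_t *ldcp)
{
	/* prints one DUMP_FLAGS line per flag set in lstate */
	dump_flags(ldcp->lane_out.lstate);
}
#endif	/* VSW_EXAMPLES */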