/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/dlpi.h>			/* HCKSUM_INET_FULL_V4 */
#include <sys/pattr.h>			/* HCK_FULLCKSUM */
#include <sys/ib/mgt/sm_attr.h>		/* SM_INIT_TYPE_REPLY_... */

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static void eib_ibt_reset_partitions(eib_t *);
static void eib_ibt_wakeup_sqd_waiters(eib_t *, ibt_channel_hdl_t);
static int eib_ibt_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t, boolean_t,
    boolean_t *);
static boolean_t eib_ibt_has_chan_pkey_changed(eib_t *, eib_chan_t *);
static boolean_t eib_ibt_has_any_pkey_changed(eib_t *);
static int eib_ibt_fill_avect(eib_t *, eib_avect_t *, ib_lid_t);
static void eib_ibt_record_srate(eib_t *);

/*
 * Definitions private to this file
 */

/*
 * SM's init type reply flags
 */
#define	EIB_PORT_ATTR_LOADED(itr)	\
	(((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0)
#define	EIB_PORT_ATTR_NOT_PRESERVED(itr)	\
	(((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)
#define	EIB_PORT_PRES_NOT_PRESERVED(itr)	\
	(((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 0)

/*
 * eib_ibt_hca_init() initialization progress flags
 */
#define	EIB_HCAINIT_HCA_OPENED		0x01
#define	EIB_HCAINIT_ATTRS_ALLOCD	0x02
#define	EIB_HCAINIT_HCA_PORTS_QUERIED	0x04
#define	EIB_HCAINIT_PD_ALLOCD		0x08
#define	EIB_HCAINIT_CAPAB_RECORDED	0x10
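/*
 * Each of these bits records one completed step of eib_ibt_hca_init();
 * on failure, the accumulated mask is handed to eib_rb_ibt_hca_init(),
 * which undoes exactly those steps that had completed (see below).
 */
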
int
eib_ibt_hca_init(eib_t *ss)
{
    ibt_status_t ret;
    ibt_hca_portinfo_t *pi;
    uint_t num_pi;
    uint_t sz_pi;
    uint_t progress = 0;

    if (ss->ei_hca_hdl)
        return (EIB_E_SUCCESS);

    /*
     * Open the HCA
     */
    ret = ibt_open_hca(ss->ei_ibt_hdl, ss->ei_props->ep_hca_guid,
        &ss->ei_hca_hdl);
    if (ret != IBT_SUCCESS) {
        EIB_DPRINTF_ERR(ss->ei_instance,
            "ibt_open_hca(hca_guid=0x%llx) "
            "failed, ret=%d", ss->ei_props->ep_hca_guid, ret);
        goto ibt_hca_init_fail;
    }
    progress |= EIB_HCAINIT_HCA_OPENED;

    /*
     * Query and store HCA attributes
     */
    ss->ei_hca_attrs = kmem_zalloc(sizeof (ibt_hca_attr_t), KM_SLEEP);
    progress |= EIB_HCAINIT_ATTRS_ALLOCD;

    ret = ibt_query_hca(ss->ei_hca_hdl, ss->ei_hca_attrs);
    if (ret != IBT_SUCCESS) {
        EIB_DPRINTF_ERR(ss->ei_instance,
            "ibt_query_hca(hca_hdl=0x%llx, "
            "hca_guid=0x%llx) failed, ret=%d",
            ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret);
        goto ibt_hca_init_fail;
    }

    /*
     * At this point we don't care about the link state; we only want
     * to record our invariant base port guid and mtu.
     */
    ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
        &pi, &num_pi, &sz_pi);
    if (ret != IBT_SUCCESS) {
        EIB_DPRINTF_ERR(ss->ei_instance,
            "ibt_query_hca_ports(hca_hdl=0x%llx, "
            "port=0x%x) failed, ret=%d", ss->ei_hca_hdl,
            ss->ei_props->ep_port_num, ret);
        goto ibt_hca_init_fail;
    }
    if (num_pi != 1) {
        EIB_DPRINTF_ERR(ss->ei_instance,
            "ibt_query_hca_ports(hca_hdl=0x%llx, "
            "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl,
            ss->ei_props->ep_port_num, num_pi);
        ibt_free_portinfo(pi, sz_pi);
        goto ibt_hca_init_fail;
    }

    ss->ei_props->ep_sgid = pi->p_sgid_tbl[0];
    ss->ei_props->ep_mtu = (128 << pi->p_mtu);
    ibt_free_portinfo(pi, sz_pi);

    progress |= EIB_HCAINIT_HCA_PORTS_QUERIED;

    /*
     * Allocate a protection domain for all our transactions
     */
    ret = ibt_alloc_pd(ss->ei_hca_hdl, IBT_PD_NO_FLAGS, &ss->ei_pd_hdl);
    if (ret != IBT_SUCCESS) {
        EIB_DPRINTF_ERR(ss->ei_instance,
            "ibt_alloc_pd(hca_hdl=0x%llx, "
            "hca_guid=0x%llx) failed, ret=%d",
            ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret);
        goto ibt_hca_init_fail;
    }
    progress |= EIB_HCAINIT_PD_ALLOCD;

    /*
     * Finally, record the capabilities
     */
    ss->ei_caps = kmem_zalloc(sizeof (eib_caps_t), KM_SLEEP);
    eib_ibt_record_capab(ss, ss->ei_hca_attrs, ss->ei_caps);
    eib_ibt_record_srate(ss);

    progress |= EIB_HCAINIT_CAPAB_RECORDED;

    return (EIB_E_SUCCESS);

ibt_hca_init_fail:
    eib_rb_ibt_hca_init(ss, progress);
    return (EIB_E_FAILURE);
}
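
/*
 * eib_ibt_link_mod() re-evaluates the port state after an async port
 * event.  If the port is no longer active, the link is simply marked
 * down.  Otherwise, the SM's InitTypeReply is consulted to decide how
 * much state survived the reinitialization: a changed sgid or base lid
 * turns all active vnics into zombies to be restarted, changed pkeys
 * reset the affected channels, and lost MCG presence forces a rejoin
 * of all multicast groups.
 */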
void
eib_ibt_link_mod(eib_t *ss)
{
    eib_node_state_t *ns = ss->ei_node_state;
    ibt_hca_portinfo_t *pi;
    ibt_status_t ret;
    uint8_t vn0_mac[ETHERADDRL];
    boolean_t all_zombies = B_FALSE;
    boolean_t all_need_rejoin = B_FALSE;
    uint_t num_pi;
    uint_t sz_pi;
    uint8_t itr;

    if (ns->ns_link_state == LINK_STATE_UNKNOWN)
        return;

    /*
     * See if we can get the port attributes, or we're as good as down.
     * Note that the portinfo is valid (and must be freed) only if the
     * query itself succeeds.
     */
    ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
        &pi, &num_pi, &sz_pi);
    if (ret != IBT_SUCCESS) {
        eib_mac_link_down(ss, B_FALSE);
        return;
    }
    if (pi->p_linkstate != IBT_PORT_ACTIVE) {
        ibt_free_portinfo(pi, sz_pi);
        eib_mac_link_down(ss, B_FALSE);
        return;
    }

    /*
     * If the SM re-initialized the port attributes, but did not preserve
     * the old attributes, we need to check further.
     */
    itr = pi->p_init_type_reply;
    if (EIB_PORT_ATTR_LOADED(itr) && EIB_PORT_ATTR_NOT_PRESERVED(itr)) {
        /*
         * We're just coming back up; if we see that our base lid
         * or sgid table has changed, we'll update these and try to
         * restart all active vnics. If any of the vnic pkeys have
         * changed, we'll reset the affected channels to the new pkey.
         */
        if (bcmp(pi->p_sgid_tbl, &ss->ei_props->ep_sgid,
            sizeof (ib_gid_t)) != 0) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_link_mod: port sgid table changed "
                "(old %llx.%llx != new %llx.%llx), "
                "all vnics are zombies now.",
                ss->ei_props->ep_sgid.gid_prefix,
                ss->ei_props->ep_sgid.gid_guid,
                pi->p_sgid_tbl[0].gid_prefix,
                pi->p_sgid_tbl[0].gid_guid);

            ss->ei_props->ep_sgid = pi->p_sgid_tbl[0];
            all_zombies = B_TRUE;

        } else if (ss->ei_props->ep_blid != pi->p_base_lid) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_link_mod: port base lid changed "
                "(old 0x%x != new 0x%x), "
                "all vnics are zombies now.",
                ss->ei_props->ep_blid, pi->p_base_lid);

            ss->ei_props->ep_blid = pi->p_base_lid;
            all_zombies = B_TRUE;

        } else if (eib_ibt_has_any_pkey_changed(ss)) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_link_mod: pkey has changed for vnic(s), "
                "resetting all partitions");

            eib_ibt_reset_partitions(ss);
        }
    }

    ibt_free_portinfo(pi, sz_pi);

    /*
     * If the SM hasn't preserved our presence in MCGs, we need to
     * rejoin all of them.
     */
    if (EIB_PORT_PRES_NOT_PRESERVED(itr)) {
        EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
            "hca_guid=0x%llx, port=0x%x presence not preserved in SM, "
            "rejoining all mcgs", ss->ei_props->ep_hca_guid,
            ss->ei_props->ep_port_num);

        all_need_rejoin = B_TRUE;
    }

    /*
     * Before we do the actual work of restarting/rejoining, we need to
     * see if the GW is currently reachable. If not, we still continue
     * to keep our link "down."  Whenever the GW becomes reachable again,
     * we'll restart/rejoin all the vnics that we've just marked.
     */
    mutex_enter(&ss->ei_vnic_lock);
    if (all_zombies) {
        ss->ei_zombie_vnics = ss->ei_active_vnics;
    }
    if (all_need_rejoin) {
        ss->ei_rejoin_vnics = ss->ei_active_vnics;
    }
    if (ss->ei_gw_unreachable) {
        mutex_exit(&ss->ei_vnic_lock);

        EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_link_mod: "
            "gateway (gw_port=0x%x) unreachable for "
            "hca_guid=0x%llx, port=0x%x, link state down",
            ss->ei_gw_props->pp_gw_portid, ss->ei_props->ep_hca_guid,
            ss->ei_props->ep_port_num);

        eib_mac_link_down(ss, B_FALSE);
        return;
    }
    mutex_exit(&ss->ei_vnic_lock);

    /*
     * Try to awaken the dead if possible
     */
    bcopy(eib_zero_mac, vn0_mac, ETHERADDRL);
    if (all_zombies) {
        EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
            "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, "
            "attempting to resurrect zombies",
            ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num,
            ss->ei_gw_props->pp_gw_portid);

        eib_vnic_resurrect_zombies(ss, vn0_mac);
    }

    /*
     * Re-join the mcgs if we need to
     */
    if (all_need_rejoin) {
        EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
            "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, "
            "attempting to rejoin mcgs",
            ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num,
            ss->ei_gw_props->pp_gw_portid);

        eib_vnic_rejoin_mcgs(ss);
    }

    /*
     * If we've restarted the zombies because the gateway went down and
     * came back, it is possible our unicast mac address changed from
     * what it was earlier. If so, we need to update our unicast address
     * with the mac layer before marking the link up.
     */
    if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0)
        mac_unicst_update(ss->ei_mac_hdl, vn0_mac);

    /*
     * Notify the link state up if required
     */
    eib_mac_link_up(ss, B_FALSE);
}

int
eib_ibt_modify_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t pkey)
{
    /*
     * Make sure the channel pkey and index are set to what we need
     */
    return (eib_ibt_chan_pkey(ss, chan, pkey, B_TRUE, NULL));
}
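
/*
 * Address vectors for IB destinations are cached in a small hash table
 * (EIB_AV_NBUCKETS buckets, keyed by a simple dlid modulo), protected
 * by ei_av_lock and reference counted.  eib_ibt_hold_avect() returns a
 * held entry, creating it on first use and discovering it via a path
 * query unless eib_wa_no_av_discover is set (falling back to fixed
 * defaults if discovery fails or is disabled).  Every hold must
 * eventually be balanced by a call to eib_ibt_release_avect().
 */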
eib_avect_t *
eib_ibt_hold_avect(eib_t *ss, ib_lid_t dlid, uint8_t sl)
{
    uint_t ndx = dlid % EIB_AV_NBUCKETS;	/* simple hashing */
    eib_avect_t *av;
    eib_avect_t *prev;
    int ret;

    mutex_enter(&ss->ei_av_lock);

    /*
     * See if we have the address vector
     */
    prev = NULL;
    for (av = ss->ei_av[ndx]; av; av = av->av_next) {
        prev = av;
        if ((av->av_vect).av_dlid == dlid)
            break;
    }

    /*
     * If we don't have it, create a new one and chain it to
     * the same bucket
     */
    if (av == NULL) {
        av = kmem_zalloc(sizeof (eib_avect_t), KM_NOSLEEP);
        if (av == NULL) {
            mutex_exit(&ss->ei_av_lock);
            EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_hold_avect: "
                "no memory, could not allocate address vector");
            return (NULL);
        }

        ret = EIB_E_FAILURE;
        if (!eib_wa_no_av_discover)
            ret = eib_ibt_fill_avect(ss, av, dlid);

        if (ret != EIB_E_SUCCESS) {
            (av->av_vect).av_srate = IBT_SRATE_10;
            (av->av_vect).av_srvl = sl;
            (av->av_vect).av_port_num = ss->ei_props->ep_port_num;
            (av->av_vect).av_send_grh = B_FALSE;
            (av->av_vect).av_dlid = dlid;
            (av->av_vect).av_src_path = 0;	/* we use base lid */
        }

        if (prev)
            prev->av_next = av;
        else
            ss->ei_av[ndx] = av;
    }

    /*
     * Increment the address vector reference count before returning
     */
    (av->av_ref)++;

    mutex_exit(&ss->ei_av_lock);

    return (av);
}

static int
eib_ibt_fill_avect(eib_t *ss, eib_avect_t *av, ib_lid_t dlid)
{
    ibt_node_info_t ni;
    ibt_path_attr_t attr;
    ibt_path_info_t path;
    ibt_status_t ret;
    ib_gid_t dgid;

    if ((ret = ibt_lid_to_node_info(dlid, &ni)) != IBT_SUCCESS) {
        EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
            "ibt_lid_to_node_info(dlid=0x%x) failed, ret=%d",
            dlid, ret);
        return (EIB_E_FAILURE);
    }
    dgid.gid_prefix = ss->ei_gw_props->pp_gw_sn_prefix;
    dgid.gid_guid = ni.n_port_guid;

    /*
     * Get the reversible path information for this destination
     */
    bzero(&attr, sizeof (ibt_path_attr_t));
    attr.pa_sgid = ss->ei_props->ep_sgid;
    attr.pa_dgids = &dgid;
    attr.pa_num_dgids = 1;

    bzero(&path, sizeof (ibt_path_info_t));
    ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
        &attr, 1, &path, NULL);
    if ((ret != IBT_SUCCESS) || (path.pi_hca_guid == 0)) {
        EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
            "ibt_get_paths(dgid=%llx.%llx) failed, ret=%d",
            dgid.gid_prefix, dgid.gid_guid, ret);
        return (EIB_E_FAILURE);
    }

    /*
     * Fill in the address vector
     */
    bcopy(&path.pi_prim_cep_path.cep_adds_vect, &av->av_vect,
        sizeof (ibt_adds_vect_t));

    return (EIB_E_SUCCESS);
}
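
/*
 * eib_ibt_release_avect() drops one reference on a held address vector;
 * the entries themselves live on until eib_ibt_free_avects() tears down
 * the hash table, which asserts that all references have been released.
 */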
void
eib_ibt_release_avect(eib_t *ss, eib_avect_t *av)
{
    mutex_enter(&ss->ei_av_lock);

    ASSERT(av->av_ref > 0);
    (av->av_ref)--;

    mutex_exit(&ss->ei_av_lock);
}

void
eib_ibt_free_avects(eib_t *ss)
{
    eib_avect_t *av;
    eib_avect_t *av_next;
    int ndx;

    mutex_enter(&ss->ei_av_lock);
    for (ndx = 0; ndx < EIB_AV_NBUCKETS; ndx++) {
        for (av = ss->ei_av[ndx]; av; av = av_next) {
            av_next = av->av_next;

            ASSERT(av->av_ref == 0);
            kmem_free(av, sizeof (eib_avect_t));
        }
        ss->ei_av[ndx] = NULL;
    }
    mutex_exit(&ss->ei_av_lock);
}
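
/*
 * IBTF async event handler.  SQD events are handled inline (they only
 * wake up waiters in eib_ibt_chan_pkey()); port up/down, client
 * re-registration and pkey/sgid changes are translated into EIB_EV_*
 * codes and queued for the service thread via eib_svc_enqueue_event().
 * HCA attach needs no work here (a new HCA gets plumbed explicitly),
 * and HCA detach only attempts to release any resources a previous
 * unplumb may have left behind.
 */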
/*ARGSUSED*/
void
eib_ibt_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *event)
{
    eib_t *ss = (eib_t *)clnt_private;
    eib_event_t *evi;
    uint_t ev_code;

    ev_code = EIB_EV_NONE;

    switch (code) {
    case IBT_EVENT_SQD:
        EIB_DPRINTF_VERBOSE(ss->ei_instance,
            "eib_ibt_async_handler: got IBT_EVENT_SQD");
        eib_ibt_wakeup_sqd_waiters(ss, event->ev_chan_hdl);
        break;

    case IBT_EVENT_PORT_UP:
        if (event->ev_port == ss->ei_props->ep_port_num) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_async_handler: got IBT_EVENT_PORT_UP");
            ev_code = EIB_EV_PORT_UP;
        }
        break;

    case IBT_ERROR_PORT_DOWN:
        if (event->ev_port == ss->ei_props->ep_port_num) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_async_handler: got IBT_ERROR_PORT_DOWN");
            ev_code = EIB_EV_PORT_DOWN;
        }
        break;

    case IBT_CLNT_REREG_EVENT:
        if (event->ev_port == ss->ei_props->ep_port_num) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_async_handler: got IBT_CLNT_REREG_EVENT");
            ev_code = EIB_EV_CLNT_REREG;
        }
        break;

    case IBT_PORT_CHANGE_EVENT:
        if ((event->ev_port == ss->ei_props->ep_port_num) &&
            (event->ev_port_flags & IBT_PORT_CHANGE_PKEY)) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_async_handler: "
                "got IBT_PORT_CHANGE_EVENT(PKEY_CHANGE)");
            ev_code = EIB_EV_PKEY_CHANGE;
        } else if ((event->ev_port == ss->ei_props->ep_port_num) &&
            (event->ev_port_flags & IBT_PORT_CHANGE_SGID)) {
            EIB_DPRINTF_VERBOSE(ss->ei_instance,
                "eib_ibt_async_handler: "
                "got IBT_PORT_CHANGE_EVENT(SGID_CHANGE)");
            ev_code = EIB_EV_SGID_CHANGE;
        }
        break;

    case IBT_HCA_ATTACH_EVENT:
        /*
         * For HCA attach, after a new HCA is plugged in and
         * configured using cfgadm, an explicit plumb will need
         * to be run, so we don't need to do anything here.
         */
        EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: "
            "got IBT_HCA_ATTACH_EVENT");
        break;

    case IBT_HCA_DETACH_EVENT:
        /*
         * Before an HCA unplug, cfgadm is expected to trigger
         * any rcm scripts to unplumb the EoIB instances on the
         * card. If so, we should not be holding any hca resource,
         * since we don't do ibt_open_hca() until plumb time. However,
         * if an earlier unplumb hadn't cleaned up the hca resources
         * properly because the network layer hadn't returned the
         * buffers at that time, we could be holding hca resources.
         * We'll try to release them here, and protect the code from
         * racing with some other plumb/unplumb operation.
         */
        EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: "
            "got IBT_HCA_DETACH_EVENT");

        eib_mac_set_nic_state(ss, EIB_NIC_STOPPING);
        eib_rb_rsrc_setup_bufs(ss, B_FALSE);
        if (ss->ei_tx || ss->ei_rx || ss->ei_lso) {
            EIB_DPRINTF_WARN(ss->ei_instance,
                "eib_ibt_async_handler: nw layer still holding "
                "hca resources, could not detach HCA");
        } else if (ss->ei_hca_hdl) {
            eib_rb_ibt_hca_init(ss, ~0);
        }
        eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING);

        break;
    }

    if (ev_code != EIB_EV_NONE) {
        evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP);
        if (evi == NULL) {
            EIB_DPRINTF_WARN(ss->ei_instance,
                "eib_ibt_async_handler: "
                "no memory, could not handle event 0x%x", ev_code);
        } else {
            evi->ev_code = ev_code;
            evi->ev_arg = NULL;
            eib_svc_enqueue_event(ss, evi);
        }
    }
}
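
/*
 * Distill the queried HCA attributes into the driver's working limits:
 * checksum offload flags, the reserved L-Key (if the HCA has one), the
 * maximum LSO payload, the send SGL size and its high-water mark, and
 * the send/receive work queue depths.  All of these are capped by the
 * driver's own EIB_* maximums.
 */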
/*ARGSUSED*/
void
eib_ibt_record_capab(eib_t *ss, ibt_hca_attr_t *hca_attrs, eib_caps_t *caps)
{
    uint_t max_swqe = EIB_DATA_MAX_SWQE;
    uint_t max_rwqe = EIB_DATA_MAX_RWQE;

    /*
     * Checksum
     */
    caps->cp_cksum_flags = 0;
    if ((!eib_wa_no_cksum_offload) &&
        (hca_attrs->hca_flags & IBT_HCA_CKSUM_FULL)) {
        caps->cp_cksum_flags =
            HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
            /* HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM; */
    }

    /*
     * Reserved L-Key
     */
    if (hca_attrs->hca_flags2 & IBT_HCA2_RES_LKEY) {
        caps->cp_resv_lkey_capab = 1;
        caps->cp_resv_lkey = hca_attrs->hca_reserved_lkey;
    }

    /*
     * LSO
     */
    caps->cp_lso_maxlen = 0;
    if (!eib_wa_no_lso) {
        if (hca_attrs->hca_max_lso_size > EIB_LSO_MAXLEN) {
            caps->cp_lso_maxlen = EIB_LSO_MAXLEN;
        } else {
            caps->cp_lso_maxlen = hca_attrs->hca_max_lso_size;
        }
    }

    /*
     * SGL
     *
     * Translating virtual address regions into physical regions
     * for using the Reserved LKey feature results in a wr sgl that
     * is a little longer. Since failing ibt_map_mem_iov() is costly,
     * we'll record a high-water mark (65% of the max) beyond which
     * we should stop trying to use the Reserved LKey.
     */
    if (hca_attrs->hca_flags & IBT_HCA_WQE_SIZE_INFO) {
        caps->cp_max_sgl = hca_attrs->hca_ud_send_sgl_sz;
    } else {
        caps->cp_max_sgl = hca_attrs->hca_max_sgl;
    }
    if (caps->cp_max_sgl > EIB_MAX_SGL) {
        caps->cp_max_sgl = EIB_MAX_SGL;
    }
    caps->cp_hiwm_sgl = (caps->cp_max_sgl * 65) / 100;

    /*
     * SWQE/RWQE: meet max chan size and max cq size limits (leave room
     * to avoid a cq overflow event)
     */
    if (max_swqe > hca_attrs->hca_max_chan_sz)
        max_swqe = hca_attrs->hca_max_chan_sz;
    if (max_swqe > (hca_attrs->hca_max_cq_sz - 1))
        max_swqe = hca_attrs->hca_max_cq_sz - 1;
    caps->cp_max_swqe = max_swqe;

    if (max_rwqe > hca_attrs->hca_max_chan_sz)
        max_rwqe = hca_attrs->hca_max_chan_sz;
    if (max_rwqe > (hca_attrs->hca_max_cq_sz - 1))
        max_rwqe = hca_attrs->hca_max_cq_sz - 1;
    caps->cp_max_rwqe = max_rwqe;
}
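
/*
 * Undo eib_ibt_hca_init() according to the progress mask: each
 * EIB_HCAINIT_* bit that is set has its corresponding step rolled
 * back, in the reverse of the order in which the steps were taken.
 * Passing ~0 (as the HCA detach path does) releases everything.
 */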
void
eib_rb_ibt_hca_init(eib_t *ss, uint_t progress)
{
    ibt_status_t ret;

    if (progress & EIB_HCAINIT_CAPAB_RECORDED) {
        if (ss->ei_caps) {
            kmem_free(ss->ei_caps, sizeof (eib_caps_t));
            ss->ei_caps = NULL;
        }
    }

    if (progress & EIB_HCAINIT_PD_ALLOCD) {
        if (ss->ei_pd_hdl) {
            ret = ibt_free_pd(ss->ei_hca_hdl, ss->ei_pd_hdl);
            if (ret != IBT_SUCCESS) {
                EIB_DPRINTF_WARN(ss->ei_instance,
                    "eib_rb_ibt_hca_init: "
                    "ibt_free_pd(hca_hdl=0x%lx, pd_hdl=0x%lx) "
                    "failed, ret=%d", ss->ei_hca_hdl,
                    ss->ei_pd_hdl, ret);
            }
            ss->ei_pd_hdl = NULL;
        }
    }

    if (progress & EIB_HCAINIT_HCA_PORTS_QUERIED) {
        ss->ei_props->ep_mtu = 0;
        bzero(&ss->ei_props->ep_sgid, sizeof (ib_gid_t));
    }

    if (progress & EIB_HCAINIT_ATTRS_ALLOCD) {
        kmem_free(ss->ei_hca_attrs, sizeof (ibt_hca_attr_t));
        ss->ei_hca_attrs = NULL;
    }

    if (progress & EIB_HCAINIT_HCA_OPENED) {
        ret = ibt_close_hca(ss->ei_hca_hdl);
        if (ret != IBT_SUCCESS) {
            EIB_DPRINTF_WARN(ss->ei_instance,
                "ibt_close_hca(hca_hdl=0x%lx) failed, "
                "ret=%d", ss->ei_hca_hdl, ret);
        }
        ss->ei_hca_hdl = NULL;
    }
}

static void
eib_ibt_reset_partitions(eib_t *ss)
{
    eib_vnic_t *vnic;
    eib_chan_t *chan = NULL;
    uint64_t av;
    int inst = 0;

    /*
     * We already have the vhub pkey recorded in our eib_chan_t.
     * We only need to make sure our pkey index still matches it.
     * If not, modify the channel appropriately and update our
     * records.
     */
    if ((chan = ss->ei_admin_chan) != NULL)
        (void) eib_ibt_modify_chan_pkey(ss, chan, chan->ch_pkey);

    mutex_enter(&ss->ei_vnic_lock);
    av = ss->ei_active_vnics;
    while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
        if ((vnic = ss->ei_vnic[inst]) != NULL) {
            if ((chan = vnic->vn_ctl_chan) != NULL) {
                (void) eib_ibt_modify_chan_pkey(ss, chan,
                    chan->ch_pkey);
            }
            if ((chan = vnic->vn_data_chan) != NULL) {
                (void) eib_ibt_modify_chan_pkey(ss, chan,
                    chan->ch_pkey);
            }
        }
        av &= (~((uint64_t)1 << inst));
    }
    mutex_exit(&ss->ei_vnic_lock);
}

static void
eib_ibt_wakeup_sqd_waiters(eib_t *ss, ibt_channel_hdl_t ev_chan_hdl)
{
    eib_vnic_t *vnic;
    eib_chan_t *chan = NULL;
    uint64_t av;
    int inst = 0;

    /*
     * See if this channel has been waiting for its queue to drain.
     *
     * Note that since this is especially likely to be called during
     * login to the gateway, we also need to check the vnic currently
     * being created.
     */
    mutex_enter(&ss->ei_vnic_lock);

    if ((vnic = ss->ei_vnic_pending) != NULL) {
        chan = vnic->vn_ctl_chan;
        if ((chan) && (chan->ch_chan == ev_chan_hdl))
            goto wakeup_sqd_waiters;

        chan = vnic->vn_data_chan;
        if ((chan) && (chan->ch_chan == ev_chan_hdl))
            goto wakeup_sqd_waiters;
    }

    av = ss->ei_active_vnics;
    while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
        if ((vnic = ss->ei_vnic[inst]) != NULL) {
            chan = vnic->vn_ctl_chan;
            if (chan->ch_chan == ev_chan_hdl)
                break;

            chan = vnic->vn_data_chan;
            if (chan->ch_chan == ev_chan_hdl)
                break;
        }
        av &= (~((uint64_t)1 << inst));
    }
    if (inst == -1)
        chan = NULL;	/* scanned everything, no channel matched */

wakeup_sqd_waiters:
    if (chan) {
        mutex_enter(&chan->ch_cep_lock);
        chan->ch_cep_state = IBT_STATE_SQD;
        cv_broadcast(&chan->ch_cep_cv);
        mutex_exit(&chan->ch_cep_lock);
    }

    mutex_exit(&ss->ei_vnic_lock);
}
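
/*
 * Set (or merely verify, if 'set' is B_FALSE) a channel's pkey and pkey
 * index.  Changing the pkey index on a live UD channel follows the
 * send-queue-drain protocol: pause the sendq asking for an SQD event,
 * wait on ch_cep_cv until eib_ibt_wakeup_sqd_waiters() marks the channel
 * drained, modify the qp with the new pkey index, and unpause the sendq
 * back to RTS.  If the modify or the unpause fails, the channel is left
 * paused and the call fails.
 */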
static int
eib_ibt_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t new_pkey,
    boolean_t set, boolean_t *pkey_changed)
{
    ibt_qp_info_t qp_attr;
    ibt_status_t ret;
    uint16_t new_pkey_ix;

    ret = ibt_pkey2index(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
        new_pkey, &new_pkey_ix);
    if (ret != IBT_SUCCESS) {
        EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
            "ibt_pkey2index(hca_hdl=0x%llx, port_num=0x%x, "
            "pkey=0x%x) failed, ret=%d",
            ss->ei_hca_hdl, ss->ei_props->ep_port_num, new_pkey, ret);
        return (EIB_E_FAILURE);
    }

    /*
     * If the pkey and the pkey index we have already match the
     * new ones, there's nothing to do.
     */
    mutex_enter(&chan->ch_pkey_lock);
    if ((chan->ch_pkey == new_pkey) && (chan->ch_pkey_ix == new_pkey_ix)) {
        if (pkey_changed) {
            *pkey_changed = B_FALSE;
        }
        mutex_exit(&chan->ch_pkey_lock);
        return (EIB_E_SUCCESS);
    }
    if (pkey_changed) {
        *pkey_changed = B_TRUE;
    }
    mutex_exit(&chan->ch_pkey_lock);

    /*
     * Otherwise, if we're asked only to test whether the pkey index
     * supplied matches the one recorded in the channel, return
     * success, but don't set the pkey.
     */
    if (!set) {
        return (EIB_E_SUCCESS);
    }

    /*
     * Otherwise, we need to change the channel pkey. Pause the
     * channel sendq first.
     */
    ret = ibt_pause_sendq(chan->ch_chan, IBT_CEP_SET_SQD_EVENT);
    if (ret != IBT_SUCCESS) {
        EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
            "ibt_pause_sendq(chan_hdl=0x%llx) failed, ret=%d",
            chan->ch_chan, ret);
        return (EIB_E_FAILURE);
    }

    /*
     * Wait for the channel to enter the IBT_STATE_SQD state
     */
    mutex_enter(&chan->ch_cep_lock);
    while (chan->ch_cep_state != IBT_STATE_SQD)
        cv_wait(&chan->ch_cep_cv, &chan->ch_cep_lock);
    mutex_exit(&chan->ch_cep_lock);

    /*
     * Modify the qp with the supplied pkey index and unpause the
     * channel. If either of these operations fails, we'll leave
     * the channel in the paused state and fail.
     */
    bzero(&qp_attr, sizeof (ibt_qp_info_t));

    qp_attr.qp_trans = IBT_UD_SRV;
    qp_attr.qp_current_state = IBT_STATE_SQD;
    qp_attr.qp_state = IBT_STATE_SQD;
    qp_attr.qp_transport.ud.ud_pkey_ix = new_pkey_ix;

    /*
     * Modify the qp to set the new pkey index, then unpause the
     * channel to put it back in the RTS state, and update the new
     * values in our records
     */
    mutex_enter(&chan->ch_pkey_lock);

    ret = ibt_modify_qp(chan->ch_chan,
        IBT_CEP_SET_STATE | IBT_CEP_SET_PKEY_IX, &qp_attr, NULL);
    if (ret != IBT_SUCCESS) {
        mutex_exit(&chan->ch_pkey_lock);
        EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
            "ibt_modify_qp(chan_hdl=0x%llx, IBT_CEP_SET_PKEY_IX) "
            "failed for new_pkey_ix=0x%x, ret=%d",
            chan->ch_chan, new_pkey_ix, ret);
        return (EIB_E_FAILURE);
    }

    if ((ret = ibt_unpause_sendq(chan->ch_chan)) != IBT_SUCCESS) {
        mutex_exit(&chan->ch_pkey_lock);
        EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
            "ibt_unpause_sendq(chan_hdl=0x%llx) failed, ret=%d",
            chan->ch_chan, ret);
        return (EIB_E_FAILURE);
    }

    chan->ch_pkey = new_pkey;
    chan->ch_pkey_ix = new_pkey_ix;
    mutex_exit(&chan->ch_pkey_lock);

    return (EIB_E_SUCCESS);
}

static boolean_t
eib_ibt_has_chan_pkey_changed(eib_t *ss, eib_chan_t *chan)
{
    boolean_t changed;
    int ret;

    /*
     * Don't modify the pkey, just ask if the pkey index for the channel's
     * pkey has changed for any reason. If we fail, assume that the pkey
     * has changed.
     */
    ret = eib_ibt_chan_pkey(ss, chan, chan->ch_pkey, B_FALSE, &changed);
    if (ret != EIB_E_SUCCESS)
        changed = B_TRUE;

    return (changed);
}

static boolean_t
eib_ibt_has_any_pkey_changed(eib_t *ss)
{
    eib_vnic_t *vnic;
    eib_chan_t *chan = NULL;
    uint64_t av;
    int inst = 0;

    /*
     * Return true if the pkey index of any of our pkeys (of the
     * channels of all active vnics) has changed.
     */

    chan = ss->ei_admin_chan;
    if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan)))
        return (B_TRUE);

    mutex_enter(&ss->ei_vnic_lock);
    av = ss->ei_active_vnics;
    while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
        if ((vnic = ss->ei_vnic[inst]) != NULL) {
            chan = vnic->vn_ctl_chan;
            if ((chan) &&
                (eib_ibt_has_chan_pkey_changed(ss, chan))) {
                mutex_exit(&ss->ei_vnic_lock);
                return (B_TRUE);
            }

            chan = vnic->vn_data_chan;
            if ((chan) &&
                (eib_ibt_has_chan_pkey_changed(ss, chan))) {
                mutex_exit(&ss->ei_vnic_lock);
                return (B_TRUE);
            }
        }
        av &= (~((uint64_t)1 << inst));
    }
    mutex_exit(&ss->ei_vnic_lock);

    return (B_FALSE);
}
/*
 * This routine is currently used simply to derive and record the port
 * speed from the loopback path information (for debug purposes). For
 * EoIB, currently the srate used in address vectors to IB neighbors
 * and the gateway is fixed at IBT_SRATE_10. Eventually though, this
 * information (and sl) has to come from the gateway for all destinations
 * in the vhub table.
 */
static void
eib_ibt_record_srate(eib_t *ss)
{
    ib_gid_t sgid = ss->ei_props->ep_sgid;
    ibt_srate_t srate = IBT_SRATE_10;
    ibt_path_info_t path;
    ibt_path_attr_t path_attr;
    ibt_status_t ret;
    uint8_t num_paths;

    bzero(&path_attr, sizeof (path_attr));
    path_attr.pa_dgids = &sgid;
    path_attr.pa_num_dgids = 1;
    path_attr.pa_sgid = sgid;

    ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
        &path_attr, 1, &path, &num_paths);
    if (ret == IBT_SUCCESS && num_paths >= 1) {
        switch (srate = path.pi_prim_cep_path.cep_adds_vect.av_srate) {
        case IBT_SRATE_2:
        case IBT_SRATE_10:
        case IBT_SRATE_30:
        case IBT_SRATE_5:
        case IBT_SRATE_20:
        case IBT_SRATE_40:
        case IBT_SRATE_60:
        case IBT_SRATE_80:
        case IBT_SRATE_120:
            break;
        default:
            srate = IBT_SRATE_10;
        }
    }

    ss->ei_props->ep_srate = srate;

    EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_ibt_record_srate: "
        "srate = %d", srate);
}