1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2022 Garrett D'Amore 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip6.h> /* for ip6_t */ 54 #include <inet/tcp.h> /* for tcph_t */ 55 #include <netinet/icmp6.h> /* for icmp6_t */ 56 #include <sys/callb.h> 57 #include <sys/modhash.h> 58 59 #include <sys/ib/clients/ibd/ibd.h> 60 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 61 #include <sys/note.h> 62 63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 64 65 #include <sys/priv_names.h> 66 #include <sys/dls.h> 67 #include <sys/dld_ioc.h> 68 #include <sys/policy.h> 69 #include <sys/ibpart.h> 70 #include <sys/file.h> 71 72 /* 73 * The write-up below includes details on the following: 74 * 1. The dladm administrative model. 75 * 2. Late HCA initialization feature. 76 * 3. Brussels support and its implications to the current architecture. 77 * 78 * 1. The dladm administrative model. 79 * ------------------------------------------ 80 * With the dladm model, ibnex will create one ibd instance per port. These 81 * instances will be created independent of the port state. 82 * 83 * The ibd driver is two faceted: One side of it working as the port driver and 84 * the other as the partition object driver. 85 * 86 * The port instance is a child of the HCA, and will have an entry in the devfs. 87 * A DDI attach only happens for the port driver, and its attach is 88 * handled in ibd_port_attach(). Similary, a DDI detach for the port driver is 89 * handled in ibd_port_unattach(). 90 * 91 * The partition object is only a registrant to the mac layer via mac_register() 92 * and does not have an entry in the device tree. There is no DDI softstate 93 * managed by the DDI framework for the partition objects. 
However, the state is 94 * managed inside the ibd driver, and every partition object hangs off the 95 * "ibd_objlist_head". 96 * 97 * The partition object first comes into existence when a user runs the 98 * 'create-part' subcommand of dladm. This is like invoking the attach entry 99 * point of the partition object. The partition object goes away with the 100 * 'delete-part' subcommand of dladm. This is like invoking the detach entry 101 * point of the partition object. 102 * 103 * The create-part and delete-part subcommands result in dld ioctls that end up 104 * calling ibd_create_parition() and ibd_delete_partition respectively. 105 * There ioctls are registered with the dld layer in _init() via a call to 106 * dld_ioc_register(). 107 * 108 * The port instance by itself cannot be plumbed. It is only the partition 109 * objects that can be plumbed and they alone participate in I/O and not the 110 * port driver. 111 * 112 * There are some info ioctls supported in ibd which are used by dladm(8) to 113 * display useful information. The info entry point for ibd is 114 * ibd_get_partition_info(). 115 * 116 * 2. Late HCA initialization feature. 117 * ------------------------------------ 118 * As mentioned in section 1, the user creates the partition objects via 119 * dladm(8). It is possible that: 120 * a) The physical port itself is down and the SM cannot be reached. 121 * b) The PKEY specified by the used has not been created in the SM yet. 122 * c) An IPoIB broadcast group for the specified PKEY is not present. 123 * 124 * In all of the above cases, complete initialization of the partition object is 125 * not possible. However, the new model allows the creation of partition 126 * objects even in such cases but will defer the initialization for later. 127 * When such a partition object is plumbed, the link state will be displayed as 128 * "down". 129 * The driver, at this point, is listening to events that herald the 130 * availability of resources - 131 * i) LINK_UP when the link becomes available 132 * ii) PORT_CHANGE when the PKEY has been created 133 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been 134 * created 135 * via ibd_async_handler() for events i) and ii), and via 136 * ibd_snet_notices_handler() for iii. 137 * The driver handles these events (as and when they arrive) and completes the 138 * initialization of the partition object and transitions it to a usable state. 139 * 140 * 3. Brussels support and its implications to the current architecture. 141 * --------------------------------------------------------------------- 142 * The brussels support introduces two new interfaces to the ibd driver - 143 * ibd_m_getprop() and ibd_m_setprop(). 144 * These interfaces allow setting and retrieval of certain properties. 145 * Some of them are public properties while most other are private properties 146 * meant to be used by developers. Tuning the latter kind can cause 147 * performance issues and should not be used without understanding the 148 * implications. All properties are specific to an instance of either the 149 * partition object or the port driver. 150 * 151 * The public properties are : mtu and linkmode. 152 * mtu is a read-only property. 153 * linkmode can take two values - UD and CM. 154 * 155 * Changing the linkmode requires some bookkeeping in the driver. The 156 * capabilities need to be re-reported to the mac layer. This is done by 157 * calling mac_capab_update(). The maxsdu is updated by calling 158 * mac_maxsdu_update2(). 
159 * The private properties retain their values across the change of linkmode. 160 * NOTE: 161 * - The port driver does not support any property apart from mtu. 162 * - All other properties are only meant for the partition object. 163 * - The properties cannot be set when an instance is plumbed. The 164 * instance has to be unplumbed to effect any setting. 165 */ 166 167 /* 168 * Driver wide tunables 169 * 170 * ibd_tx_softintr 171 * ibd_rx_softintr 172 * The softintr mechanism allows ibd to avoid event queue overflows if 173 * the receive/completion handlers are to be expensive. These are enabled 174 * by default. 175 * 176 * ibd_log_sz 177 * This specifies the size of the ibd log buffer in bytes. The buffer is 178 * allocated and logging is enabled only when IBD_LOGGING is defined. 179 * 180 */ 181 uint_t ibd_rx_softintr = 1; 182 uint_t ibd_tx_softintr = 1; 183 184 #ifdef IBD_LOGGING 185 uint_t ibd_log_sz = 0x20000; 186 #endif 187 188 #ifdef IBD_LOGGING 189 #define IBD_LOG_SZ ibd_log_sz 190 #endif 191 192 /* Post IBD_RX_POST_CNT receive work requests at a time. */ 193 #define IBD_RX_POST_CNT 8 194 195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */ 196 #define IBD_LOG_RX_POST 4 197 198 /* Minimum number of receive work requests driver needs to always have */ 199 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4) 200 201 /* 202 * LSO parameters 203 */ 204 #define IBD_LSO_MAXLEN 65536 205 #define IBD_LSO_BUFSZ 8192 206 207 /* 208 * Async operation states 209 */ 210 #define IBD_OP_NOTSTARTED 0 211 #define IBD_OP_ONGOING 1 212 #define IBD_OP_COMPLETED 2 213 #define IBD_OP_ERRORED 3 214 #define IBD_OP_ROUTERED 4 215 216 /* 217 * Start/stop in-progress flags; note that restart must always remain 218 * the OR of start and stop flag values. 
219 */ 220 #define IBD_DRV_START_IN_PROGRESS 0x10000000 221 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 222 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 223 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS 224 225 /* 226 * Miscellaneous constants 227 */ 228 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 229 #define IBD_DEF_MAX_SDU 2044 230 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE) 231 #define IBD_DEF_RC_MAX_SDU 65520 232 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE) 233 #define IBD_DEFAULT_QKEY 0xB1B 234 #ifdef IBD_LOGGING 235 #define IBD_DMAX_LINE 100 236 #endif 237 238 /* 239 * Enumerations for link states 240 */ 241 typedef enum { 242 IBD_LINK_DOWN, 243 IBD_LINK_UP, 244 IBD_LINK_UP_ABSENT 245 } ibd_link_op_t; 246 247 /* 248 * Driver State Pointer 249 */ 250 void *ibd_list; 251 252 /* 253 * Driver Global Data 254 */ 255 ibd_global_state_t ibd_gstate; 256 257 /* 258 * Partition object list 259 */ 260 ibd_state_t *ibd_objlist_head = NULL; 261 kmutex_t ibd_objlist_lock; 262 263 int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */ 264 265 /* 266 * Logging 267 */ 268 #ifdef IBD_LOGGING 269 kmutex_t ibd_lbuf_lock; 270 uint8_t *ibd_lbuf; 271 uint32_t ibd_lbuf_ndx; 272 #endif 273 274 /* 275 * Required system entry points 276 */ 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 279 280 /* 281 * Required driver entry points for GLDv3 282 */ 283 static int ibd_m_stat(void *, uint_t, uint64_t *); 284 static int ibd_m_start(void *); 285 static void ibd_m_stop(void *); 286 static int ibd_m_promisc(void *, boolean_t); 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 288 static int ibd_m_unicst(void *, const uint8_t *); 289 static mblk_t *ibd_m_tx(void *, mblk_t *); 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 291 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 293 const void *); 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t, 296 mac_prop_info_handle_t); 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t, 298 const void *); 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *); 300 301 /* 302 * Private driver entry points for GLDv3 303 */ 304 305 /* 306 * Initialization 307 */ 308 static int ibd_state_init(ibd_state_t *, dev_info_t *); 309 static int ibd_init_txlist(ibd_state_t *); 310 static int ibd_init_rxlist(ibd_state_t *); 311 static int ibd_acache_init(ibd_state_t *); 312 #ifdef IBD_LOGGING 313 static void ibd_log_init(void); 314 #endif 315 316 /* 317 * Termination/cleanup 318 */ 319 static void ibd_state_fini(ibd_state_t *); 320 static void ibd_fini_txlist(ibd_state_t *); 321 static void ibd_fini_rxlist(ibd_state_t *); 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 324 static void ibd_acache_fini(ibd_state_t *); 325 #ifdef IBD_LOGGING 326 static void ibd_log_fini(void); 327 #endif 328 329 /* 330 * Allocation/acquire/map routines 331 */ 332 static int ibd_alloc_tx_copybufs(ibd_state_t *); 333 static int ibd_alloc_rx_copybufs(ibd_state_t *); 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 337 uint32_t *); 338 339 /* 340 * Free/release/unmap 
routines 341 */ 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 343 static void ibd_free_tx_copybufs(ibd_state_t *); 344 static void ibd_free_rx_copybufs(ibd_state_t *); 345 static void ibd_free_rx_rsrcs(ibd_state_t *); 346 static void ibd_free_tx_lsobufs(ibd_state_t *); 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 350 351 /* 352 * Handlers/callback routines 353 */ 354 static uint_t ibd_intr(caddr_t); 355 static uint_t ibd_tx_recycle(caddr_t); 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 362 static void ibd_freemsg_cb(char *); 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 364 ibt_async_event_t *); 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 366 ibt_async_event_t *); 367 static void ibd_snet_notices_handler(void *, ib_gid_t, 368 ibt_subnet_event_code_t, ibt_subnet_event_t *); 369 370 /* 371 * Send/receive routines 372 */ 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 377 378 /* 379 * Threads 380 */ 381 static void ibd_async_work(ibd_state_t *); 382 383 /* 384 * Async tasks 385 */ 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 388 static void ibd_async_setprom(ibd_state_t *); 389 static void ibd_async_unsetprom(ibd_state_t *); 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 392 static void ibd_async_txsched(ibd_state_t *); 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 394 395 /* 396 * Async task helpers 397 */ 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 402 ipoib_mac_t *, ipoib_mac_t *); 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 409 static uint64_t ibd_get_portspeed(ibd_state_t *); 410 static boolean_t ibd_async_safe(ibd_state_t *); 411 static void ibd_async_done(ibd_state_t *); 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 416 417 /* 418 * Helpers for attach/start routines 419 */ 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 421 
static int ibd_record_capab(ibd_state_t *); 422 static int ibd_get_port_details(ibd_state_t *); 423 static int ibd_alloc_cqs(ibd_state_t *); 424 static int ibd_setup_ud_channel(ibd_state_t *); 425 static int ibd_start(ibd_state_t *); 426 static int ibd_undo_start(ibd_state_t *, link_state_t); 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip); 430 static void ibd_part_unattach(ibd_state_t *state); 431 static int ibd_port_attach(dev_info_t *); 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip); 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *); 434 static int ibd_part_busy(ibd_state_t *); 435 436 /* 437 * Miscellaneous helpers 438 */ 439 static int ibd_sched_poll(ibd_state_t *, int, int); 440 static void ibd_resume_transmission(ibd_state_t *); 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 443 static void *list_get_head(list_t *); 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 446 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *); 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *); 449 450 #ifdef IBD_LOGGING 451 static void ibd_log(const char *, ...); 452 #endif 453 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 455 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 456 457 /* Module Driver Info */ 458 static struct modldrv ibd_modldrv = { 459 &mod_driverops, /* This one is a driver */ 460 "InfiniBand GLDv3 Driver", /* short description */ 461 &ibd_dev_ops /* driver specific ops */ 462 }; 463 464 /* Module Linkage */ 465 static struct modlinkage ibd_modlinkage = { 466 MODREV_1, (void *)&ibd_modldrv, NULL 467 }; 468 469 /* 470 * Module (static) info passed to IBTL during ibt_attach 471 */ 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 473 IBTI_V_CURR, 474 IBT_NETWORK, 475 ibd_async_handler, 476 NULL, 477 "IBPART" 478 }; 479 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = { 481 IBTI_V_CURR, 482 IBT_NETWORK, 483 ibdpd_async_handler, 484 NULL, 485 "IPIB" 486 }; 487 488 /* 489 * GLDv3 entry points 490 */ 491 #define IBD_M_CALLBACK_FLAGS \ 492 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) 493 494 static mac_callbacks_t ibd_m_callbacks = { 495 IBD_M_CALLBACK_FLAGS, 496 ibd_m_stat, 497 ibd_m_start, 498 ibd_m_stop, 499 ibd_m_promisc, 500 ibd_m_multicst, 501 ibd_m_unicst, 502 ibd_m_tx, 503 NULL, 504 NULL, 505 ibd_m_getcapab, 506 NULL, 507 NULL, 508 ibd_m_setprop, 509 ibd_m_getprop, 510 ibd_m_propinfo 511 }; 512 513 /* Private properties */ 514 char *ibd_priv_props[] = { 515 "_ibd_broadcast_group", 516 "_ibd_coalesce_completions", 517 "_ibd_create_broadcast_group", 518 "_ibd_hash_size", 519 "_ibd_lso_enable", 520 "_ibd_num_ah", 521 "_ibd_num_lso_bufs", 522 "_ibd_rc_enable_srq", 523 "_ibd_rc_num_rwqe", 524 "_ibd_rc_num_srq", 525 "_ibd_rc_num_swqe", 526 "_ibd_rc_rx_comp_count", 527 "_ibd_rc_rx_comp_usec", 528 "_ibd_rc_rx_copy_thresh", 529 "_ibd_rc_rx_rwqe_thresh", 530 "_ibd_rc_tx_comp_count", 531 "_ibd_rc_tx_comp_usec", 532 "_ibd_rc_tx_copy_thresh", 533 "_ibd_ud_num_rwqe", 534 "_ibd_ud_num_swqe", 535 "_ibd_ud_rx_comp_count", 536 "_ibd_ud_rx_comp_usec", 537 "_ibd_ud_tx_comp_count", 538 "_ibd_ud_tx_comp_usec", 539 "_ibd_ud_tx_copy_thresh", 540 NULL 541 
}; 542 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *); 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *); 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *); 546 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = { 548 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t), 549 ibd_create_partition, secpolicy_dl_config}, 550 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t), 551 ibd_delete_partition, secpolicy_dl_config}, 552 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t), 553 ibd_get_partition_info, NULL} 554 }; 555 556 /* 557 * Fill/clear <scope> and <p_key> in multicast/broadcast address 558 */ 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 560 { \ 561 *(uint32_t *)((char *)(maddr) + 4) |= \ 562 htonl((uint32_t)(scope) << 16); \ 563 *(uint32_t *)((char *)(maddr) + 8) |= \ 564 htonl((uint32_t)(pkey) << 16); \ 565 } 566 567 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 568 { \ 569 *(uint32_t *)((char *)(maddr) + 4) &= \ 570 htonl(~((uint32_t)0xF << 16)); \ 571 *(uint32_t *)((char *)(maddr) + 8) &= \ 572 htonl(~((uint32_t)0xFFFF << 16)); \ 573 } 574 575 /* 576 * Rudimentary debugging support 577 */ 578 #ifdef DEBUG 579 int ibd_debuglevel = 100; 580 void 581 debug_print(int l, char *fmt, ...) 582 { 583 va_list ap; 584 585 if (l < ibd_debuglevel) 586 return; 587 va_start(ap, fmt); 588 vcmn_err(CE_CONT, fmt, ap); 589 va_end(ap); 590 } 591 #endif 592 593 /* 594 * Common routine to print warning messages; adds in hca guid, port number 595 * and pkey to be able to identify the IBA interface. 596 */ 597 void 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 599 { 600 ib_guid_t hca_guid; 601 char ibd_print_buf[MAXNAMELEN + 256]; 602 int len; 603 va_list ap; 604 char part_name[MAXNAMELEN]; 605 datalink_id_t linkid = state->id_plinkid; 606 607 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 608 0, "hca-guid", 0); 609 (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL); 610 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 611 "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ", 612 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 613 (u_longlong_t)hca_guid, state->id_port, state->id_pkey, 614 part_name); 615 va_start(ap, fmt); 616 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 617 fmt, ap); 618 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 619 va_end(ap); 620 } 621 622 /* 623 * Warlock directives 624 */ 625 626 /* 627 * id_lso_lock 628 * 629 * state->id_lso->bkt_nfree may be accessed without a lock to 630 * determine the threshold at which we have to ask the nw layer 631 * to resume transmission (see ibd_resume_transmission()). 
632 */ 633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 634 ibd_state_t::id_lso)) 635 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 636 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 637 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 638 639 /* 640 * id_scq_poll_lock 641 */ 642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 643 ibd_state_t::id_scq_poll_busy)) 644 645 /* 646 * id_txpost_lock 647 */ 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 649 ibd_state_t::id_tx_head)) 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 651 ibd_state_t::id_tx_busy)) 652 653 /* 654 * id_acache_req_lock 655 */ 656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 657 ibd_state_t::id_acache_req_cv)) 658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 659 ibd_state_t::id_req_list)) 660 _NOTE(SCHEME_PROTECTS_DATA("atomic", 661 ibd_acache_s::ac_ref)) 662 663 /* 664 * id_ac_mutex 665 * 666 * This mutex is actually supposed to protect id_ah_op as well, 667 * but this path of the code isn't clean (see update of id_ah_op 668 * in ibd_async_acache(), immediately after the call to 669 * ibd_async_mcache()). For now, we'll skip this check by 670 * declaring that id_ah_op is protected by some internal scheme 671 * that warlock isn't aware of. 672 */ 673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 674 ibd_state_t::id_ah_active)) 675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 676 ibd_state_t::id_ah_free)) 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 678 ibd_state_t::id_ah_addr)) 679 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 680 ibd_state_t::id_ah_op)) 681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 682 ibd_state_t::id_ah_error)) 683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 684 ibd_state_t::id_ac_hot_ace)) 685 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 686 687 /* 688 * id_mc_mutex 689 */ 690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 691 ibd_state_t::id_mc_full)) 692 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 693 ibd_state_t::id_mc_non)) 694 695 /* 696 * id_trap_lock 697 */ 698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 699 ibd_state_t::id_trap_cv)) 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 701 ibd_state_t::id_trap_stop)) 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 703 ibd_state_t::id_trap_inprog)) 704 705 /* 706 * id_prom_op 707 */ 708 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 709 ibd_state_t::id_prom_op)) 710 711 /* 712 * id_sched_lock 713 */ 714 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 715 ibd_state_t::id_sched_needed)) 716 717 /* 718 * id_link_mutex 719 */ 720 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 721 ibd_state_t::id_link_state)) 722 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 723 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 724 ibd_state_t::id_link_speed)) 725 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 726 727 /* 728 * id_tx_list.dl_mutex 729 */ 730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 731 ibd_state_t::id_tx_list.dl_head)) 732 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 733 ibd_state_t::id_tx_list.dl_pending_sends)) 734 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 735 ibd_state_t::id_tx_list.dl_cnt)) 736 737 /* 738 * id_rx_list.dl_mutex 739 */ 740 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 741 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 742 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 743 ibd_state_t::id_rx_list.dl_cnt)) 744 745 /* 746 * rc_timeout_lock 747 */ 748 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, 749 ibd_state_t::rc_timeout_start)) 750 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock, 751 ibd_state_t::rc_timeout)) 752 753 754 /* 755 * Items protected by atomic updates 756 */ 757 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 758 ibd_state_s::id_brd_rcv 759 ibd_state_s::id_brd_xmt 760 ibd_state_s::id_multi_rcv 761 ibd_state_s::id_multi_xmt 762 ibd_state_s::id_num_intrs 763 ibd_state_s::id_rcv_bytes 764 ibd_state_s::id_rcv_pkt 765 ibd_state_s::id_rx_post_queue_index 766 ibd_state_s::id_tx_short 767 ibd_state_s::id_xmt_bytes 768 ibd_state_s::id_xmt_pkt 769 ibd_state_s::rc_rcv_trans_byte 770 ibd_state_s::rc_rcv_trans_pkt 771 ibd_state_s::rc_rcv_copy_byte 772 ibd_state_s::rc_rcv_copy_pkt 773 ibd_state_s::rc_xmt_bytes 774 ibd_state_s::rc_xmt_small_pkt 775 ibd_state_s::rc_xmt_fragmented_pkt 776 ibd_state_s::rc_xmt_map_fail_pkt 777 ibd_state_s::rc_xmt_map_succ_pkt 778 ibd_rc_chan_s::rcq_invoking)) 779 780 /* 781 * Non-mutex protection schemes for data elements. Almost all of 782 * these are non-shared items. 783 */ 784 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 785 callb_cpr 786 ib_gid_s 787 ib_header_info 788 ibd_acache_rq 789 ibd_acache_s::ac_mce 790 ibd_acache_s::ac_chan 791 ibd_mcache::mc_fullreap 792 ibd_mcache::mc_jstate 793 ibd_mcache::mc_req 794 ibd_rwqe_s 795 ibd_swqe_s 796 ibd_wqe_s 797 ibt_wr_ds_s::ds_va 798 ibt_wr_lso_s 799 ipoib_mac::ipoib_qpn 800 mac_capab_lso_s 801 msgb::b_next 802 msgb::b_cont 803 msgb::b_rptr 804 msgb::b_wptr 805 ibd_state_s::id_bgroup_created 806 ibd_state_s::id_mac_state 807 ibd_state_s::id_mtu 808 ibd_state_s::id_ud_num_rwqe 809 ibd_state_s::id_ud_num_swqe 810 ibd_state_s::id_qpnum 811 ibd_state_s::id_rcq_hdl 812 ibd_state_s::id_rx_buf_sz 813 ibd_state_s::id_rx_bufs 814 ibd_state_s::id_rx_mr_hdl 815 ibd_state_s::id_rx_wqes 816 ibd_state_s::id_rxwcs 817 ibd_state_s::id_rxwcs_size 818 ibd_state_s::id_rx_nqueues 819 ibd_state_s::id_rx_queues 820 ibd_state_s::id_scope 821 ibd_state_s::id_scq_hdl 822 ibd_state_s::id_tx_buf_sz 823 ibd_state_s::id_tx_bufs 824 ibd_state_s::id_tx_mr_hdl 825 ibd_state_s::id_tx_rel_list.dl_cnt 826 ibd_state_s::id_tx_wqes 827 ibd_state_s::id_txwcs 828 ibd_state_s::id_txwcs_size 829 ibd_state_s::rc_listen_hdl 830 ibd_state_s::rc_listen_hdl_OFED_interop 831 ibd_state_s::rc_srq_size 832 ibd_state_s::rc_srq_rwqes 833 ibd_state_s::rc_srq_rx_bufs 834 ibd_state_s::rc_srq_rx_mr_hdl 835 ibd_state_s::rc_tx_largebuf_desc_base 836 ibd_state_s::rc_tx_mr_bufs 837 ibd_state_s::rc_tx_mr_hdl 838 ipha_s 839 icmph_s 840 ibt_path_info_s::pi_sid 841 ibd_rc_chan_s::ace 842 ibd_rc_chan_s::chan_hdl 843 ibd_rc_chan_s::state 844 ibd_rc_chan_s::chan_state 845 ibd_rc_chan_s::is_tx_chan 846 ibd_rc_chan_s::rcq_hdl 847 ibd_rc_chan_s::rcq_size 848 ibd_rc_chan_s::scq_hdl 849 ibd_rc_chan_s::scq_size 850 ibd_rc_chan_s::rx_bufs 851 ibd_rc_chan_s::rx_mr_hdl 852 ibd_rc_chan_s::rx_rwqes 853 ibd_rc_chan_s::tx_wqes 854 ibd_rc_chan_s::tx_mr_bufs 855 ibd_rc_chan_s::tx_mr_hdl 856 ibd_rc_chan_s::tx_rel_list.dl_cnt 857 ibd_rc_chan_s::is_used 858 ibd_rc_tx_largebuf_s::lb_buf 859 ibd_rc_msg_hello_s 860 ibt_cm_return_args_s)) 861 862 /* 863 * ibd_rc_chan_s::next is protected by two mutexes: 864 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex 865 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. 
866 */ 867 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes", 868 ibd_rc_chan_s::next)) 869 870 /* 871 * ibd_state_s.rc_tx_large_bufs_lock 872 */ 873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 874 ibd_state_s::rc_tx_largebuf_free_head)) 875 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 876 ibd_state_s::rc_tx_largebuf_nfree)) 877 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 878 ibd_rc_tx_largebuf_s::lb_next)) 879 880 /* 881 * ibd_acache_s.tx_too_big_mutex 882 */ 883 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex, 884 ibd_acache_s::tx_too_big_ongoing)) 885 886 /* 887 * tx_wqe_list.dl_mutex 888 */ 889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 890 ibd_rc_chan_s::tx_wqe_list.dl_head)) 891 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 892 ibd_rc_chan_s::tx_wqe_list.dl_pending_sends)) 893 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 894 ibd_rc_chan_s::tx_wqe_list.dl_cnt)) 895 896 /* 897 * ibd_state_s.rc_ace_recycle_lock 898 */ 899 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock, 900 ibd_state_s::rc_ace_recycle)) 901 902 /* 903 * rc_srq_rwqe_list.dl_mutex 904 */ 905 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 906 ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding)) 907 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 908 ibd_state_t::rc_srq_rwqe_list.dl_cnt)) 909 910 /* 911 * Non-mutex protection schemes for data elements. They are counters 912 * for problem diagnosis. Don't need be protected. 913 */ 914 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 915 ibd_state_s::rc_rcv_alloc_fail 916 ibd_state_s::rc_rcq_err 917 ibd_state_s::rc_ace_not_found 918 ibd_state_s::rc_xmt_drop_too_long_pkt 919 ibd_state_s::rc_xmt_icmp_too_long_pkt 920 ibd_state_s::rc_xmt_reenter_too_long_pkt 921 ibd_state_s::rc_swqe_short 922 ibd_state_s::rc_swqe_mac_update 923 ibd_state_s::rc_xmt_buf_short 924 ibd_state_s::rc_xmt_buf_mac_update 925 ibd_state_s::rc_scq_no_swqe 926 ibd_state_s::rc_scq_no_largebuf 927 ibd_state_s::rc_conn_succ 928 ibd_state_s::rc_conn_fail 929 ibd_state_s::rc_null_conn 930 ibd_state_s::rc_no_estab_conn 931 ibd_state_s::rc_act_close 932 ibd_state_s::rc_pas_close 933 ibd_state_s::rc_delay_ace_recycle 934 ibd_state_s::rc_act_close_simultaneous 935 ibd_state_s::rc_act_close_not_clean 936 ibd_state_s::rc_pas_close_rcq_invoking 937 ibd_state_s::rc_reset_cnt 938 ibd_state_s::rc_timeout_act 939 ibd_state_s::rc_timeout_pas 940 ibd_state_s::rc_stop_connect)) 941 942 #ifdef DEBUG 943 /* 944 * Non-mutex protection schemes for data elements. They are counters 945 * for problem diagnosis. Don't need be protected. 
946 */ 947 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 948 ibd_state_s::rc_rwqe_short 949 ibd_rc_stat_s::rc_rcv_trans_byte 950 ibd_rc_stat_s::rc_rcv_trans_pkt 951 ibd_rc_stat_s::rc_rcv_copy_byte 952 ibd_rc_stat_s::rc_rcv_copy_pkt 953 ibd_rc_stat_s::rc_rcv_alloc_fail 954 ibd_rc_stat_s::rc_rcq_err 955 ibd_rc_stat_s::rc_rwqe_short 956 ibd_rc_stat_s::rc_xmt_bytes 957 ibd_rc_stat_s::rc_xmt_small_pkt 958 ibd_rc_stat_s::rc_xmt_fragmented_pkt 959 ibd_rc_stat_s::rc_xmt_map_fail_pkt 960 ibd_rc_stat_s::rc_xmt_map_succ_pkt 961 ibd_rc_stat_s::rc_ace_not_found 962 ibd_rc_stat_s::rc_scq_no_swqe 963 ibd_rc_stat_s::rc_scq_no_largebuf 964 ibd_rc_stat_s::rc_swqe_short 965 ibd_rc_stat_s::rc_swqe_mac_update 966 ibd_rc_stat_s::rc_xmt_buf_short 967 ibd_rc_stat_s::rc_xmt_buf_mac_update 968 ibd_rc_stat_s::rc_conn_succ 969 ibd_rc_stat_s::rc_conn_fail 970 ibd_rc_stat_s::rc_null_conn 971 ibd_rc_stat_s::rc_no_estab_conn 972 ibd_rc_stat_s::rc_act_close 973 ibd_rc_stat_s::rc_pas_close 974 ibd_rc_stat_s::rc_delay_ace_recycle 975 ibd_rc_stat_s::rc_act_close_simultaneous 976 ibd_rc_stat_s::rc_reset_cnt 977 ibd_rc_stat_s::rc_timeout_act 978 ibd_rc_stat_s::rc_timeout_pas)) 979 #endif 980 981 int 982 _init() 983 { 984 int status; 985 986 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 987 PAGESIZE), 0); 988 if (status != 0) { 989 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 990 return (status); 991 } 992 993 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL); 994 995 mac_init_ops(&ibd_dev_ops, "ibp"); 996 status = mod_install(&ibd_modlinkage); 997 if (status != 0) { 998 DPRINT(10, "_init:failed in mod_install()"); 999 ddi_soft_state_fini(&ibd_list); 1000 mac_fini_ops(&ibd_dev_ops); 1001 return (status); 1002 } 1003 1004 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); 1005 mutex_enter(&ibd_gstate.ig_mutex); 1006 ibd_gstate.ig_ibt_hdl = NULL; 1007 ibd_gstate.ig_ibt_hdl_ref_cnt = 0; 1008 ibd_gstate.ig_service_list = NULL; 1009 mutex_exit(&ibd_gstate.ig_mutex); 1010 1011 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list, 1012 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) { 1013 return (EIO); 1014 } 1015 1016 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr); 1017 1018 #ifdef IBD_LOGGING 1019 ibd_log_init(); 1020 #endif 1021 return (0); 1022 } 1023 1024 int 1025 _info(struct modinfo *modinfop) 1026 { 1027 return (mod_info(&ibd_modlinkage, modinfop)); 1028 } 1029 1030 int 1031 _fini() 1032 { 1033 int status; 1034 1035 status = mod_remove(&ibd_modlinkage); 1036 if (status != 0) 1037 return (status); 1038 1039 ibt_unregister_part_attr_cb(); 1040 1041 mac_fini_ops(&ibd_dev_ops); 1042 mutex_destroy(&ibd_objlist_lock); 1043 ddi_soft_state_fini(&ibd_list); 1044 mutex_destroy(&ibd_gstate.ig_mutex); 1045 #ifdef IBD_LOGGING 1046 ibd_log_fini(); 1047 #endif 1048 return (0); 1049 } 1050 1051 /* 1052 * Convert the GID part of the mac address from network byte order 1053 * to host order. 1054 */ 1055 static void 1056 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 1057 { 1058 ib_sn_prefix_t nbopref; 1059 ib_guid_t nboguid; 1060 1061 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 1062 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 1063 dgid->gid_prefix = b2h64(nbopref); 1064 dgid->gid_guid = b2h64(nboguid); 1065 } 1066 1067 /* 1068 * Create the IPoIB address in network byte order from host order inputs. 
1069 */ 1070 static void 1071 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 1072 ib_guid_t guid) 1073 { 1074 ib_sn_prefix_t nbopref; 1075 ib_guid_t nboguid; 1076 1077 mac->ipoib_qpn = htonl(qpn); 1078 nbopref = h2b64(prefix); 1079 nboguid = h2b64(guid); 1080 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 1081 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 1082 } 1083 1084 /* 1085 * Send to the appropriate all-routers group when the IBA multicast group 1086 * does not exist, based on whether the target group is v4 or v6. 1087 */ 1088 static boolean_t 1089 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 1090 ipoib_mac_t *rmac) 1091 { 1092 boolean_t retval = B_TRUE; 1093 uint32_t adjscope = state->id_scope << 16; 1094 uint32_t topword; 1095 1096 /* 1097 * Copy the first 4 bytes in without assuming any alignment of 1098 * input mac address; this will have IPoIB signature, flags and 1099 * scope bits. 1100 */ 1101 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 1102 topword = ntohl(topword); 1103 1104 /* 1105 * Generate proper address for IPv4/v6, adding in the Pkey properly. 1106 */ 1107 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 1108 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 1109 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 1110 ((uint32_t)(state->id_pkey << 16))), 1111 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 1112 else 1113 /* 1114 * Does not have proper bits in the mgid address. 1115 */ 1116 retval = B_FALSE; 1117 1118 return (retval); 1119 } 1120 1121 /* 1122 * Membership states for different mcg's are tracked by two lists: 1123 * the "non" list is used for promiscuous mode, when all mcg traffic 1124 * needs to be inspected. This type of membership is never used for 1125 * transmission, so there can not be an AH in the active list 1126 * corresponding to a member in this list. This list does not need 1127 * any protection, since all operations are performed by the async 1128 * thread. 1129 * 1130 * "Full" and "SendOnly" membership is tracked using a single list, 1131 * the "full" list. This is because this single list can then be 1132 * searched during transmit to a multicast group (if an AH for the 1133 * mcg is not found in the active list), since at least one type 1134 * of membership must be present before initiating the transmit. 1135 * This list is also emptied during driver detach, since sendonly 1136 * membership acquired during transmit is dropped at detach time 1137 * along with ipv4 broadcast full membership. Insert/deletes to 1138 * this list are done only by the async thread, but it is also 1139 * searched in program context (see multicast disable case), thus 1140 * the id_mc_mutex protects the list. The driver detach path also 1141 * deconstructs the "full" list, but it ensures that the async 1142 * thread will not be accessing the list (by blocking out mcg 1143 * trap handling and making sure no more Tx reaping will happen). 1144 * 1145 * Currently, an IBA attach is done in the SendOnly case too, 1146 * although this is not required. 
1147 */ 1148 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1149 list_insert_head(&state->id_mc_full, mce) 1150 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1151 list_insert_head(&state->id_mc_non, mce) 1152 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1153 ibd_mcache_find(mgid, &state->id_mc_full) 1154 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1155 ibd_mcache_find(mgid, &state->id_mc_non) 1156 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1157 list_remove(&state->id_mc_full, mce) 1158 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1159 list_remove(&state->id_mc_non, mce) 1160 1161 static void * 1162 list_get_head(list_t *list) 1163 { 1164 list_node_t *lhead = list_head(list); 1165 1166 if (lhead != NULL) 1167 list_remove(list, lhead); 1168 return (lhead); 1169 } 1170 1171 /* 1172 * This is always guaranteed to be able to queue the work. 1173 */ 1174 void 1175 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1176 { 1177 /* Initialize request */ 1178 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1179 ptr->rq_op = op; 1180 1181 /* 1182 * Queue provided slot onto request pool. 1183 */ 1184 mutex_enter(&state->id_acache_req_lock); 1185 list_insert_tail(&state->id_req_list, ptr); 1186 1187 /* Go, fetch, async thread */ 1188 cv_signal(&state->id_acache_req_cv); 1189 mutex_exit(&state->id_acache_req_lock); 1190 } 1191 1192 /* 1193 * Main body of the per interface async thread. 1194 */ 1195 static void 1196 ibd_async_work(ibd_state_t *state) 1197 { 1198 ibd_req_t *ptr; 1199 callb_cpr_t cprinfo; 1200 1201 mutex_enter(&state->id_acache_req_lock); 1202 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1203 callb_generic_cpr, "ibd_async_work"); 1204 1205 for (;;) { 1206 ptr = list_get_head(&state->id_req_list); 1207 if (ptr != NULL) { 1208 mutex_exit(&state->id_acache_req_lock); 1209 1210 /* 1211 * If we are in late hca initialization mode, do not 1212 * process any other async request other than TRAP. TRAP 1213 * is used for indicating creation of a broadcast group; 1214 * in which case, we need to join/create the group. 1215 */ 1216 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 1217 (ptr->rq_op != IBD_ASYNC_TRAP)) { 1218 goto free_req_and_continue; 1219 } 1220 1221 /* 1222 * Once we have done the operation, there is no 1223 * guarantee the request slot is going to be valid, 1224 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1225 * TRAP). 1226 * 1227 * Perform the request. 1228 */ 1229 switch (ptr->rq_op) { 1230 case IBD_ASYNC_GETAH: 1231 ibd_async_acache(state, &ptr->rq_mac); 1232 break; 1233 case IBD_ASYNC_JOIN: 1234 case IBD_ASYNC_LEAVE: 1235 ibd_async_multicast(state, 1236 ptr->rq_gid, ptr->rq_op); 1237 break; 1238 case IBD_ASYNC_PROMON: 1239 ibd_async_setprom(state); 1240 break; 1241 case IBD_ASYNC_PROMOFF: 1242 ibd_async_unsetprom(state); 1243 break; 1244 case IBD_ASYNC_REAP: 1245 ibd_async_reap_group(state, 1246 ptr->rq_ptr, ptr->rq_gid, 1247 IB_MC_JSTATE_FULL); 1248 /* 1249 * the req buf contains in mce 1250 * structure, so we do not need 1251 * to free it here. 
1252 */ 1253 ptr = NULL; 1254 break; 1255 case IBD_ASYNC_TRAP: 1256 ibd_async_trap(state, ptr); 1257 break; 1258 case IBD_ASYNC_SCHED: 1259 ibd_async_txsched(state); 1260 break; 1261 case IBD_ASYNC_LINK: 1262 ibd_async_link(state, ptr); 1263 break; 1264 case IBD_ASYNC_EXIT: 1265 mutex_enter(&state->id_acache_req_lock); 1266 #ifndef __lock_lint 1267 CALLB_CPR_EXIT(&cprinfo); 1268 #else 1269 mutex_exit(&state->id_acache_req_lock); 1270 #endif 1271 return; 1272 case IBD_ASYNC_RC_TOO_BIG: 1273 ibd_async_rc_process_too_big(state, 1274 ptr); 1275 break; 1276 case IBD_ASYNC_RC_CLOSE_ACT_CHAN: 1277 ibd_async_rc_close_act_chan(state, ptr); 1278 break; 1279 case IBD_ASYNC_RC_RECYCLE_ACE: 1280 ibd_async_rc_recycle_ace(state, ptr); 1281 break; 1282 case IBD_ASYNC_RC_CLOSE_PAS_CHAN: 1283 (void) ibd_rc_pas_close(ptr->rq_ptr, 1284 B_TRUE, B_TRUE); 1285 break; 1286 } 1287 free_req_and_continue: 1288 if (ptr != NULL) 1289 kmem_cache_free(state->id_req_kmc, ptr); 1290 1291 mutex_enter(&state->id_acache_req_lock); 1292 } else { 1293 #ifndef __lock_lint 1294 /* 1295 * Nothing to do: wait till new request arrives. 1296 */ 1297 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1298 cv_wait(&state->id_acache_req_cv, 1299 &state->id_acache_req_lock); 1300 CALLB_CPR_SAFE_END(&cprinfo, 1301 &state->id_acache_req_lock); 1302 #endif 1303 } 1304 } 1305 1306 /*NOTREACHED*/ 1307 _NOTE(NOT_REACHED) 1308 } 1309 1310 /* 1311 * Return when it is safe to queue requests to the async daemon; primarily 1312 * for subnet trap and async event handling. Disallow requests before the 1313 * daemon is created, and when interface deinitilization starts. 1314 */ 1315 static boolean_t 1316 ibd_async_safe(ibd_state_t *state) 1317 { 1318 mutex_enter(&state->id_trap_lock); 1319 if (state->id_trap_stop) { 1320 mutex_exit(&state->id_trap_lock); 1321 return (B_FALSE); 1322 } 1323 state->id_trap_inprog++; 1324 mutex_exit(&state->id_trap_lock); 1325 return (B_TRUE); 1326 } 1327 1328 /* 1329 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 1330 * trap or event handling to complete to kill the async thread and deconstruct 1331 * the mcg/ace list. 1332 */ 1333 static void 1334 ibd_async_done(ibd_state_t *state) 1335 { 1336 mutex_enter(&state->id_trap_lock); 1337 if (--state->id_trap_inprog == 0) 1338 cv_signal(&state->id_trap_cv); 1339 mutex_exit(&state->id_trap_lock); 1340 } 1341 1342 /* 1343 * Hash functions: 1344 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1345 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1346 * These operate on mac addresses input into ibd_send, but there is no 1347 * guarantee on the alignment of the ipoib_mac_t structure. 1348 */ 1349 /*ARGSUSED*/ 1350 static uint_t 1351 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1352 { 1353 ulong_t ptraddr = (ulong_t)key; 1354 uint_t hval; 1355 1356 /* 1357 * If the input address is 4 byte aligned, we can just dereference 1358 * it. This is most common, since IP will send in a 4 byte aligned 1359 * IP header, which implies the 24 byte IPoIB psuedo header will be 1360 * 4 byte aligned too. 
1361 */ 1362 if ((ptraddr & 3) == 0) 1363 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1364 1365 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1366 return (hval); 1367 } 1368 1369 static int 1370 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1371 { 1372 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1373 return (0); 1374 else 1375 return (1); 1376 } 1377 1378 /* 1379 * Initialize all the per interface caches and lists; AH cache, 1380 * MCG list etc. 1381 */ 1382 static int 1383 ibd_acache_init(ibd_state_t *state) 1384 { 1385 ibd_ace_t *ce; 1386 int i; 1387 1388 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1389 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1390 mutex_enter(&state->id_ac_mutex); 1391 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1392 offsetof(ibd_ace_t, ac_list)); 1393 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1394 offsetof(ibd_ace_t, ac_list)); 1395 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1396 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor, 1397 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1398 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1399 offsetof(ibd_mce_t, mc_list)); 1400 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1401 offsetof(ibd_mce_t, mc_list)); 1402 state->id_ac_hot_ace = NULL; 1403 1404 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1405 state->id_num_ah, KM_SLEEP); 1406 for (i = 0; i < state->id_num_ah; i++, ce++) { 1407 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1408 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1409 mutex_exit(&state->id_ac_mutex); 1410 ibd_acache_fini(state); 1411 return (DDI_FAILURE); 1412 } else { 1413 CLEAR_REFCYCLE(ce); 1414 ce->ac_mce = NULL; 1415 mutex_init(&ce->tx_too_big_mutex, NULL, 1416 MUTEX_DRIVER, NULL); 1417 IBD_ACACHE_INSERT_FREE(state, ce); 1418 } 1419 } 1420 mutex_exit(&state->id_ac_mutex); 1421 return (DDI_SUCCESS); 1422 } 1423 1424 static void 1425 ibd_acache_fini(ibd_state_t *state) 1426 { 1427 ibd_ace_t *ptr; 1428 1429 mutex_enter(&state->id_ac_mutex); 1430 1431 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1432 ASSERT(GET_REF(ptr) == 0); 1433 mutex_destroy(&ptr->tx_too_big_mutex); 1434 (void) ibt_free_ud_dest(ptr->ac_dest); 1435 } 1436 1437 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1438 ASSERT(GET_REF(ptr) == 0); 1439 mutex_destroy(&ptr->tx_too_big_mutex); 1440 (void) ibt_free_ud_dest(ptr->ac_dest); 1441 } 1442 1443 list_destroy(&state->id_ah_free); 1444 list_destroy(&state->id_ah_active); 1445 list_destroy(&state->id_mc_full); 1446 list_destroy(&state->id_mc_non); 1447 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah); 1448 mutex_exit(&state->id_ac_mutex); 1449 mutex_destroy(&state->id_ac_mutex); 1450 mutex_destroy(&state->id_mc_mutex); 1451 } 1452 1453 /* 1454 * Search AH active hash list for a cached path to input destination. 1455 * If we are "just looking", hold == F. When we are in the Tx path, 1456 * we set hold == T to grab a reference on the AH so that it can not 1457 * be recycled to a new destination while the Tx request is posted. 1458 */ 1459 ibd_ace_t * 1460 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1461 { 1462 ibd_ace_t *ptr; 1463 1464 ASSERT(mutex_owned(&state->id_ac_mutex)); 1465 1466 /* 1467 * Do hash search. 
1468 */ 1469 if (mod_hash_find(state->id_ah_active_hash, 1470 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1471 if (hold) 1472 INC_REF(ptr, num); 1473 return (ptr); 1474 } 1475 return (NULL); 1476 } 1477 1478 /* 1479 * This is called by the tx side; if an initialized AH is found in 1480 * the active list, it is locked down and can be used; if no entry 1481 * is found, an async request is queued to do path resolution. 1482 */ 1483 static ibd_ace_t * 1484 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1485 { 1486 ibd_ace_t *ptr; 1487 ibd_req_t *req; 1488 1489 /* 1490 * Only attempt to print when we can; in the mdt pattr case, the 1491 * address is not aligned properly. 1492 */ 1493 if (((ulong_t)mac & 3) == 0) { 1494 DPRINT(4, 1495 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1496 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1497 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1498 htonl(mac->ipoib_gidsuff[1])); 1499 } 1500 1501 mutex_enter(&state->id_ac_mutex); 1502 1503 if (((ptr = state->id_ac_hot_ace) != NULL) && 1504 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1505 INC_REF(ptr, numwqe); 1506 mutex_exit(&state->id_ac_mutex); 1507 return (ptr); 1508 } 1509 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1510 state->id_ac_hot_ace = ptr; 1511 mutex_exit(&state->id_ac_mutex); 1512 return (ptr); 1513 } 1514 1515 /* 1516 * Implementation of a single outstanding async request; if 1517 * the operation is not started yet, queue a request and move 1518 * to ongoing state. Remember in id_ah_addr for which address 1519 * we are queueing the request, in case we need to flag an error; 1520 * Any further requests, for the same or different address, until 1521 * the operation completes, is sent back to GLDv3 to be retried. 1522 * The async thread will update id_ah_op with an error indication 1523 * or will set it to indicate the next look up can start; either 1524 * way, it will mac_tx_update() so that all blocked requests come 1525 * back here. 1526 */ 1527 *err = EAGAIN; 1528 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1529 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1530 if (req != NULL) { 1531 /* 1532 * We did not even find the entry; queue a request 1533 * for it. 1534 */ 1535 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1536 state->id_ah_op = IBD_OP_ONGOING; 1537 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1538 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1539 } 1540 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1541 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1542 /* 1543 * Check the status of the pathrecord lookup request 1544 * we had queued before. 1545 */ 1546 if (state->id_ah_op == IBD_OP_ERRORED) { 1547 *err = EFAULT; 1548 state->id_ah_error++; 1549 } else { 1550 /* 1551 * IBD_OP_ROUTERED case: We need to send to the 1552 * all-router MCG. If we can find the AH for 1553 * the mcg, the Tx will be attempted. If we 1554 * do not find the AH, we return NORESOURCES 1555 * to retry. 1556 */ 1557 ipoib_mac_t routermac; 1558 1559 (void) ibd_get_allroutergroup(state, mac, &routermac); 1560 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1561 numwqe); 1562 } 1563 state->id_ah_op = IBD_OP_NOTSTARTED; 1564 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1565 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1566 /* 1567 * This case can happen when we get a higher band 1568 * packet. 
The easiest way is to reset the state machine 1569 * to accommodate the higher priority packet. 1570 */ 1571 state->id_ah_op = IBD_OP_NOTSTARTED; 1572 } 1573 mutex_exit(&state->id_ac_mutex); 1574 1575 return (ptr); 1576 } 1577 1578 /* 1579 * Grab a not-currently-in-use AH/PathRecord from the active 1580 * list to recycle to a new destination. Only the async thread 1581 * executes this code. 1582 */ 1583 static ibd_ace_t * 1584 ibd_acache_get_unref(ibd_state_t *state) 1585 { 1586 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1587 boolean_t try_rc_chan_recycle = B_FALSE; 1588 1589 ASSERT(mutex_owned(&state->id_ac_mutex)); 1590 1591 /* 1592 * Do plain linear search. 1593 */ 1594 while (ptr != NULL) { 1595 /* 1596 * Note that it is possible that the "cycle" bit 1597 * is set on the AH w/o any reference count. The 1598 * mcg must have been deleted, and the tx cleanup 1599 * just decremented the reference count to 0, but 1600 * hasn't gotten around to grabbing the id_ac_mutex 1601 * to move the AH into the free list. 1602 */ 1603 if (GET_REF(ptr) == 0) { 1604 if (ptr->ac_chan != NULL) { 1605 ASSERT(state->id_enable_rc == B_TRUE); 1606 if (!try_rc_chan_recycle) { 1607 try_rc_chan_recycle = B_TRUE; 1608 ibd_rc_signal_ace_recycle(state, ptr); 1609 } 1610 } else { 1611 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1612 break; 1613 } 1614 } 1615 ptr = list_prev(&state->id_ah_active, ptr); 1616 } 1617 return (ptr); 1618 } 1619 1620 /* 1621 * Invoked to clean up AH from active list in case of multicast 1622 * disable and to handle sendonly memberships during mcg traps. 1623 * And for port up processing for multicast and unicast AHs. 1624 * Normally, the AH is taken off the active list, and put into 1625 * the free list to be recycled for a new destination. In case 1626 * Tx requests on the AH have not completed yet, the AH is marked 1627 * for reaping (which will put the AH on the free list) once the Tx's 1628 * complete; in this case, depending on the "force" input, we take 1629 * out the AH from the active list right now, or leave it also for 1630 * the reap operation. Returns TRUE if the AH is taken off the active 1631 * list (and either put into the free list right now, or arranged for 1632 * later), FALSE otherwise. 1633 */ 1634 boolean_t 1635 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1636 { 1637 ibd_ace_t *acactive; 1638 boolean_t ret = B_TRUE; 1639 1640 ASSERT(mutex_owned(&state->id_ac_mutex)); 1641 1642 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1643 1644 /* 1645 * Note that the AH might already have the cycle bit set 1646 * on it; this might happen if sequences of multicast 1647 * enables and disables are coming so fast, that posted 1648 * Tx's to the mcg have not completed yet, and the cycle 1649 * bit is set successively by each multicast disable. 1650 */ 1651 if (SET_CYCLE_IF_REF(acactive)) { 1652 if (!force) { 1653 /* 1654 * The ace is kept on the active list, further 1655 * Tx's can still grab a reference on it; the 1656 * ace is reaped when all pending Tx's 1657 * referencing the AH complete. 1658 */ 1659 ret = B_FALSE; 1660 } else { 1661 /* 1662 * In the mcg trap case, we always pull the 1663 * AH from the active list. And also the port 1664 * up multi/unicast case. 
1665 */ 1666 ASSERT(acactive->ac_chan == NULL); 1667 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1668 acactive->ac_mce = NULL; 1669 } 1670 } else { 1671 /* 1672 * Determined the ref count is 0, thus reclaim 1673 * immediately after pulling out the ace from 1674 * the active list. 1675 */ 1676 ASSERT(acactive->ac_chan == NULL); 1677 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1678 acactive->ac_mce = NULL; 1679 IBD_ACACHE_INSERT_FREE(state, acactive); 1680 } 1681 1682 } 1683 return (ret); 1684 } 1685 1686 /* 1687 * Helper function for async path record lookup. If we are trying to 1688 * Tx to a MCG, check our membership, possibly trying to join the 1689 * group if required. If that fails, try to send the packet to the 1690 * all router group (indicated by the redirect output), pointing 1691 * the input mac address to the router mcg address. 1692 */ 1693 static ibd_mce_t * 1694 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1695 { 1696 ib_gid_t mgid; 1697 ibd_mce_t *mce; 1698 ipoib_mac_t routermac; 1699 1700 *redirect = B_FALSE; 1701 ibd_n2h_gid(mac, &mgid); 1702 1703 /* 1704 * Check the FullMember+SendOnlyNonMember list. 1705 * Since we are the only one who manipulates the 1706 * id_mc_full list, no locks are needed. 1707 */ 1708 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1709 if (mce != NULL) { 1710 DPRINT(4, "ibd_async_mcache : already joined to group"); 1711 return (mce); 1712 } 1713 1714 /* 1715 * Not found; try to join(SendOnlyNonMember) and attach. 1716 */ 1717 DPRINT(4, "ibd_async_mcache : not joined to group"); 1718 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1719 NULL) { 1720 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1721 return (mce); 1722 } 1723 1724 /* 1725 * MCGroup not present; try to join the all-router group. If 1726 * any of the following steps succeed, we will be redirecting 1727 * to the all router group. 1728 */ 1729 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1730 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1731 return (NULL); 1732 *redirect = B_TRUE; 1733 ibd_n2h_gid(&routermac, &mgid); 1734 bcopy(&routermac, mac, IPOIB_ADDRL); 1735 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1736 mgid.gid_prefix, mgid.gid_guid); 1737 1738 /* 1739 * Are we already joined to the router group? 1740 */ 1741 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1742 DPRINT(4, "ibd_async_mcache : using already joined router" 1743 "group\n"); 1744 return (mce); 1745 } 1746 1747 /* 1748 * Can we join(SendOnlyNonMember) the router group? 1749 */ 1750 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1751 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1752 NULL) { 1753 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1754 return (mce); 1755 } 1756 1757 return (NULL); 1758 } 1759 1760 /* 1761 * Async path record lookup code. 1762 */ 1763 static void 1764 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1765 { 1766 ibd_ace_t *ce; 1767 ibd_mce_t *mce = NULL; 1768 ibt_path_attr_t path_attr; 1769 ibt_path_info_t path_info; 1770 ib_gid_t destgid; 1771 char ret = IBD_OP_NOTSTARTED; 1772 1773 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1774 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1775 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1776 htonl(mac->ipoib_gidsuff[1])); 1777 1778 /* 1779 * Check whether we are trying to transmit to a MCG. 1780 * In that case, we need to make sure we are a member of 1781 * the MCG. 
1782 */ 1783 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1784 boolean_t redirected; 1785 1786 /* 1787 * If we can not find or join the group or even 1788 * redirect, error out. 1789 */ 1790 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1791 NULL) { 1792 state->id_ah_op = IBD_OP_ERRORED; 1793 return; 1794 } 1795 1796 /* 1797 * If we got redirected, we need to determine whether 1798 * the AH for the new mcg is in the cache already, and 1799 * not pull it in then; otherwise proceed to get the 1800 * path for the new mcg. There is no guarantee that 1801 * if the AH is currently in the cache, it will still be 1802 * there when we look in ibd_acache_lookup(), but that's 1803 * okay, we will come back here. 1804 */ 1805 if (redirected) { 1806 ret = IBD_OP_ROUTERED; 1807 DPRINT(4, "ibd_async_acache : redirected to " 1808 "%08X:%08X:%08X:%08X:%08X", 1809 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1810 htonl(mac->ipoib_gidpref[1]), 1811 htonl(mac->ipoib_gidsuff[0]), 1812 htonl(mac->ipoib_gidsuff[1])); 1813 1814 mutex_enter(&state->id_ac_mutex); 1815 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1816 state->id_ah_op = IBD_OP_ROUTERED; 1817 mutex_exit(&state->id_ac_mutex); 1818 DPRINT(4, "ibd_async_acache : router AH found"); 1819 return; 1820 } 1821 mutex_exit(&state->id_ac_mutex); 1822 } 1823 } 1824 1825 /* 1826 * Get an AH from the free list. 1827 */ 1828 mutex_enter(&state->id_ac_mutex); 1829 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1830 /* 1831 * No free ones; try to grab an unreferenced active 1832 * one. Maybe we need to make the active list LRU, 1833 * but that will create more work for Tx callbacks. 1834 * Is there a way of not having to pull out the 1835 * entry from the active list, but just indicate it 1836 * is being recycled? Yes, but that creates one more 1837 * check in the fast lookup path. 1838 */ 1839 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1840 /* 1841 * Pretty serious shortage now. 1842 */ 1843 state->id_ah_op = IBD_OP_NOTSTARTED; 1844 mutex_exit(&state->id_ac_mutex); 1845 DPRINT(10, "ibd_async_acache : failed to find AH " 1846 "slot\n"); 1847 return; 1848 } 1849 /* 1850 * We could check whether ac_mce points to a SendOnly 1851 * member and drop that membership now. Or do it lazily 1852 * at detach time. 1853 */ 1854 ce->ac_mce = NULL; 1855 } 1856 mutex_exit(&state->id_ac_mutex); 1857 ASSERT(ce->ac_mce == NULL); 1858 1859 /* 1860 * Update the entry. 1861 */ 1862 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1863 1864 bzero(&path_info, sizeof (path_info)); 1865 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1866 path_attr.pa_sgid = state->id_sgid; 1867 path_attr.pa_num_dgids = 1; 1868 ibd_n2h_gid(&ce->ac_mac, &destgid); 1869 path_attr.pa_dgids = &destgid; 1870 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1871 path_attr.pa_pkey = state->id_pkey; 1872 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1, 1873 &path_info, NULL) != IBT_SUCCESS) { 1874 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1875 goto error; 1876 } 1877 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1878 ntohl(ce->ac_mac.ipoib_qpn), 1879 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1880 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1881 goto error; 1882 } 1883 1884 /* 1885 * mce is set whenever an AH is being associated with a 1886 * MCG; this will come in handy when we leave the MCG. The 1887 * lock protects Tx fastpath from scanning the active list. 
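 * ac_mce stays NULL for unicast destinations; it is cleared again when the ace is recycled in ibd_acache_recycle() or when the ace is reused for a new destination above.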
1888 */ 1889 if (mce != NULL) 1890 ce->ac_mce = mce; 1891 1892 /* 1893 * initiate a RC mode connection for unicast address 1894 */ 1895 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && 1896 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { 1897 ASSERT(ce->ac_chan == NULL); 1898 DPRINT(10, "ibd_async_acache: call " 1899 "ibd_rc_try_connect(ace=%p)", ce); 1900 ibd_rc_try_connect(state, ce, &path_info); 1901 if (ce->ac_chan == NULL) { 1902 DPRINT(10, "ibd_async_acache: fail to setup RC" 1903 " channel"); 1904 state->rc_conn_fail++; 1905 goto error; 1906 } 1907 } 1908 1909 mutex_enter(&state->id_ac_mutex); 1910 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1911 state->id_ah_op = ret; 1912 mutex_exit(&state->id_ac_mutex); 1913 return; 1914 error: 1915 /* 1916 * We might want to drop SendOnly membership here if we 1917 * joined above. The lock protects Tx callbacks inserting 1918 * into the free list. 1919 */ 1920 mutex_enter(&state->id_ac_mutex); 1921 state->id_ah_op = IBD_OP_ERRORED; 1922 IBD_ACACHE_INSERT_FREE(state, ce); 1923 mutex_exit(&state->id_ac_mutex); 1924 } 1925 1926 /* 1927 * While restoring port's presence on the subnet on a port up, it is possible 1928 * that the port goes down again. 1929 */ 1930 static void 1931 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1932 { 1933 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1934 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1935 LINK_STATE_UP; 1936 ibd_mce_t *mce, *pmce; 1937 ibd_ace_t *ace, *pace; 1938 1939 DPRINT(10, "ibd_async_link(): %d", opcode); 1940 1941 /* 1942 * On a link up, revalidate the link speed/width. No point doing 1943 * this on a link down, since we will be unable to do SA operations, 1944 * defaulting to the lowest speed. Also notice that we update our 1945 * notion of speed before calling mac_link_update(), which will do 1946 * necessary higher level notifications for speed changes. 1947 */ 1948 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1949 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1950 state->id_link_speed = ibd_get_portspeed(state); 1951 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1952 } 1953 1954 /* 1955 * Do all the work required to establish our presence on 1956 * the subnet. 1957 */ 1958 if (opcode == IBD_LINK_UP_ABSENT) { 1959 /* 1960 * If in promiscuous mode ... 1961 */ 1962 if (state->id_prom_op == IBD_OP_COMPLETED) { 1963 /* 1964 * Drop all nonmembership. 1965 */ 1966 ibd_async_unsetprom(state); 1967 1968 /* 1969 * Then, try to regain nonmembership to all mcg's. 1970 */ 1971 ibd_async_setprom(state); 1972 1973 } 1974 1975 /* 1976 * Drop all sendonly membership (which also gets rid of the 1977 * AHs); try to reacquire all full membership. 1978 */ 1979 mce = list_head(&state->id_mc_full); 1980 while ((pmce = mce) != NULL) { 1981 mce = list_next(&state->id_mc_full, mce); 1982 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1983 ibd_leave_group(state, 1984 pmce->mc_info.mc_adds_vect.av_dgid, 1985 IB_MC_JSTATE_SEND_ONLY_NON); 1986 else 1987 ibd_reacquire_group(state, pmce); 1988 } 1989 1990 /* 1991 * Recycle all active AHs to free list (and if there are 1992 * pending posts, make sure they will go into the free list 1993 * once the Tx's complete). Grab the lock to prevent 1994 * concurrent Tx's as well as Tx cleanups. 
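 * The walk below uses the pace/ace pointer pair so that the entry being processed can be pulled off the active list while the iteration continues from the already-fetched next element; RC-mode aces with an established channel are handed to ibd_rc_signal_act_close() rather than recycled directly.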
1995 */ 1996 mutex_enter(&state->id_ac_mutex); 1997 ace = list_head(&state->id_ah_active); 1998 while ((pace = ace) != NULL) { 1999 boolean_t cycled; 2000 2001 ace = list_next(&state->id_ah_active, ace); 2002 mce = pace->ac_mce; 2003 if (pace->ac_chan != NULL) { 2004 ASSERT(mce == NULL); 2005 ASSERT(state->id_enable_rc == B_TRUE); 2006 if (pace->ac_chan->chan_state == 2007 IBD_RC_STATE_ACT_ESTAB) { 2008 INC_REF(pace, 1); 2009 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 2010 pace->ac_chan->chan_state = 2011 IBD_RC_STATE_ACT_CLOSING; 2012 ibd_rc_signal_act_close(state, pace); 2013 } else { 2014 state->rc_act_close_simultaneous++; 2015 DPRINT(40, "ibd_async_link: other " 2016 "thread is closing it, ace=%p, " 2017 "ac_chan=%p, chan_state=%d", 2018 pace, pace->ac_chan, 2019 pace->ac_chan->chan_state); 2020 } 2021 } else { 2022 cycled = ibd_acache_recycle(state, 2023 &pace->ac_mac, B_TRUE); 2024 } 2025 /* 2026 * If this is for an mcg, it must be for a fullmember, 2027 * since we got rid of send-only members above when 2028 * processing the mce list. 2029 */ 2030 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2031 IB_MC_JSTATE_FULL))); 2032 2033 /* 2034 * Check if the fullmember mce needs to be torn down, 2035 * ie whether the DLPI disable has already been done. 2036 * If so, do some of the work of tx_cleanup, namely 2037 * causing leave (which will fail), detach and 2038 * mce-freeing. tx_cleanup will put the AH into free 2039 * list. The reason to duplicate some of this 2040 * tx_cleanup work is because we want to delete the 2041 * AH right now instead of waiting for tx_cleanup, to 2042 * force subsequent Tx's to reacquire an AH. 2043 */ 2044 if ((mce != NULL) && (mce->mc_fullreap)) 2045 ibd_async_reap_group(state, mce, 2046 mce->mc_info.mc_adds_vect.av_dgid, 2047 mce->mc_jstate); 2048 } 2049 mutex_exit(&state->id_ac_mutex); 2050 } 2051 2052 /* 2053 * mac handle is guaranteed to exist since driver does ibt_close_hca() 2054 * (which stops further events from being delivered) before 2055 * mac_unregister(). At this point, it is guaranteed that mac_register 2056 * has already been done. 2057 */ 2058 mutex_enter(&state->id_link_mutex); 2059 state->id_link_state = lstate; 2060 mac_link_update(state->id_mh, lstate); 2061 mutex_exit(&state->id_link_mutex); 2062 2063 ibd_async_done(state); 2064 } 2065 2066 /* 2067 * Check the pkey table to see if we can find the pkey we're looking for. 2068 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 2069 * failure. 2070 */ 2071 static int 2072 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 2073 uint16_t *pkix) 2074 { 2075 uint16_t ndx; 2076 2077 ASSERT(pkix != NULL); 2078 2079 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 2080 if (pkey_tbl[ndx] == pkey) { 2081 *pkix = ndx; 2082 return (0); 2083 } 2084 } 2085 return (-1); 2086 } 2087 2088 /* 2089 * Late HCA Initialization: 2090 * If plumb had succeeded without the availability of an active port or the 2091 * pkey, and either of their availability is now being indicated via PORT_UP 2092 * or PORT_CHANGE respectively, try a start of the interface. 2093 * 2094 * Normal Operation: 2095 * When the link is notified up, we need to do a few things, based 2096 * on the port's current p_init_type_reply claiming a reinit has been 2097 * done or not. The reinit steps are: 2098 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2099 * the old Pkey and GID0 are correct. 2100 * 2. Register for mcg traps (already done by ibmf). 2101 * 3. 
If PreservePresenceReply indicates the SM has restored port's presence 2102 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2103 * 4. Give up all sendonly memberships. 2104 * 5. Acquire all full memberships. 2105 * 6. In promiscuous mode, acquire all non memberships. 2106 * 7. Recycle all AHs to free list. 2107 */ 2108 static void 2109 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2110 { 2111 ibt_hca_portinfo_t *port_infop = NULL; 2112 ibt_status_t ibt_status; 2113 uint_t psize, port_infosz; 2114 ibd_link_op_t opcode; 2115 ibd_req_t *req; 2116 link_state_t new_link_state = LINK_STATE_UP; 2117 uint8_t itreply; 2118 uint16_t pkix; 2119 int ret; 2120 2121 /* 2122 * Let's not race with a plumb or an unplumb; if we detect a 2123 * pkey relocation event later on here, we may have to restart. 2124 */ 2125 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2126 2127 mutex_enter(&state->id_link_mutex); 2128 2129 /* 2130 * If the link state is unknown, a plumb has not yet been attempted 2131 * on the interface. Nothing to do. 2132 */ 2133 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2134 mutex_exit(&state->id_link_mutex); 2135 goto link_mod_return; 2136 } 2137 2138 /* 2139 * If link state is down because of plumb failure, and we are not in 2140 * late HCA init, and we were not successfully plumbed, nothing to do. 2141 */ 2142 if ((state->id_link_state == LINK_STATE_DOWN) && 2143 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) && 2144 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) { 2145 mutex_exit(&state->id_link_mutex); 2146 goto link_mod_return; 2147 } 2148 2149 /* 2150 * If this routine was called in response to a port down event, 2151 * we just need to see whether this change should be reported. 2152 */ 2153 if (code == IBT_ERROR_PORT_DOWN) { 2154 new_link_state = LINK_STATE_DOWN; 2155 goto update_link_state; 2156 } 2157 2158 /* 2159 * If it's not a port down event we've received, try to get the port 2160 * attributes first. If we fail here, the port is as good as down. 2161 * Otherwise, if the link went down by the time the handler gets 2162 * here, give up - we cannot even validate the pkey/gid since those 2163 * are not valid and this is as bad as a port down anyway. 2164 */ 2165 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2166 &port_infop, &psize, &port_infosz); 2167 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2168 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2169 new_link_state = LINK_STATE_DOWN; 2170 goto update_link_state; 2171 } 2172 2173 /* 2174 * If, in the previous attempt, the pkey was not found either due to 2175 * the port state being down, or due to its absence in the pkey table, 2176 * look for it now and try to start the interface. 2177 */ 2178 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) { 2179 mutex_exit(&state->id_link_mutex); 2180 if ((ret = ibd_start(state)) != 0) { 2181 DPRINT(10, "ibd_linkmod: cannot start from late HCA " 2182 "init, ret=%d", ret); 2183 } 2184 ibt_free_portinfo(port_infop, port_infosz); 2185 goto link_mod_return; 2186 } 2187 2188 /* 2189 * Check the SM InitTypeReply flags. If both NoLoadReply and 2190 * PreserveContentReply are 0, we don't know anything about the 2191 * data loaded into the port attributes, so we need to verify 2192 * if gid0 and pkey are still valid.
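 * Road map for the checks below: a changed GID0 subnet prefix marks the link down; the pkey still sitting at the old index leaves the link up; the pkey found at a new index triggers the restart path; and a pkey missing from the table altogether marks the link down.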
2193 */ 2194 itreply = port_infop->p_init_type_reply; 2195 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2196 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2197 /* 2198 * Check to see if the subnet part of GID0 has changed. If 2199 * not, check the simple case first to see if the pkey 2200 * index is the same as before; finally check to see if the 2201 * pkey has been relocated to a different index in the table. 2202 */ 2203 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2204 if (bcmp(port_infop->p_sgid_tbl, 2205 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2206 2207 new_link_state = LINK_STATE_DOWN; 2208 2209 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2210 state->id_pkey) { 2211 2212 new_link_state = LINK_STATE_UP; 2213 2214 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2215 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2216 2217 ibt_free_portinfo(port_infop, port_infosz); 2218 mutex_exit(&state->id_link_mutex); 2219 2220 /* 2221 * Currently a restart is required if our pkey has moved 2222 * in the pkey table. If we get the ibt_recycle_ud() to 2223 * work as documented (expected), we may be able to 2224 * avoid a complete restart. Note that we've already 2225 * marked both the start and stop 'in-progress' flags, 2226 * so it is ok to go ahead and do this restart. 2227 */ 2228 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2229 if ((ret = ibd_start(state)) != 0) { 2230 DPRINT(10, "ibd_restart: cannot restart, " 2231 "ret=%d", ret); 2232 } 2233 2234 goto link_mod_return; 2235 } else { 2236 new_link_state = LINK_STATE_DOWN; 2237 } 2238 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2239 } 2240 2241 update_link_state: 2242 if (port_infop) { 2243 ibt_free_portinfo(port_infop, port_infosz); 2244 } 2245 2246 /* 2247 * If we're reporting a link up, check InitTypeReply to see if 2248 * the SM has ensured that the port's presence in mcg, traps, 2249 * etc. is intact. 2250 */ 2251 if (new_link_state == LINK_STATE_DOWN) { 2252 opcode = IBD_LINK_DOWN; 2253 } else { 2254 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2255 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2256 opcode = IBD_LINK_UP; 2257 } else { 2258 opcode = IBD_LINK_UP_ABSENT; 2259 } 2260 } 2261 2262 /* 2263 * If the old state is the same as the new state, and the SM indicated 2264 * no change in the port parameters, nothing to do. 2265 */ 2266 if ((state->id_link_state == new_link_state) && (opcode != 2267 IBD_LINK_UP_ABSENT)) { 2268 mutex_exit(&state->id_link_mutex); 2269 goto link_mod_return; 2270 } 2271 2272 /* 2273 * Ok, so there was a link state change; see if it's safe to ask 2274 * the async thread to do the work 2275 */ 2276 if (!ibd_async_safe(state)) { 2277 state->id_link_state = new_link_state; 2278 mutex_exit(&state->id_link_mutex); 2279 goto link_mod_return; 2280 } 2281 2282 mutex_exit(&state->id_link_mutex); 2283 2284 /* 2285 * Queue up a request for ibd_async_link() to handle this link 2286 * state change event 2287 */ 2288 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2289 req->rq_ptr = (void *)opcode; 2290 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2291 2292 link_mod_return: 2293 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2294 } 2295 2296 /* 2297 * For the port up/down events, IBTL guarantees there will not be concurrent 2298 * invocations of the handler. 
IBTL might coalesce link transition events, 2299 * and not invoke the handler for _each_ up/down transition, but it will 2300 * invoke the handler with the last known state. 2301 */ 2302 static void 2303 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2304 ibt_async_code_t code, ibt_async_event_t *event) 2305 { 2306 ibd_state_t *state = (ibd_state_t *)clnt_private; 2307 2308 switch (code) { 2309 case IBT_ERROR_CATASTROPHIC_CHAN: 2310 ibd_print_warn(state, "catastrophic channel error"); 2311 break; 2312 case IBT_ERROR_CQ: 2313 ibd_print_warn(state, "completion queue error"); 2314 break; 2315 case IBT_PORT_CHANGE_EVENT: 2316 /* 2317 * Events will be delivered to all instances that have 2318 * done ibt_open_hca() but not yet done ibt_close_hca(). 2319 * Only need to do work for our port; IBTF will deliver 2320 * events for other ports on the hca we have ibt_open_hca'ed 2321 * too. Note that id_port is initialized in ibd_attach() 2322 * before ibt_open_hca() is called there. 2323 */ 2324 ASSERT(state->id_hca_hdl == hca_hdl); 2325 if (state->id_port != event->ev_port) 2326 break; 2327 2328 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2329 IBT_PORT_CHANGE_PKEY) { 2330 ibd_link_mod(state, code); 2331 } 2332 break; 2333 case IBT_ERROR_PORT_DOWN: 2334 case IBT_CLNT_REREG_EVENT: 2335 case IBT_EVENT_PORT_UP: 2336 /* 2337 * Events will be delivered to all instances that have 2338 * done ibt_open_hca() but not yet done ibt_close_hca(). 2339 * Only need to do work for our port; IBTF will deliver 2340 * events for other ports on the hca we have ibt_open_hca'ed 2341 * too. Note that id_port is initialized in ibd_attach() 2342 * before ibt_open_hca() is called there. 2343 */ 2344 ASSERT(state->id_hca_hdl == hca_hdl); 2345 if (state->id_port != event->ev_port) 2346 break; 2347 2348 ibd_link_mod(state, code); 2349 break; 2350 2351 case IBT_HCA_ATTACH_EVENT: 2352 case IBT_HCA_DETACH_EVENT: 2353 /* 2354 * When a new card is plugged into the system, attach_event is 2355 * invoked. Additionally, a cfgadm needs to be run to make the 2356 * card known to the system, and an ifconfig needs to be run to 2357 * plumb up any ibd interfaces on the card. In the case of card 2358 * unplug, a cfgadm is run that will trigger any RCM scripts to 2359 * unplumb the ibd interfaces on the card; when the card is 2360 * actually unplugged, the detach_event is invoked; 2361 * additionally, if any ibd instances are still active on the 2362 * card (e.g. there were no associated RCM scripts), the driver's 2363 * detach routine is invoked. 2364 */ 2365 break; 2366 default: 2367 break; 2368 } 2369 } 2370 2371 static int 2372 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2373 { 2374 mac_register_t *macp; 2375 int ret; 2376 2377 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2378 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2379 return (DDI_FAILURE); 2380 } 2381 2382 /* 2383 * Note that when we register with mac during attach, we don't 2384 * have the id_macaddr yet, so we'll simply be registering a 2385 * zero macaddr that we'll overwrite later during plumb (in 2386 * ibd_m_start()). Similar is the case with id_mtu - we'll 2387 * update the mac layer with the correct mtu during plumb.
2388 */ 2389 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2390 macp->m_driver = state; 2391 macp->m_dip = dip; 2392 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2393 macp->m_callbacks = &ibd_m_callbacks; 2394 macp->m_min_sdu = 0; 2395 macp->m_multicast_sdu = IBD_DEF_MAX_SDU; 2396 if (state->id_type == IBD_PORT_DRIVER) { 2397 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 2398 } else if (state->id_enable_rc) { 2399 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2400 } else { 2401 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2402 } 2403 macp->m_priv_props = ibd_priv_props; 2404 2405 /* 2406 * Register ourselves with the GLDv3 interface 2407 */ 2408 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2409 mac_free(macp); 2410 DPRINT(10, 2411 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2412 return (DDI_FAILURE); 2413 } 2414 2415 mac_free(macp); 2416 return (DDI_SUCCESS); 2417 } 2418 2419 static int 2420 ibd_record_capab(ibd_state_t *state) 2421 { 2422 ibt_hca_attr_t hca_attrs; 2423 ibt_status_t ibt_status; 2424 2425 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2426 2427 /* 2428 * Query the HCA and fetch its attributes 2429 */ 2430 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2431 ASSERT(ibt_status == IBT_SUCCESS); 2432 2433 /* 2434 * 1. Set the Hardware Checksum capability. Currently we only consider 2435 * full checksum offload. 2436 */ 2437 if (state->id_enable_rc) { 2438 state->id_hwcksum_capab = 0; 2439 } else { 2440 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2441 == IBT_HCA_CKSUM_FULL) { 2442 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2443 } 2444 } 2445 2446 /* 2447 * 2. Set LSO policy, capability and maximum length 2448 */ 2449 if (state->id_enable_rc) { 2450 state->id_lso_capable = B_FALSE; 2451 state->id_lso_maxlen = 0; 2452 } else { 2453 if (hca_attrs.hca_max_lso_size > 0) { 2454 state->id_lso_capable = B_TRUE; 2455 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2456 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2457 else 2458 state->id_lso_maxlen = 2459 hca_attrs.hca_max_lso_size; 2460 } else { 2461 state->id_lso_capable = B_FALSE; 2462 state->id_lso_maxlen = 0; 2463 } 2464 } 2465 2466 /* 2467 * 3. Set Reserved L_Key capability 2468 */ 2469 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2470 state->id_hca_res_lkey_capab = 1; 2471 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2472 state->rc_enable_iov_map = B_TRUE; 2473 } else { 2474 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2475 state->rc_enable_iov_map = B_FALSE; 2476 } 2477 2478 /* 2479 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2480 * size information is provided by the hca 2481 */ 2482 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2483 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2484 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2485 } else { 2486 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2487 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2488 } 2489 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2490 state->id_max_sqseg = IBD_MAX_SQSEG; 2491 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2492 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2493 state->id_max_sqseg, IBD_MAX_SQSEG); 2494 } 2495 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2496 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2497 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2498 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2499 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2500 } 2501 2502 /* 2503 * Translating the virtual address regions into physical regions 2504 * for using the Reserved LKey feature results in a wr sgl that 2505 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2506 * we'll fix a high-water mark (65%) for when we should stop. 2507 */ 2508 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2509 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2510 2511 /* 2512 * 5. Set number of recv and send wqes after checking hca maximum 2513 * channel size. Store the max channel size in the state so that it 2514 * can be referred to when the swqe/rwqe change is requested via 2515 * dladm. 2516 */ 2517 2518 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz; 2519 2520 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe) 2521 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz; 2522 2523 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe - 2524 IBD_RWQE_MIN; 2525 2526 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe) 2527 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz; 2528 2529 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2530 2531 return (DDI_SUCCESS); 2532 } 2533 2534 static int 2535 ibd_part_busy(ibd_state_t *state) 2536 { 2537 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2538 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n"); 2539 return (DDI_FAILURE); 2540 } 2541 2542 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2543 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n"); 2544 return (DDI_FAILURE); 2545 } 2546 2547 /* 2548 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is 2549 * connecting to a remote IPoIB port. We can't remove this port. 
2550 */ 2551 if (state->id_ah_op == IBD_OP_ONGOING) { 2552 DPRINT(10, "ibd_part_busy: failed: connecting\n"); 2553 return (DDI_FAILURE); 2554 } 2555 2556 return (DDI_SUCCESS); 2557 } 2558 2559 2560 static void 2561 ibd_part_unattach(ibd_state_t *state) 2562 { 2563 uint32_t progress = state->id_mac_state; 2564 ibt_status_t ret; 2565 2566 /* make sure rx resources are freed */ 2567 ibd_free_rx_rsrcs(state); 2568 2569 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 2570 ASSERT(state->id_enable_rc); 2571 ibd_rc_fini_srq_list(state); 2572 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 2573 } 2574 2575 if (progress & IBD_DRV_MAC_REGISTERED) { 2576 (void) mac_unregister(state->id_mh); 2577 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2578 } 2579 2580 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 2581 /* 2582 * No new async requests will be posted since the device 2583 * link state has been marked as unknown; completion handlers 2584 * have been turned off, so the Tx handler will not cause any 2585 * more IBD_ASYNC_REAP requests. 2586 * 2587 * Queue a request for the async thread to exit, which will 2588 * be serviced after any pending ones. This can take a while, 2589 * especially if the SM is unreachable, since IBMF will slowly 2590 * time out each SM request issued by the async thread. Reap 2591 * the thread before continuing on; we do not want it 2592 * lingering in modunloaded code. 2593 */ 2594 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 2595 thread_join(state->id_async_thrid); 2596 2597 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 2598 } 2599 2600 if (progress & IBD_DRV_REQ_LIST_INITED) { 2601 list_destroy(&state->id_req_list); 2602 mutex_destroy(&state->id_acache_req_lock); 2603 cv_destroy(&state->id_acache_req_cv); 2604 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED; 2605 } 2606 2607 if (progress & IBD_DRV_PD_ALLOCD) { 2608 if ((ret = ibt_free_pd(state->id_hca_hdl, 2609 state->id_pd_hdl)) != IBT_SUCCESS) { 2610 ibd_print_warn(state, "failed to free " 2611 "protection domain, ret=%d", ret); 2612 } 2613 state->id_pd_hdl = NULL; 2614 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2615 } 2616 2617 if (progress & IBD_DRV_HCA_OPENED) { 2618 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2619 IBT_SUCCESS) { 2620 ibd_print_warn(state, "failed to close " 2621 "HCA device, ret=%d", ret); 2622 } 2623 state->id_hca_hdl = NULL; 2624 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2625 } 2626 2627 mutex_enter(&ibd_gstate.ig_mutex); 2628 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2629 if ((ret = ibt_detach(state->id_ibt_hdl)) != 2630 IBT_SUCCESS) { 2631 ibd_print_warn(state, 2632 "ibt_detach() failed, ret=%d", ret); 2633 } 2634 state->id_ibt_hdl = NULL; 2635 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2636 ibd_gstate.ig_ibt_hdl_ref_cnt--; 2637 } 2638 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && 2639 (ibd_gstate.ig_ibt_hdl != NULL)) { 2640 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != 2641 IBT_SUCCESS) { 2642 ibd_print_warn(state, "ibt_detach(): global " 2643 "failed, ret=%d", ret); 2644 } 2645 ibd_gstate.ig_ibt_hdl = NULL; 2646 } 2647 mutex_exit(&ibd_gstate.ig_mutex); 2648 2649 if (progress & IBD_DRV_TXINTR_ADDED) { 2650 ddi_remove_softintr(state->id_tx); 2651 state->id_tx = NULL; 2652 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2653 } 2654 2655 if (progress & IBD_DRV_RXINTR_ADDED) { 2656 ddi_remove_softintr(state->id_rx); 2657 state->id_rx = NULL; 2658 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2659 } 2660 2661 #ifdef DEBUG 2662 if (progress &
IBD_DRV_RC_PRIVATE_STATE) { 2663 kstat_delete(state->rc_ksp); 2664 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2665 } 2666 #endif 2667 2668 if (progress & IBD_DRV_STATE_INITIALIZED) { 2669 ibd_state_fini(state); 2670 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2671 } 2672 } 2673 2674 int 2675 ibd_part_attach(ibd_state_t *state, dev_info_t *dip) 2676 { 2677 ibt_status_t ret; 2678 int rv; 2679 kthread_t *kht; 2680 2681 /* 2682 * Initialize mutexes and condition variables 2683 */ 2684 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2685 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()"); 2686 return (DDI_FAILURE); 2687 } 2688 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2689 2690 /* 2691 * Allocate rx,tx softintr 2692 */ 2693 if (ibd_rx_softintr == 1) { 2694 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2695 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2696 DPRINT(10, "ibd_part_attach: failed in " 2697 "ddi_add_softintr(id_rx), ret=%d", rv); 2698 return (DDI_FAILURE); 2699 } 2700 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2701 } 2702 if (ibd_tx_softintr == 1) { 2703 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2704 NULL, NULL, ibd_tx_recycle, 2705 (caddr_t)state)) != DDI_SUCCESS) { 2706 DPRINT(10, "ibd_part_attach: failed in " 2707 "ddi_add_softintr(id_tx), ret=%d", rv); 2708 return (DDI_FAILURE); 2709 } 2710 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2711 } 2712 2713 /* 2714 * Attach to IBTL 2715 */ 2716 mutex_enter(&ibd_gstate.ig_mutex); 2717 if (ibd_gstate.ig_ibt_hdl == NULL) { 2718 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2719 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { 2720 DPRINT(10, "ibd_part_attach: global: failed in " 2721 "ibt_attach(), ret=%d", ret); 2722 mutex_exit(&ibd_gstate.ig_mutex); 2723 return (DDI_FAILURE); 2724 } 2725 } 2726 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2727 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2728 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d", 2729 ret); 2730 mutex_exit(&ibd_gstate.ig_mutex); 2731 return (DDI_FAILURE); 2732 } 2733 ibd_gstate.ig_ibt_hdl_ref_cnt++; 2734 mutex_exit(&ibd_gstate.ig_mutex); 2735 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2736 2737 /* 2738 * Open the HCA 2739 */ 2740 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 2741 &state->id_hca_hdl)) != IBT_SUCCESS) { 2742 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d", 2743 ret); 2744 return (DDI_FAILURE); 2745 } 2746 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2747 2748 #ifdef DEBUG 2749 /* Initialize Driver Counters for Reliable Connected Mode */ 2750 if (state->id_enable_rc) { 2751 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2752 DPRINT(10, "ibd_part_attach: failed in " 2753 "ibd_rc_init_stats"); 2754 return (DDI_FAILURE); 2755 } 2756 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2757 } 2758 #endif 2759 2760 /* 2761 * Record capabilities 2762 */ 2763 (void) ibd_record_capab(state); 2764 2765 /* 2766 * Allocate a protection domain on the HCA 2767 */ 2768 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2769 &state->id_pd_hdl)) != IBT_SUCCESS) { 2770 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d", 2771 ret); 2772 return (DDI_FAILURE); 2773 } 2774 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2775 2776 2777 /* 2778 * We need to initialise the req_list that is required for the 2779 * operation of the async_thread. 
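 * The matching teardown is the IBD_DRV_REQ_LIST_INITED block in ibd_part_unattach(), which destroys the list, mutex and cv after the async thread has been reaped.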
2780 */ 2781 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 2782 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 2783 list_create(&state->id_req_list, sizeof (ibd_req_t), 2784 offsetof(ibd_req_t, rq_list)); 2785 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED; 2786 2787 /* 2788 * Create the async thread; thread_create never fails. 2789 */ 2790 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 2791 TS_RUN, minclsyspri); 2792 state->id_async_thrid = kht->t_did; 2793 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 2794 2795 return (DDI_SUCCESS); 2796 } 2797 2798 /* 2799 * Attach device to the IO framework. 2800 */ 2801 static int 2802 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2803 { 2804 int ret; 2805 2806 switch (cmd) { 2807 case DDI_ATTACH: 2808 ret = ibd_port_attach(dip); 2809 break; 2810 default: 2811 ret = DDI_FAILURE; 2812 break; 2813 } 2814 return (ret); 2815 } 2816 2817 /* 2818 * Detach device from the IO framework. 2819 */ 2820 static int 2821 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2822 { 2823 ibd_state_t *state; 2824 int instance; 2825 2826 /* 2827 * IBD doesn't support suspend/resume 2828 */ 2829 if (cmd != DDI_DETACH) 2830 return (DDI_FAILURE); 2831 2832 /* 2833 * Get the instance softstate 2834 */ 2835 instance = ddi_get_instance(dip); 2836 state = ddi_get_soft_state(ibd_list, instance); 2837 2838 /* 2839 * Release all resources we're holding still. Note that if we'd 2840 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2841 * so far, we should find all the flags we need in id_mac_state. 2842 */ 2843 return (ibd_port_unattach(state, dip)); 2844 } 2845 2846 /* 2847 * Pre ibt_attach() driver initialization 2848 */ 2849 static int 2850 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2851 { 2852 char buf[64]; 2853 2854 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2855 state->id_link_state = LINK_STATE_UNKNOWN; 2856 2857 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2858 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2859 state->id_trap_stop = B_TRUE; 2860 state->id_trap_inprog = 0; 2861 2862 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2863 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2864 state->id_dip = dip; 2865 2866 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2867 2868 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2869 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2870 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2871 state->id_tx_busy = 0; 2872 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2873 2874 state->id_rx_list.dl_bufs_outstanding = 0; 2875 state->id_rx_list.dl_cnt = 0; 2876 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2877 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2878 (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip), 2879 state->id_pkey, state->id_plinkid); 2880 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2881 0, NULL, NULL, NULL, NULL, NULL, 0); 2882 2883 /* For Reliable Connected Mode */ 2884 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2885 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2886 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2887 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2888 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2889 
MUTEX_DRIVER, NULL); 2890 mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL); 2891 2892 /* 2893 * Make the default link mode RC. If this fails during connection 2894 * setup, the link mode is automatically transitioned to UD. 2895 * Also set the RC MTU. 2896 */ 2897 state->id_enable_rc = IBD_DEF_LINK_MODE; 2898 state->rc_mtu = IBD_DEF_RC_MAX_MTU; 2899 state->id_mtu = IBD_DEF_MAX_MTU; 2900 2901 /* Initialize all tunables to their defaults */ 2902 state->id_lso_policy = IBD_DEF_LSO_POLICY; 2903 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS; 2904 state->id_num_ah = IBD_DEF_NUM_AH; 2905 state->id_hash_size = IBD_DEF_HASH_SIZE; 2906 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP; 2907 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS; 2908 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT; 2909 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC; 2910 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT; 2911 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC; 2912 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT; 2913 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC; 2914 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT; 2915 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC; 2916 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH; 2917 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH; 2918 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH; 2919 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE; 2920 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE; 2921 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE; 2922 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE; 2923 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ; 2924 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ; 2925 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH; 2926 2927 return (DDI_SUCCESS); 2928 } 2929 2930 /* 2931 * Post ibt_detach() driver deconstruction 2932 */ 2933 static void 2934 ibd_state_fini(ibd_state_t *state) 2935 { 2936 kmem_cache_destroy(state->id_req_kmc); 2937 2938 mutex_destroy(&state->id_rx_list.dl_mutex); 2939 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2940 2941 mutex_destroy(&state->id_txpost_lock); 2942 mutex_destroy(&state->id_tx_list.dl_mutex); 2943 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2944 mutex_destroy(&state->id_lso_lock); 2945 2946 mutex_destroy(&state->id_sched_lock); 2947 mutex_destroy(&state->id_scq_poll_lock); 2948 mutex_destroy(&state->id_rcq_poll_lock); 2949 2950 cv_destroy(&state->id_trap_cv); 2951 mutex_destroy(&state->id_trap_lock); 2952 mutex_destroy(&state->id_link_mutex); 2953 2954 /* For Reliable Connected Mode */ 2955 mutex_destroy(&state->rc_timeout_lock); 2956 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2957 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2958 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2959 mutex_destroy(&state->rc_tx_large_bufs_lock); 2960 mutex_destroy(&state->rc_rx_lock); 2961 } 2962 2963 /* 2964 * Fetch link speed from SA for snmp ifspeed reporting. 2965 */ 2966 static uint64_t 2967 ibd_get_portspeed(ibd_state_t *state) 2968 { 2969 int ret; 2970 ibt_path_info_t path; 2971 ibt_path_attr_t path_attr; 2972 uint8_t num_paths; 2973 uint64_t ifspeed; 2974 2975 /* 2976 * Due to serdes 8b/10b encoding on the wire, 2.5 Gbps on the wire 2977 * translates to a 2 Gbps data rate. Thus, 1X single data rate is 2978 * 2000000000. Start with that as default. 
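 * Worked example of the same accounting: a 4X DDR link signals at 20 Gbps on the wire (the IBT_SRATE_20 case below), i.e. 16 Gbps of data after 8b/10b, which is why that case scales the 1X base by 8 for an ifspeed of 16000000000.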
2979 */ 2980 ifspeed = 2000000000; 2981 2982 bzero(&path_attr, sizeof (path_attr)); 2983 2984 /* 2985 * Get the port speed from Loopback path information. 2986 */ 2987 path_attr.pa_dgids = &state->id_sgid; 2988 path_attr.pa_num_dgids = 1; 2989 path_attr.pa_sgid = state->id_sgid; 2990 2991 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2992 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2993 goto earlydone; 2994 2995 if (num_paths < 1) 2996 goto earlydone; 2997 2998 /* 2999 * In case SA does not return an expected value, report the default 3000 * speed as 1X. 3001 */ 3002 ret = 1; 3003 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 3004 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 3005 ret = 1; 3006 break; 3007 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 3008 ret = 4; 3009 break; 3010 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 3011 ret = 12; 3012 break; 3013 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 3014 ret = 2; 3015 break; 3016 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 3017 ret = 8; 3018 break; 3019 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 3020 ret = 16; 3021 break; 3022 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 3023 ret = 24; 3024 break; 3025 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 3026 ret = 32; 3027 break; 3028 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 3029 ret = 48; 3030 break; 3031 } 3032 3033 ifspeed *= ret; 3034 3035 earlydone: 3036 return (ifspeed); 3037 } 3038 3039 /* 3040 * Search input mcg list (id_mc_full or id_mc_non) for an entry 3041 * representing the input mcg mgid. 3042 */ 3043 static ibd_mce_t * 3044 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 3045 { 3046 ibd_mce_t *ptr = list_head(mlist); 3047 3048 /* 3049 * Do plain linear search. 3050 */ 3051 while (ptr != NULL) { 3052 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 3053 sizeof (ib_gid_t)) == 0) 3054 return (ptr); 3055 ptr = list_next(mlist, ptr); 3056 } 3057 return (NULL); 3058 } 3059 3060 /* 3061 * Execute IBA JOIN. 3062 */ 3063 static ibt_status_t 3064 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 3065 { 3066 ibt_mcg_attr_t mcg_attr; 3067 3068 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3069 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 3070 mcg_attr.mc_mgid = mgid; 3071 mcg_attr.mc_join_state = mce->mc_jstate; 3072 mcg_attr.mc_scope = state->id_scope; 3073 mcg_attr.mc_pkey = state->id_pkey; 3074 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 3075 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 3076 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 3077 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 3078 NULL, NULL)); 3079 } 3080 3081 /* 3082 * This code JOINs the port in the proper way (depending on the join 3083 * state) so that IBA fabric will forward mcg packets to/from the port. 3084 * It also attaches the QPN to the mcg so it can receive those mcg 3085 * packets. This code makes sure not to attach the mcg to the QP if 3086 * that has been previously done due to the mcg being joined with a 3087 * different join state, even though this is not required by SWG_0216, 3088 * refid 3610. 
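 * Concretely, the do_attach logic below skips the IBA attach when a NON join finds an existing FULL mce, when a FULL join finds an existing NON mce, and always for SEND_ONLY_NON joins.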
3089 */ 3090 static ibd_mce_t * 3091 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3092 { 3093 ibt_status_t ibt_status; 3094 ibd_mce_t *mce, *tmce, *omce = NULL; 3095 boolean_t do_attach = B_TRUE; 3096 3097 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 3098 jstate, mgid.gid_prefix, mgid.gid_guid); 3099 3100 /* 3101 * For enable_multicast Full member joins, we need to do some 3102 * extra work. If there is already an mce on the list that 3103 * indicates full membership, that means the membership has 3104 * not yet been dropped (since the disable_multicast was issued) 3105 * because there are pending Tx's to the mcg; in that case, just 3106 * mark the mce not to be reaped when the Tx completion queues 3107 * an async reap operation. 3108 * 3109 * If there is already an mce on the list indicating sendonly 3110 * membership, try to promote to full membership. Be careful 3111 * not to deallocate the old mce, since there might be an AH 3112 * pointing to it; instead, update the old mce with new data 3113 * that tracks the full membership. 3114 */ 3115 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 3116 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 3117 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 3118 ASSERT(omce->mc_fullreap); 3119 omce->mc_fullreap = B_FALSE; 3120 return (omce); 3121 } else { 3122 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3123 } 3124 } 3125 3126 /* 3127 * Allocate the ibd_mce_t to track this JOIN. 3128 */ 3129 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 3130 mce->mc_fullreap = B_FALSE; 3131 mce->mc_jstate = jstate; 3132 3133 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 3134 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 3135 ibt_status); 3136 kmem_free(mce, sizeof (ibd_mce_t)); 3137 return (NULL); 3138 } 3139 3140 /* 3141 * Is an IBA attach required? Not if the interface is already joined 3142 * to the mcg in a different appropriate join state. 3143 */ 3144 if (jstate == IB_MC_JSTATE_NON) { 3145 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3146 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3147 do_attach = B_FALSE; 3148 } else if (jstate == IB_MC_JSTATE_FULL) { 3149 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3150 do_attach = B_FALSE; 3151 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3152 do_attach = B_FALSE; 3153 } 3154 3155 if (do_attach) { 3156 /* 3157 * Do the IBA attach. 3158 */ 3159 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 3160 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 3161 &mce->mc_info)) != IBT_SUCCESS) { 3162 DPRINT(10, "ibd_join_group : failed qp attachment " 3163 "%d\n", ibt_status); 3164 /* 3165 * NOTE that we should probably preserve the join info 3166 * in the list and later try to leave again at detach 3167 * time. 3168 */ 3169 (void) ibt_leave_mcg(state->id_sgid, mgid, 3170 state->id_sgid, jstate); 3171 kmem_free(mce, sizeof (ibd_mce_t)); 3172 return (NULL); 3173 } 3174 } 3175 3176 /* 3177 * Insert the ibd_mce_t in the proper list. 3178 */ 3179 if (jstate == IB_MC_JSTATE_NON) { 3180 IBD_MCACHE_INSERT_NON(state, mce); 3181 } else { 3182 /* 3183 * Set up the mc_req fields used for reaping the 3184 * mcg in case of delayed tx completion (see 3185 * ibd_tx_cleanup()). Also done for sendonly join in 3186 * case we are promoted to fullmembership later and 3187 * keep using the same mce. 
3188 */ 3189 mce->mc_req.rq_gid = mgid; 3190 mce->mc_req.rq_ptr = mce; 3191 /* 3192 * Check whether this is the case of trying to join 3193 * full member, and we were already joined send only. 3194 * We try to drop our SendOnly membership, but it is 3195 * possible that the mcg does not exist anymore (and 3196 * the subnet trap never reached us), so the leave 3197 * operation might fail. 3198 */ 3199 if (omce != NULL) { 3200 (void) ibt_leave_mcg(state->id_sgid, mgid, 3201 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 3202 omce->mc_jstate = IB_MC_JSTATE_FULL; 3203 bcopy(&mce->mc_info, &omce->mc_info, 3204 sizeof (ibt_mcg_info_t)); 3205 kmem_free(mce, sizeof (ibd_mce_t)); 3206 return (omce); 3207 } 3208 mutex_enter(&state->id_mc_mutex); 3209 IBD_MCACHE_INSERT_FULL(state, mce); 3210 mutex_exit(&state->id_mc_mutex); 3211 } 3212 3213 return (mce); 3214 } 3215 3216 /* 3217 * Called during port up event handling to attempt to reacquire full 3218 * membership to an mcg. Stripped down version of ibd_join_group(). 3219 * Note that it is possible that the mcg might have gone away, and 3220 * gets recreated at this point. 3221 */ 3222 static void 3223 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 3224 { 3225 ib_gid_t mgid; 3226 3227 /* 3228 * If the mc_fullreap flag is set, or this join fails, a subsequent 3229 * reap/leave is going to try to leave the group. We could prevent 3230 * that by adding a boolean flag into ibd_mce_t, if required. 3231 */ 3232 if (mce->mc_fullreap) 3233 return; 3234 3235 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3236 3237 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 3238 mgid.gid_guid); 3239 3240 /* While reacquiring, leave and then join the MCG */ 3241 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, 3242 mce->mc_jstate); 3243 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 3244 ibd_print_warn(state, "Failure on port up to rejoin " 3245 "multicast gid %016llx:%016llx", 3246 (u_longlong_t)mgid.gid_prefix, 3247 (u_longlong_t)mgid.gid_guid); 3248 } 3249 3250 /* 3251 * This code handles delayed Tx completion cleanups for mcg's to which 3252 * disable_multicast has been issued, regular mcg related cleanups during 3253 * disable_multicast, disable_promiscuous and mcg traps, as well as 3254 * cleanups during driver detach time. Depending on the join state, 3255 * it deletes the mce from the appropriate list and issues the IBA 3256 * leave/detach; except in the disable_multicast case when the mce 3257 * is left on the active list for a subsequent Tx completion cleanup. 3258 */ 3259 static void 3260 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3261 uint8_t jstate) 3262 { 3263 ibd_mce_t *tmce; 3264 boolean_t do_detach = B_TRUE; 3265 3266 /* 3267 * Before detaching, we must check whether the other list 3268 * contains the mcg; if we detach blindly, the consumer 3269 * who set up the other list will also stop receiving 3270 * traffic. 3271 */ 3272 if (jstate == IB_MC_JSTATE_FULL) { 3273 /* 3274 * The following check is only relevant while coming 3275 * from the Tx completion path in the reap case. 
3276 */ 3277 if (!mce->mc_fullreap) 3278 return; 3279 mutex_enter(&state->id_mc_mutex); 3280 IBD_MCACHE_PULLOUT_FULL(state, mce); 3281 mutex_exit(&state->id_mc_mutex); 3282 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3283 do_detach = B_FALSE; 3284 } else if (jstate == IB_MC_JSTATE_NON) { 3285 IBD_MCACHE_PULLOUT_NON(state, mce); 3286 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3287 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3288 do_detach = B_FALSE; 3289 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3290 mutex_enter(&state->id_mc_mutex); 3291 IBD_MCACHE_PULLOUT_FULL(state, mce); 3292 mutex_exit(&state->id_mc_mutex); 3293 do_detach = B_FALSE; 3294 } 3295 3296 /* 3297 * If we are reacting to a mcg trap and leaving our sendonly or 3298 * non membership, the mcg is possibly already gone, so attempting 3299 * to leave might fail. On the other hand, we must try to leave 3300 * anyway, since this might be a trap from long ago, and we could 3301 * have potentially sendonly joined to a recent incarnation of 3302 * the mcg and are about to lose track of this information. 3303 */ 3304 if (do_detach) { 3305 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3306 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3307 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3308 } 3309 3310 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3311 kmem_free(mce, sizeof (ibd_mce_t)); 3312 } 3313 3314 /* 3315 * Async code executed due to multicast and promiscuous disable requests 3316 * and mcg trap handling; also executed during driver detach. Mostly, a 3317 * leave and detach is done; except for the fullmember case when Tx 3318 * requests are pending, whence arrangements are made for subsequent 3319 * cleanup on Tx completion. 3320 */ 3321 static void 3322 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3323 { 3324 ipoib_mac_t mcmac; 3325 boolean_t recycled; 3326 ibd_mce_t *mce; 3327 3328 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3329 jstate, mgid.gid_prefix, mgid.gid_guid); 3330 3331 if (jstate == IB_MC_JSTATE_NON) { 3332 recycled = B_TRUE; 3333 mce = IBD_MCACHE_FIND_NON(state, mgid); 3334 /* 3335 * In case we are handling a mcg trap, we might not find 3336 * the mcg in the non list. 3337 */ 3338 if (mce == NULL) { 3339 return; 3340 } 3341 } else { 3342 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3343 3344 /* 3345 * In case we are handling a mcg trap, make sure the trap 3346 * is not arriving late; if we have an mce that indicates 3347 * that we are already a fullmember, that would be a clear 3348 * indication that the trap arrived late (i.e., is for a 3349 * previous incarnation of the mcg). 3350 */ 3351 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3352 if ((mce == NULL) || (mce->mc_jstate == 3353 IB_MC_JSTATE_FULL)) { 3354 return; 3355 } 3356 } else { 3357 ASSERT(jstate == IB_MC_JSTATE_FULL); 3358 3359 /* 3360 * If join group failed, mce will be NULL here. 3361 * This is because in the GLDv3 driver, set multicast 3362 * always returns success. 3363 */ 3364 if (mce == NULL) { 3365 return; 3366 } 3367 3368 mce->mc_fullreap = B_TRUE; 3369 } 3370 3371 /* 3372 * If no pending Tx's remain that reference the AH 3373 * for the mcg, recycle it from active to free list.
3374 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3375 * so the last completing Tx will cause an async reap 3376 * operation to be invoked, at which time we will drop our 3377 * membership to the mcg so that the pending Tx's complete 3378 * successfully. Refer to comments on "AH and MCE active 3379 * list manipulation" at top of this file. The lock protects 3380 * against Tx fast path and Tx cleanup code. 3381 */ 3382 mutex_enter(&state->id_ac_mutex); 3383 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3384 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3385 IB_MC_JSTATE_SEND_ONLY_NON)); 3386 mutex_exit(&state->id_ac_mutex); 3387 } 3388 3389 if (recycled) { 3390 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3391 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3392 ibd_async_reap_group(state, mce, mgid, jstate); 3393 } 3394 } 3395 3396 /* 3397 * Find the broadcast address as defined by IPoIB; implicitly 3398 * determines the IBA scope, mtu, tclass etc of the link the 3399 * interface is going to be a member of. 3400 */ 3401 static ibt_status_t 3402 ibd_find_bgroup(ibd_state_t *state) 3403 { 3404 ibt_mcg_attr_t mcg_attr; 3405 uint_t numg; 3406 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3407 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3408 IB_MC_SCOPE_GLOBAL }; 3409 int i, mcgmtu; 3410 boolean_t found = B_FALSE; 3411 int ret; 3412 ibt_mcg_info_t mcg_info; 3413 3414 state->id_bgroup_created = B_FALSE; 3415 state->id_bgroup_present = B_FALSE; 3416 3417 query_bcast_grp: 3418 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3419 mcg_attr.mc_pkey = state->id_pkey; 3420 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3421 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3422 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3423 3424 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3425 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3426 3427 /* 3428 * Look for the IPoIB broadcast group. 3429 */ 3430 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3431 state->id_mgid.gid_prefix = 3432 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3433 ((uint64_t)state->id_scope << 48) | 3434 ((uint32_t)(state->id_pkey << 16))); 3435 mcg_attr.mc_mgid = state->id_mgid; 3436 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3437 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3438 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3439 found = B_TRUE; 3440 break; 3441 } 3442 } 3443 3444 if (!found) { 3445 if (state->id_create_broadcast_group) { 3446 /* 3447 * If we created the broadcast group, but failed to 3448 * find it, we can't do anything except leave the 3449 * one we created and return failure. 3450 */ 3451 if (state->id_bgroup_created) { 3452 ibd_print_warn(state, "IPoIB broadcast group " 3453 "absent. 
Unable to query after create."); 3454 goto find_bgroup_fail; 3455 } 3456 3457 /* 3458 * Create the ipoib broadcast group if it didn't exist 3459 */ 3460 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3461 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3462 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3463 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3464 mcg_attr.mc_pkey = state->id_pkey; 3465 mcg_attr.mc_flow = 0; 3466 mcg_attr.mc_sl = 0; 3467 mcg_attr.mc_tclass = 0; 3468 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3469 state->id_mgid.gid_prefix = 3470 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3471 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3472 ((uint32_t)(state->id_pkey << 16))); 3473 mcg_attr.mc_mgid = state->id_mgid; 3474 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3475 3476 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3477 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3478 ibd_print_warn(state, "IPoIB broadcast group " 3479 "absent, create failed: ret = %d\n", ret); 3480 state->id_bgroup_created = B_FALSE; 3481 return (IBT_FAILURE); 3482 } 3483 state->id_bgroup_created = B_TRUE; 3484 goto query_bcast_grp; 3485 } else { 3486 ibd_print_warn(state, "IPoIB broadcast group absent"); 3487 return (IBT_FAILURE); 3488 } 3489 } 3490 3491 /* 3492 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3493 */ 3494 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3495 if (state->id_mtu < mcgmtu) { 3496 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3497 "greater than port's maximum MTU %d", mcgmtu, 3498 state->id_mtu); 3499 ibt_free_mcg_info(state->id_mcinfo, 1); 3500 goto find_bgroup_fail; 3501 } 3502 state->id_mtu = mcgmtu; 3503 state->id_bgroup_present = B_TRUE; 3504 3505 return (IBT_SUCCESS); 3506 3507 find_bgroup_fail: 3508 if (state->id_bgroup_created) { 3509 (void) ibt_leave_mcg(state->id_sgid, 3510 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3511 IB_MC_JSTATE_FULL); 3512 } 3513 3514 return (IBT_FAILURE); 3515 } 3516 3517 static int 3518 ibd_alloc_tx_copybufs(ibd_state_t *state) 3519 { 3520 ibt_mr_attr_t mem_attr; 3521 3522 /* 3523 * Allocate one big chunk for all regular tx copy bufs 3524 */ 3525 state->id_tx_buf_sz = state->id_mtu; 3526 if (state->id_lso_policy && state->id_lso_capable && 3527 (state->id_ud_tx_copy_thresh > state->id_mtu)) { 3528 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh; 3529 } 3530 3531 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe * 3532 state->id_tx_buf_sz, KM_SLEEP); 3533 3534 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe * 3535 sizeof (ibd_swqe_t), KM_SLEEP); 3536 3537 /* 3538 * Do one memory registration on the entire txbuf area 3539 */ 3540 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3541 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz; 3542 mem_attr.mr_as = NULL; 3543 mem_attr.mr_flags = IBT_MR_SLEEP; 3544 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3545 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3546 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3547 kmem_free(state->id_tx_wqes, 3548 state->id_ud_num_swqe * sizeof (ibd_swqe_t)); 3549 kmem_free(state->id_tx_bufs, 3550 state->id_ud_num_swqe * state->id_tx_buf_sz); 3551 state->id_tx_bufs = NULL; 3552 return (DDI_FAILURE); 3553 } 3554 3555 return (DDI_SUCCESS); 3556 } 3557 3558 static int 3559 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3560 { 3561 ibt_mr_attr_t mem_attr; 3562 ibd_lsobuf_t *buflist; 3563 ibd_lsobuf_t *lbufp; 3564 ibd_lsobuf_t *tail; 3565 ibd_lsobkt_t *bktp; 3566 uint8_t 
*membase; 3567 uint8_t *memp; 3568 uint_t memsz; 3569 int i; 3570 3571 /* 3572 * Allocate the lso bucket 3573 */ 3574 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3575 3576 /* 3577 * Allocate the entire lso memory and register it 3578 */ 3579 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ; 3580 membase = kmem_zalloc(memsz, KM_SLEEP); 3581 3582 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3583 mem_attr.mr_len = memsz; 3584 mem_attr.mr_as = NULL; 3585 mem_attr.mr_flags = IBT_MR_SLEEP; 3586 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3587 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3588 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3589 kmem_free(membase, memsz); 3590 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3591 return (DDI_FAILURE); 3592 } 3593 3594 mutex_enter(&state->id_lso_lock); 3595 3596 /* 3597 * Now allocate the buflist. Note that the elements in the buflist and 3598 * the buffers in the lso memory have a permanent 1-1 relation, so we 3599 * can always derive the address of a buflist entry from the address of 3600 * an lso buffer. 3601 */ 3602 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t), 3603 KM_SLEEP); 3604 3605 /* 3606 * Set up the lso buf chain 3607 */ 3608 memp = membase; 3609 lbufp = buflist; 3610 for (i = 0; i < state->id_num_lso_bufs; i++) { 3611 lbufp->lb_isfree = 1; 3612 lbufp->lb_buf = memp; 3613 lbufp->lb_next = lbufp + 1; 3614 3615 tail = lbufp; 3616 3617 memp += IBD_LSO_BUFSZ; 3618 lbufp++; 3619 } 3620 tail->lb_next = NULL; 3621 3622 /* 3623 * Set up the LSO buffer information in ibd state 3624 */ 3625 bktp->bkt_bufl = buflist; 3626 bktp->bkt_free_head = buflist; 3627 bktp->bkt_mem = membase; 3628 bktp->bkt_nelem = state->id_num_lso_bufs; 3629 bktp->bkt_nfree = bktp->bkt_nelem; 3630 3631 state->id_lso = bktp; 3632 mutex_exit(&state->id_lso_lock); 3633 3634 return (DDI_SUCCESS); 3635 } 3636 3637 /* 3638 * Statically allocate Tx buffer list(s). 
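 * Each swqe gets a fixed copy-buffer slot carved out of the single
 * region registered in ibd_alloc_tx_copybufs(): swqe i points at
 * id_tx_bufs + i * id_tx_buf_sz and reuses the one lkey returned by the
 * region-wide ibt_register_mr() call. LSO buffers, when enabled, come
 * from the separate bucket set up in ibd_alloc_tx_lsobufs().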
3639 */ 3640 static int 3641 ibd_init_txlist(ibd_state_t *state) 3642 { 3643 ibd_swqe_t *swqe; 3644 ibt_lkey_t lkey; 3645 int i; 3646 uint_t len; 3647 uint8_t *bufaddr; 3648 3649 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3650 return (DDI_FAILURE); 3651 3652 if (state->id_lso_policy && state->id_lso_capable) { 3653 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3654 state->id_lso_capable = B_FALSE; 3655 } 3656 3657 mutex_enter(&state->id_tx_list.dl_mutex); 3658 state->id_tx_list.dl_head = NULL; 3659 state->id_tx_list.dl_pending_sends = B_FALSE; 3660 state->id_tx_list.dl_cnt = 0; 3661 mutex_exit(&state->id_tx_list.dl_mutex); 3662 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3663 state->id_tx_rel_list.dl_head = NULL; 3664 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3665 state->id_tx_rel_list.dl_cnt = 0; 3666 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3667 3668 /* 3669 * Allocate and setup the swqe list 3670 */ 3671 lkey = state->id_tx_mr_desc.md_lkey; 3672 bufaddr = state->id_tx_bufs; 3673 len = state->id_tx_buf_sz; 3674 swqe = state->id_tx_wqes; 3675 mutex_enter(&state->id_tx_list.dl_mutex); 3676 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) { 3677 swqe->swqe_next = NULL; 3678 swqe->swqe_im_mblk = NULL; 3679 3680 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3681 bufaddr; 3682 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3683 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3684 3685 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3686 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3687 swqe->w_swr.wr_trans = IBT_UD_SRV; 3688 3689 /* These are set in send */ 3690 swqe->w_swr.wr_nds = 0; 3691 swqe->w_swr.wr_sgl = NULL; 3692 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3693 3694 /* add to list */ 3695 state->id_tx_list.dl_cnt++; 3696 swqe->swqe_next = state->id_tx_list.dl_head; 3697 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3698 } 3699 mutex_exit(&state->id_tx_list.dl_mutex); 3700 3701 return (DDI_SUCCESS); 3702 } 3703 3704 static int 3705 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3706 uint32_t *nds_p) 3707 { 3708 ibd_lsobkt_t *bktp; 3709 ibd_lsobuf_t *lbufp; 3710 ibd_lsobuf_t *nextp; 3711 ibt_lkey_t lso_lkey; 3712 uint_t frag_sz; 3713 uint_t num_needed; 3714 int i; 3715 3716 ASSERT(sgl_p != NULL); 3717 ASSERT(nds_p != NULL); 3718 ASSERT(req_sz != 0); 3719 3720 /* 3721 * Determine how many bufs we'd need for the size requested 3722 */ 3723 num_needed = req_sz / IBD_LSO_BUFSZ; 3724 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3725 num_needed++; 3726 3727 mutex_enter(&state->id_lso_lock); 3728 3729 /* 3730 * If we don't have enough lso bufs, return failure 3731 */ 3732 ASSERT(state->id_lso != NULL); 3733 bktp = state->id_lso; 3734 if (bktp->bkt_nfree < num_needed) { 3735 mutex_exit(&state->id_lso_lock); 3736 return (-1); 3737 } 3738 3739 /* 3740 * Pick the first 'num_needed' bufs from the free list 3741 */ 3742 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3743 lbufp = bktp->bkt_free_head; 3744 for (i = 0; i < num_needed; i++) { 3745 ASSERT(lbufp->lb_isfree != 0); 3746 ASSERT(lbufp->lb_buf != NULL); 3747 3748 nextp = lbufp->lb_next; 3749 3750 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3751 sgl_p[i].ds_key = lso_lkey; 3752 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3753 3754 lbufp->lb_isfree = 0; 3755 lbufp->lb_next = NULL; 3756 3757 lbufp = nextp; 3758 } 3759 bktp->bkt_free_head = lbufp; 3760 3761 /* 3762 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3763 * to adjust the last sgl entry's length. 
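 * For illustration only (the buffer size used here is assumed, not
 * taken from this code): if IBD_LSO_BUFSZ were 8192 and req_sz were
 * 20000, num_needed would be 3 and frag_sz 3616, so the third and last
 * sgl entry would be trimmed from 8192 down to 3616 bytes.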
Since we know we need atleast 3764 * one, the i-1 use below is ok. 3765 */ 3766 if (frag_sz) { 3767 sgl_p[i-1].ds_len = frag_sz; 3768 } 3769 3770 /* 3771 * Update nfree count and return 3772 */ 3773 bktp->bkt_nfree -= num_needed; 3774 3775 mutex_exit(&state->id_lso_lock); 3776 3777 *nds_p = num_needed; 3778 3779 return (0); 3780 } 3781 3782 static void 3783 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3784 { 3785 ibd_lsobkt_t *bktp; 3786 ibd_lsobuf_t *lbufp; 3787 uint8_t *lso_mem_end; 3788 uint_t ndx; 3789 int i; 3790 3791 mutex_enter(&state->id_lso_lock); 3792 3793 bktp = state->id_lso; 3794 ASSERT(bktp != NULL); 3795 3796 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3797 for (i = 0; i < nds; i++) { 3798 uint8_t *va; 3799 3800 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3801 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3802 3803 /* 3804 * Figure out the buflist element this sgl buffer corresponds 3805 * to and put it back at the head 3806 */ 3807 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3808 lbufp = bktp->bkt_bufl + ndx; 3809 3810 ASSERT(lbufp->lb_isfree == 0); 3811 ASSERT(lbufp->lb_buf == va); 3812 3813 lbufp->lb_isfree = 1; 3814 lbufp->lb_next = bktp->bkt_free_head; 3815 bktp->bkt_free_head = lbufp; 3816 } 3817 bktp->bkt_nfree += nds; 3818 3819 mutex_exit(&state->id_lso_lock); 3820 } 3821 3822 static void 3823 ibd_free_tx_copybufs(ibd_state_t *state) 3824 { 3825 /* 3826 * Unregister txbuf mr 3827 */ 3828 if (ibt_deregister_mr(state->id_hca_hdl, 3829 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3830 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3831 } 3832 state->id_tx_mr_hdl = NULL; 3833 3834 /* 3835 * Free txbuf memory 3836 */ 3837 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * 3838 sizeof (ibd_swqe_t)); 3839 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * 3840 state->id_tx_buf_sz); 3841 state->id_tx_wqes = NULL; 3842 state->id_tx_bufs = NULL; 3843 } 3844 3845 static void 3846 ibd_free_tx_lsobufs(ibd_state_t *state) 3847 { 3848 ibd_lsobkt_t *bktp; 3849 3850 mutex_enter(&state->id_lso_lock); 3851 3852 if ((bktp = state->id_lso) == NULL) { 3853 mutex_exit(&state->id_lso_lock); 3854 return; 3855 } 3856 3857 /* 3858 * First, free the buflist 3859 */ 3860 ASSERT(bktp->bkt_bufl != NULL); 3861 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3862 3863 /* 3864 * Unregister the LSO memory and free it 3865 */ 3866 ASSERT(bktp->bkt_mr_hdl != NULL); 3867 if (ibt_deregister_mr(state->id_hca_hdl, 3868 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3869 DPRINT(10, 3870 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3871 } 3872 ASSERT(bktp->bkt_mem); 3873 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3874 3875 /* 3876 * Finally free the bucket 3877 */ 3878 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3879 state->id_lso = NULL; 3880 3881 mutex_exit(&state->id_lso_lock); 3882 } 3883 3884 /* 3885 * Free the statically allocated Tx buffer list. 
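 * Teardown is the reverse of ibd_init_txlist(): both the free and the
 * release lists are emptied with their dl_mutex locks held (id_tx_list
 * taken before id_tx_rel_list), after which the LSO bucket and then the
 * copy-buffer region are deregistered and freed.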
3886 */ 3887 static void 3888 ibd_fini_txlist(ibd_state_t *state) 3889 { 3890 /* 3891 * Free the allocated swqes 3892 */ 3893 mutex_enter(&state->id_tx_list.dl_mutex); 3894 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3895 state->id_tx_list.dl_head = NULL; 3896 state->id_tx_list.dl_pending_sends = B_FALSE; 3897 state->id_tx_list.dl_cnt = 0; 3898 state->id_tx_rel_list.dl_head = NULL; 3899 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3900 state->id_tx_rel_list.dl_cnt = 0; 3901 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3902 mutex_exit(&state->id_tx_list.dl_mutex); 3903 3904 ibd_free_tx_lsobufs(state); 3905 ibd_free_tx_copybufs(state); 3906 } 3907 3908 /* 3909 * post a list of rwqes, NULL terminated. 3910 */ 3911 static void 3912 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3913 { 3914 uint_t i; 3915 uint_t num_posted; 3916 ibt_status_t ibt_status; 3917 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3918 3919 while (rwqe) { 3920 /* Post up to IBD_RX_POST_CNT receive work requests */ 3921 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3922 wrs[i] = rwqe->w_rwr; 3923 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3924 if (rwqe == NULL) { 3925 i++; 3926 break; 3927 } 3928 } 3929 3930 /* 3931 * If posting fails for some reason, we'll never receive 3932 * completion intimation, so we'll need to cleanup. But 3933 * we need to make sure we don't clean up nodes whose 3934 * wrs have been successfully posted. We assume that the 3935 * hca driver returns on the first failure to post and 3936 * therefore the first 'num_posted' entries don't need 3937 * cleanup here. 3938 */ 3939 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3940 3941 num_posted = 0; 3942 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3943 &num_posted); 3944 if (ibt_status != IBT_SUCCESS) { 3945 /* This cannot happen unless the device has an error. */ 3946 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3947 "posting multiple wrs failed: " 3948 "requested=%d, done=%d, ret=%d", 3949 IBD_RX_POST_CNT, num_posted, ibt_status); 3950 atomic_add_32(&state->id_rx_list.dl_cnt, 3951 num_posted - i); 3952 } 3953 } 3954 } 3955 3956 /* 3957 * Grab a list of rwqes from the array of lists, and post the list. 3958 */ 3959 static void 3960 ibd_post_recv_intr(ibd_state_t *state) 3961 { 3962 ibd_rx_queue_t *rxp; 3963 ibd_rwqe_t *list; 3964 3965 /* rotate through the rx_queue array, expecting an adequate number */ 3966 state->id_rx_post_queue_index = 3967 (state->id_rx_post_queue_index + 1) & 3968 (state->id_rx_nqueues - 1); 3969 3970 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3971 mutex_enter(&rxp->rx_post_lock); 3972 list = WQE_TO_RWQE(rxp->rx_head); 3973 rxp->rx_head = NULL; 3974 rxp->rx_cnt = 0; 3975 mutex_exit(&rxp->rx_post_lock); 3976 ibd_post_recv_list(state, list); 3977 } 3978 3979 /* macro explained below */ 3980 #define RX_QUEUE_HASH(rwqe) \ 3981 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3982 3983 /* 3984 * Add a rwqe to one of the the Rx lists. If the list is large enough 3985 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3986 * 3987 * Note: one of 2^N lists is chosen via a hash. This is done 3988 * because using one list is contentious. If the first list is busy 3989 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3990 * 3991 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3992 * even distribution of mapping rwqes to the 2^N queues. 
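 * For illustration only (the address and queue count here are assumed):
 * with id_rx_nqueues == 8, a rwqe at 0x30001200 hashes to
 * ((0x30001200 >> 8) & 7) == 2. The "rwqe + 16" retry in ibd_post_recv()
 * advances the hashed address by 16 rwqe-sized strides, which normally
 * lands the retry on a different queue.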
3993 */ 3994 static void 3995 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3996 { 3997 ibd_rx_queue_t *rxp; 3998 3999 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 4000 4001 if (!mutex_tryenter(&rxp->rx_post_lock)) { 4002 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 4003 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 4004 mutex_enter(&rxp->rx_post_lock); 4005 } 4006 rwqe->rwqe_next = rxp->rx_head; 4007 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 4008 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 4009 4010 /* only call ibt_post_recv() every Nth time through here */ 4011 if ((active & (state->id_rx_nqueues - 1)) == 0) { 4012 rxp->rx_head = NULL; 4013 rxp->rx_cnt = 0; 4014 mutex_exit(&rxp->rx_post_lock); 4015 ibd_post_recv_list(state, rwqe); 4016 return; 4017 } 4018 } 4019 rxp->rx_head = RWQE_TO_WQE(rwqe); 4020 mutex_exit(&rxp->rx_post_lock); 4021 } 4022 4023 static int 4024 ibd_alloc_rx_copybufs(ibd_state_t *state) 4025 { 4026 ibt_mr_attr_t mem_attr; 4027 int i; 4028 4029 /* 4030 * Allocate one big chunk for all regular rx copy bufs 4031 */ 4032 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 4033 4034 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe * 4035 state->id_rx_buf_sz, KM_SLEEP); 4036 4037 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe * 4038 sizeof (ibd_rwqe_t), KM_SLEEP); 4039 4040 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 4041 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 4042 sizeof (ibd_rx_queue_t), KM_SLEEP); 4043 for (i = 0; i < state->id_rx_nqueues; i++) { 4044 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4045 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 4046 } 4047 4048 /* 4049 * Do one memory registration on the entire rxbuf area 4050 */ 4051 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 4052 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz; 4053 mem_attr.mr_as = NULL; 4054 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 4055 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 4056 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 4057 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 4058 kmem_free(state->id_rx_wqes, 4059 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); 4060 kmem_free(state->id_rx_bufs, 4061 state->id_ud_num_rwqe * state->id_rx_buf_sz); 4062 state->id_rx_bufs = NULL; 4063 state->id_rx_wqes = NULL; 4064 return (DDI_FAILURE); 4065 } 4066 4067 return (DDI_SUCCESS); 4068 } 4069 4070 /* 4071 * Allocate the statically allocated Rx buffer list. 4072 */ 4073 static int 4074 ibd_init_rxlist(ibd_state_t *state) 4075 { 4076 ibd_rwqe_t *rwqe, *next; 4077 ibd_wqe_t *list; 4078 ibt_lkey_t lkey; 4079 int i; 4080 uint_t len; 4081 uint8_t *bufaddr; 4082 4083 mutex_enter(&state->id_rx_free_list.dl_mutex); 4084 if (state->id_rx_free_list.dl_head != NULL) { 4085 /* rx rsrcs were never freed. 
Just repost them */ 4086 len = state->id_rx_buf_sz; 4087 list = state->id_rx_free_list.dl_head; 4088 state->id_rx_free_list.dl_head = NULL; 4089 state->id_rx_free_list.dl_cnt = 0; 4090 mutex_exit(&state->id_rx_free_list.dl_mutex); 4091 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4092 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4093 if ((rwqe->rwqe_im_mblk = desballoc( 4094 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 4095 &rwqe->w_freemsg_cb)) == NULL) { 4096 /* allow freemsg_cb to free the rwqes */ 4097 if (atomic_dec_32_nv(&state->id_running) != 0) { 4098 cmn_err(CE_WARN, "ibd_init_rxlist: " 4099 "id_running was not 1\n"); 4100 } 4101 DPRINT(10, "ibd_init_rxlist : " 4102 "failed in desballoc()"); 4103 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4104 rwqe = next) { 4105 next = WQE_TO_RWQE(rwqe->rwqe_next); 4106 if (rwqe->rwqe_im_mblk) { 4107 atomic_inc_32(&state-> 4108 id_rx_list. 4109 dl_bufs_outstanding); 4110 freemsg(rwqe->rwqe_im_mblk); 4111 } else 4112 ibd_free_rwqe(state, rwqe); 4113 } 4114 atomic_inc_32(&state->id_running); 4115 return (DDI_FAILURE); 4116 } 4117 } 4118 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4119 return (DDI_SUCCESS); 4120 } 4121 mutex_exit(&state->id_rx_free_list.dl_mutex); 4122 4123 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 4124 return (DDI_FAILURE); 4125 4126 /* 4127 * Allocate and setup the rwqe list 4128 */ 4129 len = state->id_rx_buf_sz; 4130 lkey = state->id_rx_mr_desc.md_lkey; 4131 rwqe = state->id_rx_wqes; 4132 bufaddr = state->id_rx_bufs; 4133 list = NULL; 4134 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) { 4135 rwqe->w_state = state; 4136 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 4137 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 4138 4139 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 4140 4141 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 4142 &rwqe->w_freemsg_cb)) == NULL) { 4143 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 4144 /* allow freemsg_cb to free the rwqes */ 4145 if (atomic_dec_32_nv(&state->id_running) != 0) { 4146 cmn_err(CE_WARN, "ibd_init_rxlist: " 4147 "id_running was not 1\n"); 4148 } 4149 DPRINT(10, "ibd_init_rxlist : " 4150 "failed in desballoc()"); 4151 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4152 rwqe = next) { 4153 next = WQE_TO_RWQE(rwqe->rwqe_next); 4154 freemsg(rwqe->rwqe_im_mblk); 4155 } 4156 atomic_inc_32(&state->id_running); 4157 4158 /* remove reference to free'd rwqes */ 4159 mutex_enter(&state->id_rx_free_list.dl_mutex); 4160 state->id_rx_free_list.dl_head = NULL; 4161 state->id_rx_free_list.dl_cnt = 0; 4162 mutex_exit(&state->id_rx_free_list.dl_mutex); 4163 4164 ibd_fini_rxlist(state); 4165 return (DDI_FAILURE); 4166 } 4167 4168 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 4169 rwqe->rwqe_copybuf.ic_sgl.ds_va = 4170 (ib_vaddr_t)(uintptr_t)bufaddr; 4171 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 4172 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 4173 rwqe->w_rwr.wr_nds = 1; 4174 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 4175 4176 rwqe->rwqe_next = list; 4177 list = RWQE_TO_WQE(rwqe); 4178 } 4179 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4180 4181 return (DDI_SUCCESS); 4182 } 4183 4184 static void 4185 ibd_free_rx_copybufs(ibd_state_t *state) 4186 { 4187 int i; 4188 4189 /* 4190 * Unregister rxbuf mr 4191 */ 4192 if (ibt_deregister_mr(state->id_hca_hdl, 4193 state->id_rx_mr_hdl) != IBT_SUCCESS) { 4194 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 4195 } 4196 state->id_rx_mr_hdl = NULL; 4197 4198 /* 4199 * Free rxbuf memory 4200 */ 4201 for (i = 0; i < 
state->id_rx_nqueues; i++) { 4202 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4203 mutex_destroy(&rxp->rx_post_lock); 4204 } 4205 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 4206 sizeof (ibd_rx_queue_t)); 4207 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * 4208 sizeof (ibd_rwqe_t)); 4209 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * 4210 state->id_rx_buf_sz); 4211 state->id_rx_queues = NULL; 4212 state->id_rx_wqes = NULL; 4213 state->id_rx_bufs = NULL; 4214 } 4215 4216 static void 4217 ibd_free_rx_rsrcs(ibd_state_t *state) 4218 { 4219 mutex_enter(&state->id_rx_free_list.dl_mutex); 4220 if (state->id_rx_free_list.dl_head == NULL) { 4221 /* already freed */ 4222 mutex_exit(&state->id_rx_free_list.dl_mutex); 4223 return; 4224 } 4225 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe); 4226 ibd_free_rx_copybufs(state); 4227 state->id_rx_free_list.dl_cnt = 0; 4228 state->id_rx_free_list.dl_head = NULL; 4229 mutex_exit(&state->id_rx_free_list.dl_mutex); 4230 } 4231 4232 /* 4233 * Free the statically allocated Rx buffer list. 4234 */ 4235 static void 4236 ibd_fini_rxlist(ibd_state_t *state) 4237 { 4238 ibd_rwqe_t *rwqe; 4239 int i; 4240 4241 /* run through the rx_queue's, calling freemsg() */ 4242 for (i = 0; i < state->id_rx_nqueues; i++) { 4243 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4244 mutex_enter(&rxp->rx_post_lock); 4245 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 4246 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4247 freemsg(rwqe->rwqe_im_mblk); 4248 rxp->rx_cnt--; 4249 } 4250 rxp->rx_head = NULL; 4251 mutex_exit(&rxp->rx_post_lock); 4252 } 4253 4254 /* cannot free rx resources unless gld returned everything */ 4255 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 4256 ibd_free_rx_rsrcs(state); 4257 } 4258 4259 /* 4260 * Free an allocated recv wqe. 4261 */ 4262 /* ARGSUSED */ 4263 static void 4264 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 4265 { 4266 /* 4267 * desballoc() failed (no memory). 4268 * 4269 * This rwqe is placed on a free list so that it 4270 * can be reinstated when memory is available. 4271 * 4272 * NOTE: no code currently exists to reinstate 4273 * these "lost" rwqes. 4274 */ 4275 mutex_enter(&state->id_rx_free_list.dl_mutex); 4276 state->id_rx_free_list.dl_cnt++; 4277 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4278 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4279 mutex_exit(&state->id_rx_free_list.dl_mutex); 4280 } 4281 4282 /* 4283 * IBA Rx completion queue handler. Guaranteed to be single 4284 * threaded and nonreentrant for this CQ. 4285 */ 4286 /* ARGSUSED */ 4287 static void 4288 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4289 { 4290 ibd_state_t *state = (ibd_state_t *)arg; 4291 4292 atomic_inc_64(&state->id_num_intrs); 4293 4294 if (ibd_rx_softintr == 1) { 4295 mutex_enter(&state->id_rcq_poll_lock); 4296 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4297 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4298 mutex_exit(&state->id_rcq_poll_lock); 4299 return; 4300 } else { 4301 mutex_exit(&state->id_rcq_poll_lock); 4302 ddi_trigger_softintr(state->id_rx); 4303 } 4304 } else 4305 (void) ibd_intr((caddr_t)state); 4306 } 4307 4308 /* 4309 * CQ handler for Tx completions, when the Tx CQ is in 4310 * interrupt driven mode. 
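 * Like ibd_rcq_handler() above, this uses the id_scq_poll_busy bits as
 * a small handshake: if a poll is already in progress (IBD_CQ_POLLING
 * set), the handler only records IBD_REDO_CQ_POLLING and returns, so
 * the thread that is already polling can make another pass instead of
 * a redundant soft interrupt being triggered.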
4311 */ 4312 /* ARGSUSED */ 4313 static void 4314 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4315 { 4316 ibd_state_t *state = (ibd_state_t *)arg; 4317 4318 atomic_inc_64(&state->id_num_intrs); 4319 4320 if (ibd_tx_softintr == 1) { 4321 mutex_enter(&state->id_scq_poll_lock); 4322 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4323 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4324 mutex_exit(&state->id_scq_poll_lock); 4325 return; 4326 } else { 4327 mutex_exit(&state->id_scq_poll_lock); 4328 ddi_trigger_softintr(state->id_tx); 4329 } 4330 } else 4331 (void) ibd_tx_recycle((caddr_t)state); 4332 } 4333 4334 /* 4335 * Multicast group create/delete trap handler. These will be delivered 4336 * on a kernel thread (handling can thus block) and can be invoked 4337 * concurrently. The handler can be invoked anytime after it is 4338 * registered and before ibt_detach(). 4339 */ 4340 /* ARGSUSED */ 4341 static void 4342 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4343 ibt_subnet_event_t *event) 4344 { 4345 ibd_state_t *state = (ibd_state_t *)arg; 4346 ibd_req_t *req; 4347 4348 /* 4349 * The trap handler will get invoked once for every event for 4350 * every port. The input "gid" is the GID0 of the port the 4351 * trap came in on; we just need to act on traps that came 4352 * to our port, meaning the port on which the ipoib interface 4353 * resides. Since ipoib uses GID0 of the port, we just match 4354 * the gids to check whether we need to handle the trap. 4355 */ 4356 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4357 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4358 return; 4359 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4360 4361 DPRINT(10, "ibd_notices_handler : %d\n", code); 4362 4363 switch (code) { 4364 case IBT_SM_EVENT_UNAVAILABLE: 4365 /* 4366 * If we are in promiscuous mode or have 4367 * sendnonmembers, we need to print a warning 4368 * message right now. Else, just store the 4369 * information, print when we enter promiscuous 4370 * mode or attempt nonmember send. We might 4371 * also want to stop caching sendnonmember. 4372 */ 4373 ibd_print_warn(state, "IBA multicast support " 4374 "degraded due to unavailability of multicast " 4375 "traps"); 4376 break; 4377 case IBT_SM_EVENT_AVAILABLE: 4378 /* 4379 * If we printed a warning message above or 4380 * while trying to nonmember send or get into 4381 * promiscuous mode, print an okay message. 4382 */ 4383 ibd_print_warn(state, "IBA multicast support " 4384 "restored due to availability of multicast " 4385 "traps"); 4386 break; 4387 case IBT_SM_EVENT_MCG_CREATED: 4388 case IBT_SM_EVENT_MCG_DELETED: 4389 /* 4390 * If it is a "deleted" event and we are in late hca 4391 * init, nothing to do. 4392 */ 4393 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4394 IBD_DRV_IN_LATE_HCA_INIT) && (code == 4395 IBT_SM_EVENT_MCG_DELETED)) { 4396 break; 4397 } 4398 /* 4399 * Common processing of creation/deletion traps. 4400 * First check if the instance is being 4401 * [de]initialized; back off then, without doing 4402 * anything more, since we are not sure if the 4403 * async thread is around, or whether we might 4404 * be racing with the detach code in ibd_m_stop() 4405 * that scans the mcg list. 
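 * When it is safe to proceed, the trap is not handled inline: a request
 * carrying the notice GID and the event code is queued to the async
 * thread as IBD_ASYNC_TRAP, and the potentially blocking join/leave
 * work is done later in ibd_async_trap().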
4406 */ 4407 if (!ibd_async_safe(state)) 4408 return; 4409 4410 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4411 req->rq_gid = event->sm_notice_gid; 4412 req->rq_ptr = (void *)code; 4413 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4414 break; 4415 } 4416 } 4417 4418 static void 4419 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4420 { 4421 ib_gid_t mgid = req->rq_gid; 4422 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4423 int ret; 4424 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff; 4425 4426 DPRINT(10, "ibd_async_trap : %d\n", code); 4427 4428 /* 4429 * Check if we have already joined the IPoIB broadcast group for our 4430 * PKEY. If joined, perform the rest of the operation. 4431 * Else, the interface is not initialised. Do the initialisation here 4432 * by calling ibd_start() and return. 4433 */ 4434 4435 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4436 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) && 4437 (code == IBT_SM_EVENT_MCG_CREATED)) { 4438 /* 4439 * If we are in late HCA init and a notification for the 4440 * creation of a MCG came in, check if it is the IPoIB MCG for 4441 * this pkey. If not, return. 4442 */ 4443 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey != 4444 state->id_pkey)) { 4445 ibd_async_done(state); 4446 return; 4447 } 4448 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4449 /* 4450 * Check if there is still a necessity to start the interface. 4451 * It is possible that the user attempted unplumb at just about 4452 * the same time, and if unplumb succeeded, we have nothing to 4453 * do. 4454 */ 4455 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4456 IBD_DRV_IN_LATE_HCA_INIT) && 4457 ((ret = ibd_start(state)) != 0)) { 4458 DPRINT(10, "ibd_async_trap: cannot start from late HCA " 4459 "init, ret=%d", ret); 4460 } 4461 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4462 ibd_async_done(state); 4463 return; 4464 } 4465 4466 /* 4467 * Atomically search the nonmember and sendonlymember lists and 4468 * delete. 4469 */ 4470 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4471 4472 if (state->id_prom_op == IBD_OP_COMPLETED) { 4473 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4474 4475 /* 4476 * If in promiscuous mode, try to join/attach to the new 4477 * mcg. Given the unreliable out-of-order mode of trap 4478 * delivery, we can never be sure whether it is a problem 4479 * if the join fails. Thus, we warn the admin of a failure 4480 * if this was a creation trap. Note that the trap might 4481 * actually be reporting a long past event, and the mcg 4482 * might already have been deleted, thus we might be warning 4483 * in vain. 4484 */ 4485 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4486 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4487 ibd_print_warn(state, "IBA promiscuous mode missed " 4488 "new multicast gid %016llx:%016llx", 4489 (u_longlong_t)mgid.gid_prefix, 4490 (u_longlong_t)mgid.gid_guid); 4491 } 4492 4493 /* 4494 * Free the request slot allocated by the subnet event thread. 4495 */ 4496 ibd_async_done(state); 4497 } 4498 4499 /* 4500 * GLDv3 entry point to get capabilities. 
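 * Only two capabilities are reported, and only for partition objects:
 * full hardware checksum (when the HCA offers IBT_HCA_CKSUM_FULL) and
 * basic TCP/IPv4 LSO (when LSO is enabled by policy, the HCA supports
 * it, and the reserved-lkey capability is present).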
4501 */ 4502 static boolean_t 4503 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4504 { 4505 ibd_state_t *state = arg; 4506 4507 if (state->id_type == IBD_PORT_DRIVER) 4508 return (B_FALSE); 4509 4510 switch (cap) { 4511 case MAC_CAPAB_HCKSUM: { 4512 uint32_t *txflags = cap_data; 4513 4514 /* 4515 * We either do full checksum or not do it at all 4516 */ 4517 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4518 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4519 else 4520 return (B_FALSE); 4521 break; 4522 } 4523 4524 case MAC_CAPAB_LSO: { 4525 mac_capab_lso_t *cap_lso = cap_data; 4526 4527 /* 4528 * In addition to the capability and policy, since LSO 4529 * relies on hw checksum, we'll not enable LSO if we 4530 * don't have hw checksum. Of course, if the HCA doesn't 4531 * provide the reserved lkey capability, enabling LSO will 4532 * actually affect performance adversely, so we'll disable 4533 * LSO even for that case. 4534 */ 4535 if (!state->id_lso_policy || !state->id_lso_capable) 4536 return (B_FALSE); 4537 4538 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4539 return (B_FALSE); 4540 4541 if (state->id_hca_res_lkey_capab == 0) { 4542 ibd_print_warn(state, "no reserved-lkey capability, " 4543 "disabling LSO"); 4544 return (B_FALSE); 4545 } 4546 4547 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4548 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4549 break; 4550 } 4551 4552 default: 4553 return (B_FALSE); 4554 } 4555 4556 return (B_TRUE); 4557 } 4558 4559 /* 4560 * callback function for set/get of properties 4561 */ 4562 static int 4563 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4564 uint_t pr_valsize, const void *pr_val) 4565 { 4566 ibd_state_t *state = arg; 4567 int err = 0; 4568 uint32_t link_mode; 4569 4570 /* Cannot set properties on a port driver */ 4571 if (state->id_type == IBD_PORT_DRIVER) { 4572 return (ENOTSUP); 4573 } 4574 4575 switch (pr_num) { 4576 case MAC_PROP_IB_LINKMODE: 4577 if (state->id_mac_state & IBD_DRV_STARTED) { 4578 err = EBUSY; 4579 break; 4580 } 4581 if (pr_val == NULL) { 4582 err = EINVAL; 4583 break; 4584 } 4585 bcopy(pr_val, &link_mode, sizeof (link_mode)); 4586 if (link_mode != IBD_LINK_MODE_UD && 4587 link_mode != IBD_LINK_MODE_RC) { 4588 err = EINVAL; 4589 } else { 4590 if (link_mode == IBD_LINK_MODE_RC) { 4591 if (state->id_enable_rc) { 4592 return (0); 4593 } 4594 state->id_enable_rc = 1; 4595 /* inform MAC framework of new MTU */ 4596 err = mac_maxsdu_update2(state->id_mh, 4597 state->rc_mtu - IPOIB_HDRSIZE, 4598 state->id_mtu - IPOIB_HDRSIZE); 4599 } else { 4600 if (!state->id_enable_rc) { 4601 return (0); 4602 } 4603 state->id_enable_rc = 0; 4604 err = mac_maxsdu_update2(state->id_mh, 4605 state->id_mtu - IPOIB_HDRSIZE, 4606 state->id_mtu - IPOIB_HDRSIZE); 4607 } 4608 (void) ibd_record_capab(state); 4609 mac_capab_update(state->id_mh); 4610 } 4611 break; 4612 case MAC_PROP_PRIVATE: 4613 err = ibd_set_priv_prop(state, pr_name, 4614 pr_valsize, pr_val); 4615 break; 4616 default: 4617 err = ENOTSUP; 4618 break; 4619 } 4620 return (err); 4621 } 4622 4623 static int 4624 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4625 uint_t pr_valsize, void *pr_val) 4626 { 4627 ibd_state_t *state = arg; 4628 int err = 0; 4629 4630 switch (pr_num) { 4631 case MAC_PROP_MTU: 4632 break; 4633 default: 4634 if (state->id_type == IBD_PORT_DRIVER) { 4635 return (ENOTSUP); 4636 } 4637 break; 4638 } 4639 4640 switch (pr_num) { 4641 case MAC_PROP_IB_LINKMODE: 4642 *(uint_t *)pr_val = 
state->id_enable_rc; 4643 break; 4644 case MAC_PROP_PRIVATE: 4645 err = ibd_get_priv_prop(state, pr_name, pr_valsize, 4646 pr_val); 4647 break; 4648 default: 4649 err = ENOTSUP; 4650 break; 4651 } 4652 return (err); 4653 } 4654 4655 static void 4656 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4657 mac_prop_info_handle_t prh) 4658 { 4659 ibd_state_t *state = arg; 4660 4661 switch (pr_num) { 4662 case MAC_PROP_IB_LINKMODE: { 4663 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE); 4664 break; 4665 } 4666 case MAC_PROP_MTU: { 4667 uint32_t min, max; 4668 if (state->id_type == IBD_PORT_DRIVER) { 4669 min = 1500; 4670 max = IBD_DEF_RC_MAX_SDU; 4671 } else if (state->id_enable_rc) { 4672 min = max = IBD_DEF_RC_MAX_SDU; 4673 } else { 4674 min = max = state->id_mtu - IPOIB_HDRSIZE; 4675 } 4676 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4677 mac_prop_info_set_range_uint32(prh, min, max); 4678 break; 4679 } 4680 case MAC_PROP_PRIVATE: { 4681 char valstr[64]; 4682 int value; 4683 4684 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 4685 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4686 return; 4687 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4688 value = IBD_DEF_COALESCE_COMPLETIONS; 4689 } else if (strcmp(pr_name, 4690 "_ibd_create_broadcast_group") == 0) { 4691 value = IBD_DEF_CREATE_BCAST_GROUP; 4692 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4693 value = IBD_DEF_HASH_SIZE; 4694 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4695 value = IBD_DEF_LSO_POLICY; 4696 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4697 value = IBD_DEF_NUM_AH; 4698 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4699 value = IBD_DEF_NUM_LSO_BUFS; 4700 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4701 value = IBD_DEF_RC_ENABLE_SRQ; 4702 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4703 value = IBD_DEF_RC_NUM_RWQE; 4704 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4705 value = IBD_DEF_RC_NUM_SRQ; 4706 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4707 value = IBD_DEF_RC_NUM_SWQE; 4708 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4709 value = IBD_DEF_RC_RX_COMP_COUNT; 4710 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4711 value = IBD_DEF_RC_RX_COMP_USEC; 4712 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4713 value = IBD_DEF_RC_RX_COPY_THRESH; 4714 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4715 value = IBD_DEF_RC_RX_RWQE_THRESH; 4716 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4717 value = IBD_DEF_RC_TX_COMP_COUNT; 4718 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4719 value = IBD_DEF_RC_TX_COMP_USEC; 4720 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4721 value = IBD_DEF_RC_TX_COPY_THRESH; 4722 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4723 value = IBD_DEF_UD_NUM_RWQE; 4724 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4725 value = IBD_DEF_UD_NUM_SWQE; 4726 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4727 value = IBD_DEF_UD_RX_COMP_COUNT; 4728 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4729 value = IBD_DEF_UD_RX_COMP_USEC; 4730 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4731 value = IBD_DEF_UD_TX_COMP_COUNT; 4732 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4733 value = IBD_DEF_UD_TX_COMP_USEC; 4734 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4735 value = IBD_DEF_UD_TX_COPY_THRESH; 4736 
} else { 4737 return; 4738 } 4739 4740 (void) snprintf(valstr, sizeof (valstr), "%d", value); 4741 mac_prop_info_set_default_str(prh, valstr); 4742 break; 4743 } 4744 } /* switch (pr_num) */ 4745 } 4746 4747 /* ARGSUSED2 */ 4748 static int 4749 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name, 4750 uint_t pr_valsize, const void *pr_val) 4751 { 4752 int err = 0; 4753 long result; 4754 4755 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4756 if (pr_val == NULL) { 4757 return (EINVAL); 4758 } 4759 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4760 if (result < 0 || result > 1) { 4761 err = EINVAL; 4762 } else { 4763 state->id_allow_coalesce_comp_tuning = (result == 1) ? 4764 B_TRUE: B_FALSE; 4765 } 4766 return (err); 4767 } 4768 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 4769 if (state->id_mac_state & IBD_DRV_STARTED) { 4770 return (EBUSY); 4771 } 4772 if (pr_val == NULL) { 4773 return (EINVAL); 4774 } 4775 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4776 if (result < 0 || result > 1) { 4777 err = EINVAL; 4778 } else { 4779 state->id_create_broadcast_group = (result == 1) ? 4780 B_TRUE: B_FALSE; 4781 } 4782 return (err); 4783 } 4784 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4785 if (state->id_mac_state & IBD_DRV_STARTED) { 4786 return (EBUSY); 4787 } 4788 if (pr_val == NULL) { 4789 return (EINVAL); 4790 } 4791 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4792 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) { 4793 err = EINVAL; 4794 } else { 4795 state->id_hash_size = (uint32_t)result; 4796 } 4797 return (err); 4798 } 4799 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4800 if (state->id_mac_state & IBD_DRV_STARTED) { 4801 return (EBUSY); 4802 } 4803 if (pr_val == NULL) { 4804 return (EINVAL); 4805 } 4806 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4807 if (result < 0 || result > 1) { 4808 err = EINVAL; 4809 } else { 4810 state->id_lso_policy = (result == 1) ? 4811 B_TRUE: B_FALSE; 4812 } 4813 mac_capab_update(state->id_mh); 4814 return (err); 4815 } 4816 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4817 if (state->id_mac_state & IBD_DRV_STARTED) { 4818 return (EBUSY); 4819 } 4820 if (pr_val == NULL) { 4821 return (EINVAL); 4822 } 4823 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4824 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) { 4825 err = EINVAL; 4826 } else { 4827 state->id_num_ah = (uint32_t)result; 4828 } 4829 return (err); 4830 } 4831 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4832 if (state->id_mac_state & IBD_DRV_STARTED) { 4833 return (EBUSY); 4834 } 4835 if (!state->id_lso_policy || !state->id_lso_capable) { 4836 return (EINVAL); 4837 } 4838 if (pr_val == NULL) { 4839 return (EINVAL); 4840 } 4841 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4842 if (result < IBD_MIN_NUM_LSO_BUFS || 4843 result > IBD_MAX_NUM_LSO_BUFS) { 4844 err = EINVAL; 4845 } else { 4846 state->id_num_lso_bufs = (uint32_t)result; 4847 } 4848 return (err); 4849 } 4850 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4851 if (state->id_mac_state & IBD_DRV_STARTED) { 4852 return (EBUSY); 4853 } 4854 if (pr_val == NULL) { 4855 return (EINVAL); 4856 } 4857 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4858 if (result < 0 || result > 1) { 4859 err = EINVAL; 4860 } else { 4861 state->rc_enable_srq = (result == 1) ? 
4862 B_TRUE: B_FALSE; 4863 } 4864 if (!state->rc_enable_srq) { 4865 state->id_rc_num_srq = 0; 4866 } 4867 return (err); 4868 } 4869 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4870 if (state->id_mac_state & IBD_DRV_STARTED) { 4871 return (EBUSY); 4872 } 4873 if (pr_val == NULL) { 4874 return (EINVAL); 4875 } 4876 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4877 if (result < IBD_MIN_RC_NUM_RWQE || 4878 result > IBD_MAX_RC_NUM_RWQE) { 4879 err = EINVAL; 4880 } else { 4881 state->id_rc_num_rwqe = (uint32_t)result; 4882 if (state->id_allow_coalesce_comp_tuning && 4883 state->id_rc_rx_comp_count > state->id_rc_num_rwqe) 4884 state->id_rc_rx_comp_count = 4885 state->id_rc_num_rwqe; 4886 if (state->id_rc_num_srq > state->id_rc_num_rwqe) 4887 state->id_rc_num_srq = 4888 state->id_rc_num_rwqe - 1; 4889 /* 4890 * If rx_rwqe_threshold is greater than the number of 4891 * rwqes, pull it back to 25% of number of rwqes. 4892 */ 4893 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe) 4894 state->id_rc_rx_rwqe_thresh = 4895 (state->id_rc_num_rwqe >> 2); 4896 4897 } 4898 return (err); 4899 } 4900 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4901 if (state->id_mac_state & IBD_DRV_STARTED) { 4902 return (EBUSY); 4903 } 4904 if (pr_val == NULL) { 4905 return (EINVAL); 4906 } 4907 if (!state->rc_enable_srq) 4908 return (EINVAL); 4909 4910 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4911 if (result < IBD_MIN_RC_NUM_SRQ || 4912 result >= state->id_rc_num_rwqe) { 4913 err = EINVAL; 4914 } else 4915 state->id_rc_num_srq = (uint32_t)result; 4916 return (err); 4917 } 4918 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4919 if (state->id_mac_state & IBD_DRV_STARTED) { 4920 return (EBUSY); 4921 } 4922 if (pr_val == NULL) { 4923 return (EINVAL); 4924 } 4925 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4926 if (result < IBD_MIN_RC_NUM_SWQE || 4927 result > IBD_MAX_RC_NUM_SWQE) { 4928 err = EINVAL; 4929 } else { 4930 state->id_rc_num_swqe = (uint32_t)result; 4931 if (state->id_allow_coalesce_comp_tuning && 4932 state->id_rc_tx_comp_count > state->id_rc_num_swqe) 4933 state->id_rc_tx_comp_count = 4934 state->id_rc_num_swqe; 4935 } 4936 return (err); 4937 } 4938 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4939 if (!state->id_allow_coalesce_comp_tuning) { 4940 return (ENOTSUP); 4941 } 4942 if (pr_val == NULL) { 4943 return (EINVAL); 4944 } 4945 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4946 if (result < 1 || result > state->id_rc_num_rwqe) { 4947 err = EINVAL; 4948 } else { 4949 state->id_rc_rx_comp_count = (uint32_t)result; 4950 } 4951 return (err); 4952 } 4953 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4954 if (!state->id_allow_coalesce_comp_tuning) { 4955 return (ENOTSUP); 4956 } 4957 if (pr_val == NULL) { 4958 return (EINVAL); 4959 } 4960 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4961 if (result < 1) { 4962 err = EINVAL; 4963 } else { 4964 state->id_rc_rx_comp_usec = (uint32_t)result; 4965 } 4966 return (err); 4967 } 4968 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4969 if (state->id_mac_state & IBD_DRV_STARTED) { 4970 return (EBUSY); 4971 } 4972 if (pr_val == NULL) { 4973 return (EINVAL); 4974 } 4975 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4976 if (result < IBD_MIN_RC_RX_COPY_THRESH || 4977 result > state->rc_mtu) { 4978 err = EINVAL; 4979 } else { 4980 state->id_rc_rx_copy_thresh = (uint32_t)result; 4981 } 4982 return (err); 4983 } 4984 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4985 if 
(state->id_mac_state & IBD_DRV_STARTED) { 4986 return (EBUSY); 4987 } 4988 if (pr_val == NULL) { 4989 return (EINVAL); 4990 } 4991 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4992 if (result < IBD_MIN_RC_RX_RWQE_THRESH || 4993 result >= state->id_rc_num_rwqe) { 4994 err = EINVAL; 4995 } else { 4996 state->id_rc_rx_rwqe_thresh = (uint32_t)result; 4997 } 4998 return (err); 4999 } 5000 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 5001 if (!state->id_allow_coalesce_comp_tuning) { 5002 return (ENOTSUP); 5003 } 5004 if (pr_val == NULL) { 5005 return (EINVAL); 5006 } 5007 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5008 if (result < 1 || result > state->id_rc_num_swqe) { 5009 err = EINVAL; 5010 } else { 5011 state->id_rc_tx_comp_count = (uint32_t)result; 5012 } 5013 return (err); 5014 } 5015 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5016 if (!state->id_allow_coalesce_comp_tuning) { 5017 return (ENOTSUP); 5018 } 5019 if (pr_val == NULL) { 5020 return (EINVAL); 5021 } 5022 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5023 if (result < 1) 5024 err = EINVAL; 5025 else { 5026 state->id_rc_tx_comp_usec = (uint32_t)result; 5027 } 5028 return (err); 5029 } 5030 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5031 if (state->id_mac_state & IBD_DRV_STARTED) { 5032 return (EBUSY); 5033 } 5034 if (pr_val == NULL) { 5035 return (EINVAL); 5036 } 5037 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5038 if (result < IBD_MIN_RC_TX_COPY_THRESH || 5039 result > state->rc_mtu) { 5040 err = EINVAL; 5041 } else { 5042 state->id_rc_tx_copy_thresh = (uint32_t)result; 5043 } 5044 return (err); 5045 } 5046 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5047 if (state->id_mac_state & IBD_DRV_STARTED) { 5048 return (EBUSY); 5049 } 5050 if (pr_val == NULL) { 5051 return (EINVAL); 5052 } 5053 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5054 if (result < IBD_MIN_UD_NUM_RWQE || 5055 result > IBD_MAX_UD_NUM_RWQE) { 5056 err = EINVAL; 5057 } else { 5058 if (result > state->id_hca_max_chan_sz) { 5059 state->id_ud_num_rwqe = 5060 state->id_hca_max_chan_sz; 5061 } else { 5062 state->id_ud_num_rwqe = (uint32_t)result; 5063 } 5064 if (state->id_allow_coalesce_comp_tuning && 5065 state->id_ud_rx_comp_count > state->id_ud_num_rwqe) 5066 state->id_ud_rx_comp_count = 5067 state->id_ud_num_rwqe; 5068 } 5069 return (err); 5070 } 5071 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5072 if (state->id_mac_state & IBD_DRV_STARTED) { 5073 return (EBUSY); 5074 } 5075 if (pr_val == NULL) { 5076 return (EINVAL); 5077 } 5078 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5079 if (result < IBD_MIN_UD_NUM_SWQE || 5080 result > IBD_MAX_UD_NUM_SWQE) { 5081 err = EINVAL; 5082 } else { 5083 if (result > state->id_hca_max_chan_sz) { 5084 state->id_ud_num_swqe = 5085 state->id_hca_max_chan_sz; 5086 } else { 5087 state->id_ud_num_swqe = (uint32_t)result; 5088 } 5089 if (state->id_allow_coalesce_comp_tuning && 5090 state->id_ud_tx_comp_count > state->id_ud_num_swqe) 5091 state->id_ud_tx_comp_count = 5092 state->id_ud_num_swqe; 5093 } 5094 return (err); 5095 } 5096 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5097 if (!state->id_allow_coalesce_comp_tuning) { 5098 return (ENOTSUP); 5099 } 5100 if (pr_val == NULL) { 5101 return (EINVAL); 5102 } 5103 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5104 if (result < 1 || result > state->id_ud_num_rwqe) { 5105 err = EINVAL; 5106 } else { 5107 state->id_ud_rx_comp_count = (uint32_t)result; 5108 } 5109 return (err); 5110 } 
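/*
 * The remaining UD completion-moderation knobs below follow the same
 * pattern as above: reject the update unless _ibd_coalesce_completions
 * has been enabled, parse the string with ddi_strtol(), range-check,
 * and store. For illustration only (the partition link name is
 * assumed), such a property would be tuned with something like:
 *	dladm set-linkprop -p _ibd_ud_rx_comp_usec=10 part0
 */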
5111 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5112 if (!state->id_allow_coalesce_comp_tuning) { 5113 return (ENOTSUP); 5114 } 5115 if (pr_val == NULL) { 5116 return (EINVAL); 5117 } 5118 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5119 if (result < 1) { 5120 err = EINVAL; 5121 } else { 5122 state->id_ud_rx_comp_usec = (uint32_t)result; 5123 } 5124 return (err); 5125 } 5126 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5127 if (!state->id_allow_coalesce_comp_tuning) { 5128 return (ENOTSUP); 5129 } 5130 if (pr_val == NULL) { 5131 return (EINVAL); 5132 } 5133 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5134 if (result < 1 || result > state->id_ud_num_swqe) { 5135 err = EINVAL; 5136 } else { 5137 state->id_ud_tx_comp_count = (uint32_t)result; 5138 } 5139 return (err); 5140 } 5141 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5142 if (!state->id_allow_coalesce_comp_tuning) { 5143 return (ENOTSUP); 5144 } 5145 if (pr_val == NULL) { 5146 return (EINVAL); 5147 } 5148 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5149 if (result < 1) { 5150 err = EINVAL; 5151 } else { 5152 state->id_ud_tx_comp_usec = (uint32_t)result; 5153 } 5154 return (err); 5155 } 5156 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5157 if (state->id_mac_state & IBD_DRV_STARTED) { 5158 return (EBUSY); 5159 } 5160 if (pr_val == NULL) { 5161 return (EINVAL); 5162 } 5163 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5164 if (result < IBD_MIN_UD_TX_COPY_THRESH || 5165 result > IBD_MAX_UD_TX_COPY_THRESH) { 5166 err = EINVAL; 5167 } else { 5168 state->id_ud_tx_copy_thresh = (uint32_t)result; 5169 } 5170 return (err); 5171 } 5172 return (ENOTSUP); 5173 } 5174 5175 static int 5176 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, 5177 void *pr_val) 5178 { 5179 int err = ENOTSUP; 5180 int value; 5181 5182 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 5183 value = state->id_bgroup_present; 5184 err = 0; 5185 goto done; 5186 } 5187 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 5188 value = state->id_allow_coalesce_comp_tuning; 5189 err = 0; 5190 goto done; 5191 } 5192 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 5193 value = state->id_create_broadcast_group; 5194 err = 0; 5195 goto done; 5196 } 5197 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 5198 value = state->id_hash_size; 5199 err = 0; 5200 goto done; 5201 } 5202 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 5203 value = state->id_lso_policy; 5204 err = 0; 5205 goto done; 5206 } 5207 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 5208 value = state->id_num_ah; 5209 err = 0; 5210 goto done; 5211 } 5212 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 5213 value = state->id_num_lso_bufs; 5214 err = 0; 5215 goto done; 5216 } 5217 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 5218 value = state->rc_enable_srq; 5219 err = 0; 5220 goto done; 5221 } 5222 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 5223 value = state->id_rc_num_rwqe; 5224 err = 0; 5225 goto done; 5226 } 5227 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 5228 value = state->id_rc_num_srq; 5229 err = 0; 5230 goto done; 5231 } 5232 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 5233 value = state->id_rc_num_swqe; 5234 err = 0; 5235 goto done; 5236 } 5237 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 5238 value = state->id_rc_rx_comp_count; 5239 err = 0; 5240 goto done; 5241 } 5242 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 5243 value = state->id_rc_rx_comp_usec; 5244 err = 0; 5245 
goto done; 5246 } 5247 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 5248 value = state->id_rc_rx_copy_thresh; 5249 err = 0; 5250 goto done; 5251 } 5252 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 5253 value = state->id_rc_rx_rwqe_thresh; 5254 err = 0; 5255 goto done; 5256 } 5257 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 5258 value = state->id_rc_tx_comp_count; 5259 err = 0; 5260 goto done; 5261 } 5262 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5263 value = state->id_rc_tx_comp_usec; 5264 err = 0; 5265 goto done; 5266 } 5267 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5268 value = state->id_rc_tx_copy_thresh; 5269 err = 0; 5270 goto done; 5271 } 5272 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5273 value = state->id_ud_num_rwqe; 5274 err = 0; 5275 goto done; 5276 } 5277 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5278 value = state->id_ud_num_swqe; 5279 err = 0; 5280 goto done; 5281 } 5282 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5283 value = state->id_ud_rx_comp_count; 5284 err = 0; 5285 goto done; 5286 } 5287 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5288 value = state->id_ud_rx_comp_usec; 5289 err = 0; 5290 goto done; 5291 } 5292 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5293 value = state->id_ud_tx_comp_count; 5294 err = 0; 5295 goto done; 5296 } 5297 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5298 value = state->id_ud_tx_comp_usec; 5299 err = 0; 5300 goto done; 5301 } 5302 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5303 value = state->id_ud_tx_copy_thresh; 5304 err = 0; 5305 goto done; 5306 } 5307 done: 5308 if (err == 0) { 5309 (void) snprintf(pr_val, pr_valsize, "%d", value); 5310 } 5311 return (err); 5312 } 5313 5314 static int 5315 ibd_get_port_details(ibd_state_t *state) 5316 { 5317 ibt_hca_portinfo_t *port_infop; 5318 ibt_status_t ret; 5319 uint_t psize, port_infosz; 5320 5321 mutex_enter(&state->id_link_mutex); 5322 5323 /* 5324 * Query for port information 5325 */ 5326 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 5327 &port_infop, &psize, &port_infosz); 5328 if ((ret != IBT_SUCCESS) || (psize != 1)) { 5329 mutex_exit(&state->id_link_mutex); 5330 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 5331 "failed, ret=%d", ret); 5332 return (ENETDOWN); 5333 } 5334 5335 /* 5336 * If the link is active, verify the pkey 5337 */ 5338 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) { 5339 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 5340 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 5341 state->id_link_state = LINK_STATE_DOWN; 5342 } else { 5343 state->id_link_state = LINK_STATE_UP; 5344 } 5345 state->id_mtu = (128 << port_infop->p_mtu); 5346 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5347 state->id_sgid = *port_infop->p_sgid_tbl; 5348 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5349 /* 5350 * Now that the port is active, record the port speed 5351 */ 5352 state->id_link_speed = ibd_get_portspeed(state); 5353 } else { 5354 /* Make sure that these are handled in PORT_UP/CHANGE */ 5355 state->id_mtu = 0; 5356 state->id_link_state = LINK_STATE_DOWN; 5357 state->id_link_speed = 0; 5358 } 5359 mutex_exit(&state->id_link_mutex); 5360 ibt_free_portinfo(port_infop, port_infosz); 5361 5362 return (0); 5363 } 5364 5365 static int 5366 ibd_alloc_cqs(ibd_state_t *state) 5367 { 5368 ibt_hca_attr_t hca_attrs; 5369 ibt_cq_attr_t cq_attr; 5370 ibt_status_t ret; 5371 uint32_t real_size; 5372 uint_t num_rwqe_change = 0; 5373 uint_t 
num_swqe_change = 0; 5374 5375 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 5376 ASSERT(ret == IBT_SUCCESS); 5377 5378 /* 5379 * Allocate Rx/combined CQ: 5380 * Theoretically, there is no point in having more than #rwqe 5381 * plus #swqe cqe's, except that the CQ will be signaled for 5382 * overflow when the last wqe completes, if none of the previous 5383 * cqe's have been polled. Thus, we allocate just a few less wqe's 5384 * to make sure such overflow does not occur. 5385 */ 5386 cq_attr.cq_sched = NULL; 5387 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 5388 5389 /* 5390 * Allocate Receive CQ. 5391 */ 5392 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) { 5393 cq_attr.cq_size = state->id_ud_num_rwqe + 1; 5394 } else { 5395 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5396 num_rwqe_change = state->id_ud_num_rwqe; 5397 state->id_ud_num_rwqe = cq_attr.cq_size - 1; 5398 } 5399 5400 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5401 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 5402 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 5403 "failed, ret=%d\n", ret); 5404 return (DDI_FAILURE); 5405 } 5406 5407 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count, 5408 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) { 5409 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 5410 "moderation failed, ret=%d\n", ret); 5411 } 5412 5413 /* make the #rx wc's the same as max rx chain size */ 5414 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 5415 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 5416 state->id_rxwcs_size, KM_SLEEP); 5417 5418 /* 5419 * Allocate Send CQ. 5420 */ 5421 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) { 5422 cq_attr.cq_size = state->id_ud_num_swqe + 1; 5423 } else { 5424 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5425 num_swqe_change = state->id_ud_num_swqe; 5426 state->id_ud_num_swqe = cq_attr.cq_size - 1; 5427 } 5428 5429 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5430 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 5431 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 5432 "failed, ret=%d\n", ret); 5433 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 5434 state->id_rxwcs_size); 5435 (void) ibt_free_cq(state->id_rcq_hdl); 5436 return (DDI_FAILURE); 5437 } 5438 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count, 5439 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) { 5440 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 5441 "moderation failed, ret=%d\n", ret); 5442 } 5443 5444 state->id_txwcs_size = IBD_TX_POLL_THRESH; 5445 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 5446 state->id_txwcs_size, KM_SLEEP); 5447 5448 /* 5449 * Print message in case we could not allocate as many wqe's 5450 * as was requested. 
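 * For illustration only (the HCA limit here is assumed): if
 * hca_max_cq_sz were 65536 and 100000 rwqe's had been requested, the
 * CQ would be sized at 65536 and id_ud_num_rwqe trimmed to 65535
 * above; the messages below report exactly that adjustment.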
5451 */ 5452 if (num_rwqe_change) { 5453 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 5454 "%d", state->id_ud_num_rwqe, num_rwqe_change); 5455 } 5456 if (num_swqe_change) { 5457 ibd_print_warn(state, "Setting #swqe = %d instead of default " 5458 "%d", state->id_ud_num_swqe, num_swqe_change); 5459 } 5460 5461 return (DDI_SUCCESS); 5462 } 5463 5464 static int 5465 ibd_setup_ud_channel(ibd_state_t *state) 5466 { 5467 ibt_ud_chan_alloc_args_t ud_alloc_attr; 5468 ibt_ud_chan_query_attr_t ud_chan_attr; 5469 ibt_status_t ret; 5470 5471 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 5472 if (state->id_hca_res_lkey_capab) 5473 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 5474 if (state->id_lso_policy && state->id_lso_capable) 5475 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 5476 5477 ud_alloc_attr.ud_hca_port_num = state->id_port; 5478 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 5479 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 5480 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe; 5481 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe; 5482 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 5483 ud_alloc_attr.ud_scq = state->id_scq_hdl; 5484 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 5485 ud_alloc_attr.ud_pd = state->id_pd_hdl; 5486 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 5487 ud_alloc_attr.ud_clone_chan = NULL; 5488 5489 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 5490 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 5491 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 5492 "failed, ret=%d\n", ret); 5493 return (DDI_FAILURE); 5494 } 5495 5496 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 5497 &ud_chan_attr)) != IBT_SUCCESS) { 5498 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 5499 "failed, ret=%d\n", ret); 5500 (void) ibt_free_channel(state->id_chnl_hdl); 5501 return (DDI_FAILURE); 5502 } 5503 5504 state->id_qpnum = ud_chan_attr.ud_qpn; 5505 5506 return (DDI_SUCCESS); 5507 } 5508 5509 static int 5510 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 5511 { 5512 uint32_t progress = state->id_mac_state; 5513 uint_t attempts; 5514 ibt_status_t ret; 5515 ib_gid_t mgid; 5516 ibd_mce_t *mce; 5517 uint8_t jstate; 5518 timeout_id_t tid; 5519 5520 if (atomic_dec_32_nv(&state->id_running) != 0) 5521 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 5522 5523 /* 5524 * Before we try to stop/undo whatever we did in ibd_start(), 5525 * we need to mark the link state appropriately to prevent the 5526 * ip layer from using this instance for any new transfers. Note 5527 * that if the original state of the link was "up" when we're 5528 * here, we'll set the final link state to "unknown", to behave 5529 * in the same fashion as other ethernet drivers. 
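 * The rest of this routine walks the IBD_DRV_* progress bits captured
 * in "progress" and undoes each stage that ibd_start() had completed,
 * clearing the corresponding bit from id_mac_state as it goes.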
5530 */ 5531 mutex_enter(&state->id_link_mutex); 5532 if (cur_link_state == LINK_STATE_DOWN) { 5533 state->id_link_state = cur_link_state; 5534 } else { 5535 state->id_link_state = LINK_STATE_UNKNOWN; 5536 } 5537 mutex_exit(&state->id_link_mutex); 5538 bzero(&state->id_macaddr, sizeof (ipoib_mac_t)); 5539 mac_link_update(state->id_mh, state->id_link_state); 5540 5541 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 5542 if (progress & IBD_DRV_STARTED) { 5543 state->id_mac_state &= (~IBD_DRV_STARTED); 5544 } 5545 5546 if (progress & IBD_DRV_IN_LATE_HCA_INIT) { 5547 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT); 5548 } 5549 5550 /* Stop listen under Reliable Connected Mode */ 5551 if (progress & IBD_DRV_RC_LISTEN) { 5552 ASSERT(state->id_enable_rc); 5553 if (state->rc_listen_hdl != NULL) { 5554 ibd_rc_stop_listen(state); 5555 } 5556 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 5557 } 5558 5559 /* Stop timeout routine */ 5560 if (progress & IBD_DRV_RC_TIMEOUT) { 5561 ASSERT(state->id_enable_rc); 5562 mutex_enter(&state->rc_timeout_lock); 5563 state->rc_timeout_start = B_FALSE; 5564 tid = state->rc_timeout; 5565 state->rc_timeout = 0; 5566 mutex_exit(&state->rc_timeout_lock); 5567 if (tid != 0) 5568 (void) untimeout(tid); 5569 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT); 5570 } 5571 5572 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 5573 attempts = 100; 5574 while (state->id_ah_op == IBD_OP_ONGOING) { 5575 /* 5576 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB 5577 * port is connecting to a remote IPoIB port. Wait for 5578 * the end of this connecting operation. 5579 */ 5580 delay(drv_usectohz(100000)); 5581 if (--attempts == 0) { 5582 state->rc_stop_connect++; 5583 DPRINT(40, "ibd_undo_start: connecting"); 5584 break; 5585 } 5586 } 5587 mutex_enter(&state->id_sched_lock); 5588 state->id_sched_needed = 0; 5589 mutex_exit(&state->id_sched_lock); 5590 (void) ibd_rc_close_all_chan(state); 5591 } 5592 5593 /* 5594 * First, stop receive interrupts; this stops the driver from 5595 * handing up buffers to higher layers. Wait for receive buffers 5596 * to be returned and give up after 1 second. 5597 */ 5598 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 5599 attempts = 10; 5600 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 5601 0) > 0) { 5602 delay(drv_usectohz(100000)); 5603 if (--attempts == 0) { 5604 /* 5605 * There are pending bufs with the network 5606 * layer and we have no choice but to wait 5607 * for them to be done with. Reap all the 5608 * Tx/Rx completions that were posted since 5609 * we turned off the notification and 5610 * return failure. 5611 */ 5612 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 5613 DPRINT(2, "ibd_undo_start: " 5614 "reclaiming failed"); 5615 break; 5616 } 5617 } 5618 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 5619 } 5620 5621 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 5622 ibd_rc_fini_tx_largebuf_list(state); 5623 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 5624 } 5625 5626 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 5627 ASSERT(state->id_enable_rc); 5628 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 5629 if (state->id_ah_op == IBD_OP_ONGOING) { 5630 delay(drv_usectohz(10000)); 5631 if (state->id_ah_op == IBD_OP_ONGOING) { 5632 /* 5633 * "state->id_ah_op == IBD_OP_ONGOING" 5634 * means this IPoIB port is connecting 5635 * to a remote IPoIB port. We can't 5636 * delete SRQ here. 
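 * IBD_DRV_RC_SRQ_ALLOCD is left set in that case so
 * that a later teardown attempt can retry the free.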
5637 */ 5638 state->rc_stop_connect++; 5639 DPRINT(40, "ibd_undo_start: " 5640 "connecting"); 5641 } else { 5642 ibd_rc_fini_srq_list(state); 5643 state->id_mac_state &= 5644 (~IBD_DRV_RC_SRQ_ALLOCD); 5645 } 5646 } else { 5647 ibd_rc_fini_srq_list(state); 5648 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 5649 } 5650 } else { 5651 DPRINT(40, "ibd_undo_start: srq bufs outstanding\n"); 5652 } 5653 } 5654 5655 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 5656 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 5657 5658 mutex_enter(&state->id_trap_lock); 5659 state->id_trap_stop = B_TRUE; 5660 while (state->id_trap_inprog > 0) 5661 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 5662 mutex_exit(&state->id_trap_lock); 5663 5664 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 5665 } 5666 5667 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 5668 /* 5669 * Flushing the channel ensures that all pending WQE's 5670 * are marked with flush_error and handed to the CQ. It 5671 * does not guarantee the invocation of the CQ handler. 5672 * This call is guaranteed to return successfully for 5673 * UD QPNs. 5674 */ 5675 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 5676 IBT_SUCCESS) { 5677 DPRINT(10, "ibd_undo_start: flush_channel " 5678 "failed, ret=%d", ret); 5679 } 5680 5681 /* 5682 * Give some time for the TX CQ handler to process the 5683 * completions. 5684 */ 5685 attempts = 10; 5686 mutex_enter(&state->id_tx_list.dl_mutex); 5687 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5688 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 5689 != state->id_ud_num_swqe) { 5690 if (--attempts == 0) 5691 break; 5692 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5693 mutex_exit(&state->id_tx_list.dl_mutex); 5694 delay(drv_usectohz(100000)); 5695 mutex_enter(&state->id_tx_list.dl_mutex); 5696 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5697 } 5698 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5699 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 5700 state->id_ud_num_swqe) { 5701 cmn_err(CE_WARN, "tx resources not freed\n"); 5702 } 5703 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5704 mutex_exit(&state->id_tx_list.dl_mutex); 5705 5706 attempts = 10; 5707 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5708 if (--attempts == 0) 5709 break; 5710 delay(drv_usectohz(100000)); 5711 } 5712 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 5713 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5714 cmn_err(CE_WARN, "rx resources not freed\n"); 5715 } 5716 5717 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 5718 } 5719 5720 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 5721 /* 5722 * Drop all residual full/non membership. This includes full 5723 * membership to the broadcast group, and any nonmembership 5724 * acquired during transmits. We do this after the Tx completion 5725 * handlers are done, since those might result in some late 5726 * leaves; this also eliminates a potential race with that 5727 * path wrt the mc full list insert/delete. Trap handling 5728 * has also been suppressed at this point. Thus, no locks 5729 * are required while traversing the mc full list. 
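 * Each step of the walk below caches the next element before calling
 * ibd_leave_group(), since the leave may unlink and free the current
 * entry.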
5730 */ 5731 DPRINT(2, "ibd_undo_start: clear full cache entries"); 5732 mce = list_head(&state->id_mc_full); 5733 while (mce != NULL) { 5734 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5735 jstate = mce->mc_jstate; 5736 mce = list_next(&state->id_mc_full, mce); 5737 ibd_leave_group(state, mgid, jstate); 5738 } 5739 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 5740 } 5741 5742 if (progress & IBD_DRV_RXLIST_ALLOCD) { 5743 ibd_fini_rxlist(state); 5744 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 5745 } 5746 5747 if (progress & IBD_DRV_TXLIST_ALLOCD) { 5748 ibd_fini_txlist(state); 5749 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 5750 } 5751 5752 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 5753 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 5754 IBT_SUCCESS) { 5755 DPRINT(10, "ibd_undo_start: free_channel " 5756 "failed, ret=%d", ret); 5757 } 5758 5759 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 5760 } 5761 5762 if (progress & IBD_DRV_CQS_ALLOCD) { 5763 kmem_free(state->id_txwcs, 5764 sizeof (ibt_wc_t) * state->id_txwcs_size); 5765 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 5766 IBT_SUCCESS) { 5767 DPRINT(10, "ibd_undo_start: free_cq(scq) " 5768 "failed, ret=%d", ret); 5769 } 5770 5771 kmem_free(state->id_rxwcs, 5772 sizeof (ibt_wc_t) * state->id_rxwcs_size); 5773 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 5774 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 5775 "ret=%d", ret); 5776 } 5777 5778 state->id_txwcs = NULL; 5779 state->id_rxwcs = NULL; 5780 state->id_scq_hdl = NULL; 5781 state->id_rcq_hdl = NULL; 5782 5783 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 5784 } 5785 5786 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 5787 mutex_enter(&state->id_ac_mutex); 5788 mod_hash_destroy_hash(state->id_ah_active_hash); 5789 mutex_exit(&state->id_ac_mutex); 5790 ibd_acache_fini(state); 5791 5792 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 5793 } 5794 5795 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 5796 /* 5797 * If we'd created the ipoib broadcast group and had 5798 * successfully joined it, leave it now 5799 */ 5800 if (state->id_bgroup_created) { 5801 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 5802 jstate = IB_MC_JSTATE_FULL; 5803 (void) ibt_leave_mcg(state->id_sgid, mgid, 5804 state->id_sgid, jstate); 5805 } 5806 ibt_free_mcg_info(state->id_mcinfo, 1); 5807 5808 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 5809 } 5810 5811 return (DDI_SUCCESS); 5812 } 5813 5814 /* 5815 * These pair of routines are used to set/clear the condition that 5816 * the caller is likely to do something to change the id_mac_state. 5817 * If there's already someone doing either a start or a stop (possibly 5818 * due to the async handler detecting a pkey relocation event, a plumb 5819 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 5820 * that's done. 5821 */ 5822 static void 5823 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 5824 { 5825 mutex_enter(&state->id_macst_lock); 5826 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 5827 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 5828 5829 state->id_mac_state |= flag; 5830 mutex_exit(&state->id_macst_lock); 5831 } 5832 5833 static void 5834 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 5835 { 5836 mutex_enter(&state->id_macst_lock); 5837 state->id_mac_state &= (~flag); 5838 cv_signal(&state->id_macst_cv); 5839 mutex_exit(&state->id_macst_lock); 5840 } 5841 5842 /* 5843 * GLDv3 entry point to start hardware. 
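 * Serialized against a concurrent stop or restart through the
 * ibd_set_mac_progress()/ibd_clr_mac_progress() pair above.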
5844 */ 5845 /*ARGSUSED*/ 5846 static int 5847 ibd_m_start(void *arg) 5848 { 5849 ibd_state_t *state = arg; 5850 int ret; 5851 5852 if (state->id_type == IBD_PORT_DRIVER) 5853 return (EINVAL); 5854 5855 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5856 if (state->id_mac_state & IBD_DRV_IN_DELETION) { 5857 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5858 return (EIO); 5859 } 5860 5861 ret = ibd_start(state); 5862 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5863 return (ret); 5864 } 5865 5866 static int 5867 ibd_start(ibd_state_t *state) 5868 { 5869 int err; 5870 ibt_status_t ret; 5871 int late_hca_init = 0; 5872 5873 if (state->id_mac_state & IBD_DRV_STARTED) 5874 return (DDI_SUCCESS); 5875 5876 /* 5877 * We do not increment the running flag when calling ibd_start() as 5878 * a result of some event which moves the state away from late HCA 5879 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability. 5880 */ 5881 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 5882 (atomic_inc_32_nv(&state->id_running) != 1)) { 5883 DPRINT(10, "ibd_start: id_running is non-zero"); 5884 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 5885 atomic_dec_32(&state->id_running); 5886 return (EINVAL); 5887 } 5888 5889 /* 5890 * Get port details; if we fail here, something bad happened. 5891 * Fail plumb. 5892 */ 5893 if ((err = ibd_get_port_details(state)) != 0) { 5894 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 5895 goto start_fail; 5896 } 5897 /* 5898 * If state->id_link_state is DOWN, it indicates that either the port 5899 * is down, or the pkey is not available. In both cases, resort to late 5900 * initialization. Register for subnet notices, and return success. 5901 */ 5902 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 5903 if (state->id_link_state == LINK_STATE_DOWN) { 5904 late_hca_init = 1; 5905 goto late_hca_init_return; 5906 } 5907 5908 /* 5909 * Find the IPoIB broadcast group 5910 */ 5911 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 5912 /* Resort to late initialization */ 5913 late_hca_init = 1; 5914 goto reg_snet_notices; 5915 } 5916 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 5917 5918 /* 5919 * Initialize per-interface caches and lists; if we fail here, 5920 * it is most likely due to a lack of resources 5921 */ 5922 if (ibd_acache_init(state) != DDI_SUCCESS) { 5923 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 5924 err = ENOMEM; 5925 goto start_fail; 5926 } 5927 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 5928 5929 /* 5930 * Allocate send and receive completion queues 5931 */ 5932 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 5933 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 5934 err = ENOMEM; 5935 goto start_fail; 5936 } 5937 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 5938 5939 /* 5940 * Setup a UD channel 5941 */ 5942 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 5943 err = ENOMEM; 5944 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 5945 goto start_fail; 5946 } 5947 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 5948 5949 /* 5950 * Allocate and initialize the tx buffer list 5951 */ 5952 if (ibd_init_txlist(state) != DDI_SUCCESS) { 5953 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 5954 err = ENOMEM; 5955 goto start_fail; 5956 } 5957 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 5958 5959 /* 5960 * Create the send cq handler here 5961 */ 5962 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 5963 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 5964 
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5965 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 5966 "failed, ret=%d", ret); 5967 err = EINVAL; 5968 goto start_fail; 5969 } 5970 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 5971 5972 /* 5973 * Allocate and initialize the rx buffer list 5974 */ 5975 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 5976 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 5977 err = ENOMEM; 5978 goto start_fail; 5979 } 5980 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 5981 5982 /* 5983 * Join IPoIB broadcast group 5984 */ 5985 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 5986 DPRINT(10, "ibd_start: ibd_join_group() failed"); 5987 err = ENOTACTIVE; 5988 goto start_fail; 5989 } 5990 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 5991 5992 /* 5993 * When we did mac_register() in ibd_attach(), we didn't register 5994 * the real macaddr and we didn't have the true port mtu. Now that 5995 * we're almost ready, set the local mac address and broadcast 5996 * addresses and update gldv3 about the real values of these 5997 * parameters. 5998 */ 5999 if (state->id_enable_rc) { 6000 ibd_h2n_mac(&state->id_macaddr, 6001 IBD_MAC_ADDR_RC + state->id_qpnum, 6002 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 6003 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, 6004 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 6005 } else { 6006 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 6007 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 6008 } 6009 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 6010 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 6011 6012 if (!state->id_enable_rc) { 6013 (void) mac_maxsdu_update2(state->id_mh, 6014 state->id_mtu - IPOIB_HDRSIZE, 6015 state->id_mtu - IPOIB_HDRSIZE); 6016 } 6017 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 6018 6019 /* 6020 * Setup the receive cq handler 6021 */ 6022 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 6023 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 6024 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 6025 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 6026 "failed, ret=%d", ret); 6027 err = EINVAL; 6028 goto start_fail; 6029 } 6030 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 6031 6032 reg_snet_notices: 6033 /* 6034 * In case of normal initialization sequence, 6035 * Setup the subnet notices handler after we've initialized the acache/ 6036 * mcache and started the async thread, both of which are required for 6037 * the trap handler to function properly. 6038 * 6039 * Now that the async thread has been started (and we've already done 6040 * a mac_register() during attach so mac_tx_update() can be called 6041 * if necessary without any problem), we can enable the trap handler 6042 * to queue requests to the async thread. 6043 * 6044 * In case of late hca initialization, the subnet notices handler will 6045 * only handle MCG created/deleted event. The action performed as part 6046 * of handling these events is to start the interface. So, the 6047 * acache/mcache initialization is not a necessity in such cases for 6048 * registering the subnet notices handler. Also, if we are in 6049 * ibd_start() as a result of, say, some event handling after entering 6050 * late hca initialization phase no need to register again. 
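 * The IBD_DRV_SM_NOTICES_REGISTERED check below is what prevents
 * such a double registration.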
6051 */ 6052 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) { 6053 ibt_register_subnet_notices(state->id_ibt_hdl, 6054 ibd_snet_notices_handler, state); 6055 mutex_enter(&state->id_trap_lock); 6056 state->id_trap_stop = B_FALSE; 6057 mutex_exit(&state->id_trap_lock); 6058 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 6059 } 6060 6061 late_hca_init_return: 6062 if (late_hca_init == 1) { 6063 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT; 6064 /* 6065 * In case of late initialization, mark the link state as down, 6066 * immaterial of the actual link state as reported in the 6067 * port_info. 6068 */ 6069 state->id_link_state = LINK_STATE_DOWN; 6070 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 6071 mac_link_update(state->id_mh, state->id_link_state); 6072 return (DDI_SUCCESS); 6073 } 6074 6075 if (state->id_enable_rc) { 6076 if (state->rc_enable_srq) { 6077 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 6078 if (ibd_rc_repost_srq_free_list(state) != 6079 IBT_SUCCESS) { 6080 err = ENOMEM; 6081 goto start_fail; 6082 } 6083 } else { 6084 /* Allocate SRQ resource */ 6085 if (ibd_rc_init_srq_list(state) != 6086 IBT_SUCCESS) { 6087 err = ENOMEM; 6088 goto start_fail; 6089 } 6090 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 6091 } 6092 } 6093 6094 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 6095 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 6096 "failed"); 6097 err = ENOMEM; 6098 goto start_fail; 6099 } 6100 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 6101 6102 /* RC: begin to listen only after everything is available */ 6103 if (ibd_rc_listen(state) != IBT_SUCCESS) { 6104 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 6105 err = EINVAL; 6106 goto start_fail; 6107 } 6108 state->id_mac_state |= IBD_DRV_RC_LISTEN; 6109 } 6110 6111 /* 6112 * Indicate link status to GLDv3 and higher layers. By default, 6113 * we assume we are in up state (which must have been true at 6114 * least at the time the broadcast mcg's were probed); if there 6115 * were any up/down transitions till the time we come here, the 6116 * async handler will have updated last known state, which we 6117 * use to tell GLDv3. The async handler will not send any 6118 * notifications to GLDv3 till we reach here in the initialization 6119 * sequence. 6120 */ 6121 mac_link_update(state->id_mh, state->id_link_state); 6122 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT; 6123 state->id_mac_state |= IBD_DRV_STARTED; 6124 6125 /* Start timer after everything is ready */ 6126 if (state->id_enable_rc) { 6127 mutex_enter(&state->rc_timeout_lock); 6128 state->rc_timeout_start = B_TRUE; 6129 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state, 6130 SEC_TO_TICK(ibd_rc_conn_timeout)); 6131 mutex_exit(&state->rc_timeout_lock); 6132 state->id_mac_state |= IBD_DRV_RC_TIMEOUT; 6133 } 6134 6135 return (DDI_SUCCESS); 6136 6137 start_fail: 6138 /* 6139 * If we ran into a problem during ibd_start() and ran into 6140 * some other problem during undoing our partial work, we can't 6141 * do anything about it. Ignore any errors we might get from 6142 * ibd_undo_start() and just return the original error we got. 6143 */ 6144 (void) ibd_undo_start(state, LINK_STATE_DOWN); 6145 return (err); 6146 } 6147 6148 /* 6149 * GLDv3 entry point to stop hardware from receiving packets. 
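 * This undoes everything ibd_start() set up; the current link state is
 * passed to ibd_undo_start() so that a link that was already down keeps
 * being reported as down rather than unknown.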
6150 */ 6151 /*ARGSUSED*/ 6152 static void 6153 ibd_m_stop(void *arg) 6154 { 6155 ibd_state_t *state = (ibd_state_t *)arg; 6156 6157 if (state->id_type == IBD_PORT_DRIVER) 6158 return; 6159 6160 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 6161 6162 (void) ibd_undo_start(state, state->id_link_state); 6163 6164 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 6165 } 6166 6167 /* 6168 * GLDv3 entry point to modify device's mac address. We do not 6169 * allow address modifications. 6170 */ 6171 static int 6172 ibd_m_unicst(void *arg, const uint8_t *macaddr) 6173 { 6174 ibd_state_t *state = arg; 6175 6176 if (state->id_type == IBD_PORT_DRIVER) 6177 return (EINVAL); 6178 6179 /* 6180 * Don't bother even comparing the macaddr if we haven't 6181 * completed ibd_m_start(). 6182 */ 6183 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6184 return (0); 6185 6186 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 6187 return (0); 6188 else 6189 return (EINVAL); 6190 } 6191 6192 /* 6193 * The blocking part of the IBA join/leave operations are done out 6194 * of here on the async thread. 6195 */ 6196 static void 6197 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 6198 { 6199 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 6200 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 6201 6202 if (op == IBD_ASYNC_JOIN) { 6203 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 6204 ibd_print_warn(state, "Join multicast group failed :" 6205 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 6206 } 6207 } else { 6208 /* 6209 * Here, we must search for the proper mcg_info and 6210 * use that to leave the group. 6211 */ 6212 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 6213 } 6214 } 6215 6216 /* 6217 * GLDv3 entry point for multicast enable/disable requests. 6218 * This function queues the operation to the async thread and 6219 * return success for a valid multicast address. 6220 */ 6221 static int 6222 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 6223 { 6224 ibd_state_t *state = (ibd_state_t *)arg; 6225 ipoib_mac_t maddr, *mcast; 6226 ib_gid_t mgid; 6227 ibd_req_t *req; 6228 6229 if (state->id_type == IBD_PORT_DRIVER) 6230 return (EINVAL); 6231 6232 /* 6233 * If we haven't completed ibd_m_start(), async thread wouldn't 6234 * have been started and id_bcaddr wouldn't be set, so there's 6235 * no point in continuing. 6236 */ 6237 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6238 return (0); 6239 6240 /* 6241 * The incoming multicast address might not be aligned properly 6242 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 6243 * it to look like one though, to get the offsets of the mc gid, 6244 * since we know we are not going to dereference any values with 6245 * the ipoib_mac_t pointer. 6246 */ 6247 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 6248 mcast = &maddr; 6249 6250 /* 6251 * Check validity of MCG address. We could additionally check 6252 * that a enable/disable is not being issued on the "broadcast" 6253 * mcg, but since this operation is only invokable by privileged 6254 * programs anyway, we allow the flexibility to those dlpi apps. 6255 * Note that we do not validate the "scope" of the IBA mcg. 
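 * The check below only verifies that the QPN field carries the IBA
 * multicast QPN value (IB_MC_QPN); anything else is rejected with
 * EINVAL.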
6256 */ 6257 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 6258 return (EINVAL); 6259 6260 /* 6261 * fill in multicast pkey and scope 6262 */ 6263 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 6264 6265 /* 6266 * If someone is trying to JOIN/LEAVE the broadcast group, we do 6267 * nothing (i.e. we stay JOINed to the broadcast group done in 6268 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 6269 * requires to be joined to broadcast groups at all times. 6270 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 6271 * depends on this. 6272 */ 6273 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6274 return (0); 6275 6276 ibd_n2h_gid(mcast, &mgid); 6277 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6278 if (req == NULL) 6279 return (ENOMEM); 6280 6281 req->rq_gid = mgid; 6282 6283 if (add) { 6284 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 6285 mgid.gid_prefix, mgid.gid_guid); 6286 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 6287 } else { 6288 DPRINT(1, "ibd_m_multicst : unset_multicast : " 6289 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 6290 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 6291 } 6292 return (0); 6293 } 6294 6295 /* 6296 * The blocking part of the IBA promiscuous operations are done 6297 * out of here on the async thread. The dlpireq parameter indicates 6298 * whether this invocation is due to a dlpi request or due to 6299 * a port up/down event. 6300 */ 6301 static void 6302 ibd_async_unsetprom(ibd_state_t *state) 6303 { 6304 ibd_mce_t *mce = list_head(&state->id_mc_non); 6305 ib_gid_t mgid; 6306 6307 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 6308 6309 while (mce != NULL) { 6310 mgid = mce->mc_info.mc_adds_vect.av_dgid; 6311 mce = list_next(&state->id_mc_non, mce); 6312 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 6313 } 6314 state->id_prom_op = IBD_OP_NOTSTARTED; 6315 } 6316 6317 /* 6318 * The blocking part of the IBA promiscuous operations are done 6319 * out of here on the async thread. The dlpireq parameter indicates 6320 * whether this invocation is due to a dlpi request or due to 6321 * a port up/down event. 6322 */ 6323 static void 6324 ibd_async_setprom(ibd_state_t *state) 6325 { 6326 ibt_mcg_attr_t mcg_attr; 6327 ibt_mcg_info_t *mcg_info; 6328 ib_gid_t mgid; 6329 uint_t numg; 6330 int i; 6331 char ret = IBD_OP_COMPLETED; 6332 6333 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 6334 6335 /* 6336 * Obtain all active MC groups on the IB fabric with 6337 * specified criteria (scope + Pkey + Qkey + mtu). 6338 */ 6339 bzero(&mcg_attr, sizeof (mcg_attr)); 6340 mcg_attr.mc_pkey = state->id_pkey; 6341 mcg_attr.mc_scope = state->id_scope; 6342 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 6343 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 6344 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 6345 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 6346 IBT_SUCCESS) { 6347 ibd_print_warn(state, "Could not get list of IBA multicast " 6348 "groups"); 6349 ret = IBD_OP_ERRORED; 6350 goto done; 6351 } 6352 6353 /* 6354 * Iterate over the returned mcg's and join as NonMember 6355 * to the IP mcg's. 6356 */ 6357 for (i = 0; i < numg; i++) { 6358 /* 6359 * Do a NonMember JOIN on the MC group. 
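 * A non-member join is enough to have the group's traffic delivered
 * for promiscuous mode without taking full membership in every group.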
6360 */ 6361 mgid = mcg_info[i].mc_adds_vect.av_dgid; 6362 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 6363 ibd_print_warn(state, "IBA promiscuous mode missed " 6364 "multicast gid %016llx:%016llx", 6365 (u_longlong_t)mgid.gid_prefix, 6366 (u_longlong_t)mgid.gid_guid); 6367 } 6368 6369 ibt_free_mcg_info(mcg_info, numg); 6370 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 6371 done: 6372 state->id_prom_op = ret; 6373 } 6374 6375 /* 6376 * GLDv3 entry point for multicast promiscuous enable/disable requests. 6377 * GLDv3 assumes phys state receives more packets than multi state, 6378 * which is not true for IPoIB. Thus, treat the multi and phys 6379 * promiscuous states the same way to work with GLDv3's assumption. 6380 */ 6381 static int 6382 ibd_m_promisc(void *arg, boolean_t on) 6383 { 6384 ibd_state_t *state = (ibd_state_t *)arg; 6385 ibd_req_t *req; 6386 6387 if (state->id_type == IBD_PORT_DRIVER) 6388 return (EINVAL); 6389 6390 /* 6391 * Async thread wouldn't have been started if we haven't 6392 * passed ibd_m_start() 6393 */ 6394 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6395 return (0); 6396 6397 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6398 if (req == NULL) 6399 return (ENOMEM); 6400 if (on) { 6401 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 6402 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 6403 } else { 6404 DPRINT(1, "ibd_m_promisc : unset_promisc"); 6405 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 6406 } 6407 6408 return (0); 6409 } 6410 6411 /* 6412 * GLDv3 entry point for gathering statistics. 6413 */ 6414 static int 6415 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 6416 { 6417 ibd_state_t *state = (ibd_state_t *)arg; 6418 6419 switch (stat) { 6420 case MAC_STAT_IFSPEED: 6421 *val = state->id_link_speed; 6422 break; 6423 case MAC_STAT_MULTIRCV: 6424 *val = state->id_multi_rcv; 6425 break; 6426 case MAC_STAT_BRDCSTRCV: 6427 *val = state->id_brd_rcv; 6428 break; 6429 case MAC_STAT_MULTIXMT: 6430 *val = state->id_multi_xmt; 6431 break; 6432 case MAC_STAT_BRDCSTXMT: 6433 *val = state->id_brd_xmt; 6434 break; 6435 case MAC_STAT_RBYTES: 6436 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 6437 + state->rc_rcv_copy_byte; 6438 break; 6439 case MAC_STAT_IPACKETS: 6440 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 6441 + state->rc_rcv_copy_pkt; 6442 break; 6443 case MAC_STAT_OBYTES: 6444 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 6445 break; 6446 case MAC_STAT_OPACKETS: 6447 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 6448 state->rc_xmt_fragmented_pkt + 6449 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 6450 break; 6451 case MAC_STAT_OERRORS: 6452 *val = state->id_ah_error; /* failed AH translation */ 6453 break; 6454 case MAC_STAT_IERRORS: 6455 *val = 0; 6456 break; 6457 case MAC_STAT_NOXMTBUF: 6458 *val = state->id_tx_short + state->rc_swqe_short + 6459 state->rc_xmt_buf_short; 6460 break; 6461 case MAC_STAT_NORCVBUF: 6462 default: 6463 return (ENOTSUP); 6464 } 6465 6466 return (0); 6467 } 6468 6469 static void 6470 ibd_async_txsched(ibd_state_t *state) 6471 { 6472 ibd_resume_transmission(state); 6473 } 6474 6475 static void 6476 ibd_resume_transmission(ibd_state_t *state) 6477 { 6478 int flag; 6479 int met_thresh = 0; 6480 int thresh = 0; 6481 int ret = -1; 6482 6483 mutex_enter(&state->id_sched_lock); 6484 if (state->id_sched_needed & IBD_RSRC_SWQE) { 6485 mutex_enter(&state->id_tx_list.dl_mutex); 6486 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6487 met_thresh = 
state->id_tx_list.dl_cnt + 6488 state->id_tx_rel_list.dl_cnt; 6489 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6490 mutex_exit(&state->id_tx_list.dl_mutex); 6491 thresh = IBD_FREE_SWQES_THRESH; 6492 flag = IBD_RSRC_SWQE; 6493 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 6494 ASSERT(state->id_lso != NULL); 6495 mutex_enter(&state->id_lso_lock); 6496 met_thresh = state->id_lso->bkt_nfree; 6497 thresh = IBD_FREE_LSOS_THRESH; 6498 mutex_exit(&state->id_lso_lock); 6499 flag = IBD_RSRC_LSOBUF; 6500 if (met_thresh > thresh) 6501 state->id_sched_lso_cnt++; 6502 } 6503 if (met_thresh > thresh) { 6504 state->id_sched_needed &= ~flag; 6505 state->id_sched_cnt++; 6506 ret = 0; 6507 } 6508 mutex_exit(&state->id_sched_lock); 6509 6510 if (ret == 0) 6511 mac_tx_update(state->id_mh); 6512 } 6513 6514 /* 6515 * Release the send wqe back into free list. 6516 */ 6517 static void 6518 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 6519 { 6520 /* 6521 * Add back on Tx list for reuse. 6522 */ 6523 ASSERT(tail->swqe_next == NULL); 6524 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6525 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 6526 tail->swqe_next = state->id_tx_rel_list.dl_head; 6527 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 6528 state->id_tx_rel_list.dl_cnt += n; 6529 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6530 } 6531 6532 /* 6533 * Acquire a send wqe from free list. 6534 * Returns error number and send wqe pointer. 6535 */ 6536 static ibd_swqe_t * 6537 ibd_acquire_swqe(ibd_state_t *state) 6538 { 6539 ibd_swqe_t *wqe; 6540 6541 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6542 if (state->id_tx_rel_list.dl_head != NULL) { 6543 /* transfer id_tx_rel_list to id_tx_list */ 6544 state->id_tx_list.dl_head = 6545 state->id_tx_rel_list.dl_head; 6546 state->id_tx_list.dl_cnt = 6547 state->id_tx_rel_list.dl_cnt; 6548 state->id_tx_list.dl_pending_sends = B_FALSE; 6549 6550 /* clear id_tx_rel_list */ 6551 state->id_tx_rel_list.dl_head = NULL; 6552 state->id_tx_rel_list.dl_cnt = 0; 6553 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6554 6555 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 6556 state->id_tx_list.dl_cnt -= 1; 6557 state->id_tx_list.dl_head = wqe->swqe_next; 6558 } else { /* no free swqe */ 6559 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6560 state->id_tx_list.dl_pending_sends = B_TRUE; 6561 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 6562 state->id_tx_short++; 6563 wqe = NULL; 6564 } 6565 return (wqe); 6566 } 6567 6568 static int 6569 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 6570 ibt_ud_dest_hdl_t ud_dest) 6571 { 6572 mblk_t *nmp; 6573 int iph_len, tcph_len; 6574 ibt_wr_lso_t *lso; 6575 uintptr_t ip_start, tcp_start; 6576 uint8_t *dst; 6577 uint_t pending, mblen; 6578 6579 /* 6580 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 6581 * we need to adjust it here for lso. 6582 */ 6583 lso = &(node->w_swr.wr.ud_lso); 6584 lso->lso_ud_dest = ud_dest; 6585 lso->lso_mss = mss; 6586 6587 /* 6588 * Calculate the LSO header size and set it in the UD LSO structure. 6589 * Note that the only assumption we make is that each of the IPoIB, 6590 * IP and TCP headers will be contained in a single mblk fragment; 6591 * together, the headers may span multiple mblk fragments. 
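 * For example, a packet carrying a 20 byte IPv4 header and a 20 byte
 * TCP header (no options) ends up with an lso_hdr_sz of
 * IPOIB_HDRSIZE + 40 below.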
6592 */ 6593 nmp = mp; 6594 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 6595 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 6596 ip_start = (uintptr_t)nmp->b_cont->b_rptr 6597 + (ip_start - (uintptr_t)(nmp->b_wptr)); 6598 nmp = nmp->b_cont; 6599 6600 } 6601 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 6602 6603 tcp_start = ip_start + iph_len; 6604 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 6605 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 6606 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 6607 nmp = nmp->b_cont; 6608 } 6609 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 6610 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 6611 6612 /* 6613 * If the lso header fits entirely within a single mblk fragment, 6614 * we'll avoid an additional copy of the lso header here and just 6615 * pass the b_rptr of the mblk directly. 6616 * 6617 * If this isn't true, we'd have to allocate for it explicitly. 6618 */ 6619 if (lso->lso_hdr_sz <= MBLKL(mp)) { 6620 lso->lso_hdr = mp->b_rptr; 6621 } else { 6622 /* On work completion, remember to free this allocated hdr */ 6623 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 6624 if (lso->lso_hdr == NULL) { 6625 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 6626 "sz = %d", lso->lso_hdr_sz); 6627 lso->lso_hdr_sz = 0; 6628 lso->lso_mss = 0; 6629 return (-1); 6630 } 6631 } 6632 6633 /* 6634 * Copy in the lso header only if we need to 6635 */ 6636 if (lso->lso_hdr != mp->b_rptr) { 6637 dst = lso->lso_hdr; 6638 pending = lso->lso_hdr_sz; 6639 6640 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 6641 mblen = MBLKL(nmp); 6642 if (pending > mblen) { 6643 bcopy(nmp->b_rptr, dst, mblen); 6644 dst += mblen; 6645 pending -= mblen; 6646 } else { 6647 bcopy(nmp->b_rptr, dst, pending); 6648 break; 6649 } 6650 } 6651 } 6652 6653 return (0); 6654 } 6655 6656 static void 6657 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 6658 { 6659 ibt_wr_lso_t *lso; 6660 6661 if ((!node) || (!mp)) 6662 return; 6663 6664 /* 6665 * Free any header space that we might've allocated if we 6666 * did an LSO 6667 */ 6668 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 6669 lso = &(node->w_swr.wr.ud_lso); 6670 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 6671 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 6672 lso->lso_hdr = NULL; 6673 lso->lso_hdr_sz = 0; 6674 } 6675 } 6676 } 6677 6678 static void 6679 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 6680 { 6681 uint_t i; 6682 uint_t num_posted; 6683 uint_t n_wrs; 6684 ibt_status_t ibt_status; 6685 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 6686 ibd_swqe_t *tx_head, *elem; 6687 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 6688 6689 /* post the one request, then check for more */ 6690 ibt_status = ibt_post_send(state->id_chnl_hdl, 6691 &node->w_swr, 1, NULL); 6692 if (ibt_status != IBT_SUCCESS) { 6693 ibd_print_warn(state, "ibd_post_send: " 6694 "posting one wr failed: ret=%d", ibt_status); 6695 ibd_tx_cleanup(state, node); 6696 } 6697 6698 tx_head = NULL; 6699 for (;;) { 6700 if (tx_head == NULL) { 6701 mutex_enter(&state->id_txpost_lock); 6702 tx_head = state->id_tx_head; 6703 if (tx_head == NULL) { 6704 state->id_tx_busy = 0; 6705 mutex_exit(&state->id_txpost_lock); 6706 return; 6707 } 6708 state->id_tx_head = NULL; 6709 mutex_exit(&state->id_txpost_lock); 6710 } 6711 6712 /* 6713 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 6714 * at a time if possible, and keep posting them. 
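 * The nodes[] and wrs[] arrays below are refilled from the chained
 * swqes so that a single ibt_post_send() call can cover a batch of up
 * to IBD_MAX_TX_POST_MULTIPLE requests.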
6715 */ 6716 for (n_wrs = 0, elem = tx_head; 6717 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 6718 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 6719 nodes[n_wrs] = elem; 6720 wrs[n_wrs] = elem->w_swr; 6721 } 6722 tx_head = elem; 6723 6724 ASSERT(n_wrs != 0); 6725 6726 /* 6727 * If posting fails for some reason, we'll never receive 6728 * completion intimation, so we'll need to cleanup. But 6729 * we need to make sure we don't clean up nodes whose 6730 * wrs have been successfully posted. We assume that the 6731 * hca driver returns on the first failure to post and 6732 * therefore the first 'num_posted' entries don't need 6733 * cleanup here. 6734 */ 6735 num_posted = 0; 6736 ibt_status = ibt_post_send(state->id_chnl_hdl, 6737 wrs, n_wrs, &num_posted); 6738 if (ibt_status != IBT_SUCCESS) { 6739 ibd_print_warn(state, "ibd_post_send: " 6740 "posting multiple wrs failed: " 6741 "requested=%d, done=%d, ret=%d", 6742 n_wrs, num_posted, ibt_status); 6743 6744 for (i = num_posted; i < n_wrs; i++) 6745 ibd_tx_cleanup(state, nodes[i]); 6746 } 6747 } 6748 } 6749 6750 static int 6751 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 6752 uint_t lsohdr_sz) 6753 { 6754 ibt_wr_ds_t *sgl; 6755 ibt_status_t ibt_status; 6756 mblk_t *nmp; 6757 mblk_t *data_mp; 6758 uchar_t *bufp; 6759 size_t blksize; 6760 size_t skip; 6761 size_t avail; 6762 uint_t pktsize; 6763 uint_t frag_len; 6764 uint_t pending_hdr; 6765 int nmblks; 6766 int i; 6767 6768 /* 6769 * Let's skip ahead to the data if this is LSO 6770 */ 6771 data_mp = mp; 6772 pending_hdr = 0; 6773 if (lsohdr_sz) { 6774 pending_hdr = lsohdr_sz; 6775 for (nmp = mp; nmp; nmp = nmp->b_cont) { 6776 frag_len = nmp->b_wptr - nmp->b_rptr; 6777 if (frag_len > pending_hdr) 6778 break; 6779 pending_hdr -= frag_len; 6780 } 6781 data_mp = nmp; /* start of data past lso header */ 6782 ASSERT(data_mp != NULL); 6783 } 6784 6785 /* 6786 * Calculate the size of message data and number of msg blocks 6787 */ 6788 pktsize = 0; 6789 for (nmblks = 0, nmp = data_mp; nmp != NULL; 6790 nmp = nmp->b_cont, nmblks++) { 6791 pktsize += MBLKL(nmp); 6792 } 6793 pktsize -= pending_hdr; 6794 6795 /* 6796 * We only do ibt_map_mem_iov() if the pktsize is above the 6797 * "copy-threshold", and if the number of mp fragments is less than 6798 * the maximum acceptable. 
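 * Packets below the threshold, or ones split across too many
 * fragments, take one of the copy paths below instead; for short
 * transfers the copy is typically cheaper than the map/unmap overhead.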
6799 */ 6800 if ((state->id_hca_res_lkey_capab) && 6801 (pktsize > state->id_ud_tx_copy_thresh) && 6802 (nmblks < state->id_max_sqseg_hiwm)) { 6803 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6804 ibt_iov_attr_t iov_attr; 6805 6806 iov_attr.iov_as = NULL; 6807 iov_attr.iov = iov_arr; 6808 iov_attr.iov_buf = NULL; 6809 iov_attr.iov_list_len = nmblks; 6810 iov_attr.iov_wr_nds = state->id_max_sqseg; 6811 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 6812 iov_attr.iov_flags = IBT_IOV_SLEEP; 6813 6814 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 6815 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 6816 iov_arr[i].iov_len = MBLKL(nmp); 6817 if (i == 0) { 6818 iov_arr[i].iov_addr += pending_hdr; 6819 iov_arr[i].iov_len -= pending_hdr; 6820 } 6821 } 6822 6823 node->w_buftype = IBD_WQE_MAPPED; 6824 node->w_swr.wr_sgl = node->w_sgl; 6825 6826 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 6827 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 6828 if (ibt_status != IBT_SUCCESS) { 6829 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 6830 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 6831 goto ibd_copy_path; 6832 } 6833 6834 return (0); 6835 } 6836 6837 ibd_copy_path: 6838 if (pktsize <= state->id_tx_buf_sz) { 6839 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6840 node->w_swr.wr_nds = 1; 6841 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6842 node->w_buftype = IBD_WQE_TXBUF; 6843 6844 /* 6845 * Even though this is the copy path for transfers less than 6846 * id_tx_buf_sz, it could still be an LSO packet. If so, it 6847 * is possible the first data mblk fragment (data_mp) still 6848 * contains part of the LSO header that we need to skip. 6849 */ 6850 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6851 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 6852 blksize = MBLKL(nmp) - pending_hdr; 6853 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 6854 bufp += blksize; 6855 pending_hdr = 0; 6856 } 6857 6858 return (0); 6859 } 6860 6861 /* 6862 * Copy path for transfers greater than id_tx_buf_sz 6863 */ 6864 node->w_swr.wr_sgl = node->w_sgl; 6865 if (ibd_acquire_lsobufs(state, pktsize, 6866 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 6867 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 6868 return (-1); 6869 } 6870 node->w_buftype = IBD_WQE_LSOBUF; 6871 6872 /* 6873 * Copy the larger-than-id_tx_buf_sz packet into a set of 6874 * fixed-sized, pre-mapped LSO buffers. Note that we might 6875 * need to skip part of the LSO header in the first fragment 6876 * as before. 6877 */ 6878 nmp = data_mp; 6879 skip = pending_hdr; 6880 for (i = 0; i < node->w_swr.wr_nds; i++) { 6881 sgl = node->w_swr.wr_sgl + i; 6882 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 6883 avail = IBD_LSO_BUFSZ; 6884 while (nmp && avail) { 6885 blksize = MBLKL(nmp) - skip; 6886 if (blksize > avail) { 6887 bcopy(nmp->b_rptr + skip, bufp, avail); 6888 skip += avail; 6889 avail = 0; 6890 } else { 6891 bcopy(nmp->b_rptr + skip, bufp, blksize); 6892 skip = 0; 6893 avail -= blksize; 6894 bufp += blksize; 6895 nmp = nmp->b_cont; 6896 } 6897 } 6898 } 6899 6900 return (0); 6901 } 6902 6903 /* 6904 * Schedule a completion queue polling to reap the resource we're 6905 * short on. If we implement the change to reap tx completions 6906 * in a separate thread, we'll need to wake up that thread here. 
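 * resource_type is one of the IBD_RSRC_* flags; it is recorded in
 * id_sched_needed and, for the UD swqe and LSO buffer cases, cleared
 * again by ibd_resume_transmission() once enough resources are free.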
6907 */ 6908 static int 6909 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 6910 { 6911 ibd_req_t *req; 6912 6913 mutex_enter(&state->id_sched_lock); 6914 state->id_sched_needed |= resource_type; 6915 mutex_exit(&state->id_sched_lock); 6916 6917 /* 6918 * If we are asked to queue a work entry, we need to do it 6919 */ 6920 if (q_flag) { 6921 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6922 if (req == NULL) 6923 return (-1); 6924 6925 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 6926 } 6927 6928 return (0); 6929 } 6930 6931 /* 6932 * The passed in packet has this format: 6933 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 6934 */ 6935 static boolean_t 6936 ibd_send(ibd_state_t *state, mblk_t *mp) 6937 { 6938 ibd_ace_t *ace; 6939 ibd_swqe_t *node; 6940 ipoib_mac_t *dest; 6941 ib_header_info_t *ipibp; 6942 ip6_t *ip6h; 6943 uint_t pktsize; 6944 uint32_t mss; 6945 uint32_t hckflags; 6946 uint32_t lsoflags = 0; 6947 uint_t lsohdr_sz = 0; 6948 int ret, len; 6949 boolean_t dofree = B_FALSE; 6950 boolean_t rc; 6951 /* if (rc_chan == NULL) send by UD; else send by RC; */ 6952 ibd_rc_chan_t *rc_chan; 6953 int nmblks; 6954 mblk_t *nmp; 6955 6956 /* 6957 * If we aren't done with the device initialization and start, 6958 * we shouldn't be here. 6959 */ 6960 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6961 return (B_FALSE); 6962 6963 /* 6964 * Obtain an address handle for the destination. 6965 */ 6966 ipibp = (ib_header_info_t *)mp->b_rptr; 6967 dest = (ipoib_mac_t *)&ipibp->ib_dst; 6968 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6969 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 6970 6971 rc_chan = NULL; 6972 ace = ibd_acache_lookup(state, dest, &ret, 1); 6973 if (state->id_enable_rc && (ace != NULL) && 6974 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 6975 if (ace->ac_chan == NULL) { 6976 state->rc_null_conn++; 6977 } else { 6978 if (ace->ac_chan->chan_state == 6979 IBD_RC_STATE_ACT_ESTAB) { 6980 rc_chan = ace->ac_chan; 6981 rc_chan->is_used = B_TRUE; 6982 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 6983 node = WQE_TO_SWQE( 6984 rc_chan->tx_wqe_list.dl_head); 6985 if (node != NULL) { 6986 rc_chan->tx_wqe_list.dl_cnt -= 1; 6987 rc_chan->tx_wqe_list.dl_head = 6988 node->swqe_next; 6989 } else { 6990 node = ibd_rc_acquire_swqes(rc_chan); 6991 } 6992 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 6993 6994 if (node == NULL) { 6995 state->rc_swqe_short++; 6996 mutex_enter(&state->id_sched_lock); 6997 state->id_sched_needed |= 6998 IBD_RSRC_RC_SWQE; 6999 mutex_exit(&state->id_sched_lock); 7000 ibd_dec_ref_ace(state, ace); 7001 return (B_FALSE); 7002 } 7003 } else { 7004 state->rc_no_estab_conn++; 7005 } 7006 } 7007 } 7008 7009 if (rc_chan == NULL) { 7010 mutex_enter(&state->id_tx_list.dl_mutex); 7011 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 7012 if (node != NULL) { 7013 state->id_tx_list.dl_cnt -= 1; 7014 state->id_tx_list.dl_head = node->swqe_next; 7015 } else { 7016 node = ibd_acquire_swqe(state); 7017 } 7018 mutex_exit(&state->id_tx_list.dl_mutex); 7019 if (node == NULL) { 7020 /* 7021 * If we don't have an swqe available, schedule a 7022 * transmit completion queue cleanup and hold off on 7023 * sending more packets until we have some free swqes 7024 */ 7025 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 7026 if (ace != NULL) { 7027 ibd_dec_ref_ace(state, ace); 7028 } 7029 return (B_FALSE); 7030 } 7031 7032 /* 7033 * If a poll cannot be scheduled, we have no choice but 7034 * to drop this packet 7035 */ 7036 
ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 7037 if (ace != NULL) { 7038 ibd_dec_ref_ace(state, ace); 7039 } 7040 return (B_TRUE); 7041 } 7042 } 7043 7044 /* 7045 * Initialize the commonly used fields in swqe to NULL to protect 7046 * against ibd_tx_cleanup accidentally misinterpreting these on a 7047 * failure. 7048 */ 7049 node->swqe_im_mblk = NULL; 7050 node->w_swr.wr_nds = 0; 7051 node->w_swr.wr_sgl = NULL; 7052 node->w_swr.wr_opcode = IBT_WRC_SEND; 7053 7054 /* 7055 * Calculate the size of message data and number of msg blocks 7056 */ 7057 pktsize = 0; 7058 for (nmblks = 0, nmp = mp; nmp != NULL; 7059 nmp = nmp->b_cont, nmblks++) { 7060 pktsize += MBLKL(nmp); 7061 } 7062 7063 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 7064 atomic_inc_64(&state->id_brd_xmt); 7065 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 7066 atomic_inc_64(&state->id_multi_xmt); 7067 7068 if (ace != NULL) { 7069 node->w_ahandle = ace; 7070 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 7071 } else { 7072 DPRINT(5, 7073 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 7074 ((ret == EFAULT) ? "failed" : "queued"), 7075 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 7076 htonl(dest->ipoib_gidpref[1]), 7077 htonl(dest->ipoib_gidsuff[0]), 7078 htonl(dest->ipoib_gidsuff[1])); 7079 state->rc_ace_not_found++; 7080 node->w_ahandle = NULL; 7081 7082 /* 7083 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 7084 * can not find a path for the specific dest address. We 7085 * should get rid of this kind of packet. We also should get 7086 * rid of the packet if we cannot schedule a poll via the 7087 * async thread. For the normal case, ibd will return the 7088 * packet to upper layer and wait for AH creating. 7089 * 7090 * Note that we always queue a work slot entry for the async 7091 * thread when we fail AH lookup (even in intr mode); this is 7092 * due to the convoluted way the code currently looks for AH. 7093 */ 7094 if (ret == EFAULT) { 7095 dofree = B_TRUE; 7096 rc = B_TRUE; 7097 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 7098 dofree = B_TRUE; 7099 rc = B_TRUE; 7100 } else { 7101 dofree = B_FALSE; 7102 rc = B_FALSE; 7103 } 7104 goto ibd_send_fail; 7105 } 7106 7107 /* 7108 * For ND6 packets, padding is at the front of the source lladdr. 7109 * Insert the padding at front. 
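 * For ICMPv6 NS/NA a 4 byte mblk is linked onto the message and
 * IBD_PAD_NSNA() then moves the pad into place in front of the
 * lladdr option.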
7110 */ 7111 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 7112 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 7113 if (!pullupmsg(mp, IPV6_HDR_LEN + 7114 sizeof (ib_header_info_t))) { 7115 DPRINT(10, "ibd_send: pullupmsg failure "); 7116 dofree = B_TRUE; 7117 rc = B_TRUE; 7118 goto ibd_send_fail; 7119 } 7120 ipibp = (ib_header_info_t *)mp->b_rptr; 7121 } 7122 ip6h = (ip6_t *)((uchar_t *)ipibp + 7123 sizeof (ib_header_info_t)); 7124 len = ntohs(ip6h->ip6_plen); 7125 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 7126 mblk_t *pad; 7127 7128 pad = allocb(4, 0); 7129 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 7130 linkb(mp, pad); 7131 if (MBLKL(mp) < sizeof (ib_header_info_t) + 7132 IPV6_HDR_LEN + len + 4) { 7133 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 7134 IPV6_HDR_LEN + len + 4)) { 7135 DPRINT(10, "ibd_send: pullupmsg " 7136 "failure "); 7137 dofree = B_TRUE; 7138 rc = B_TRUE; 7139 goto ibd_send_fail; 7140 } 7141 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 7142 sizeof (ib_header_info_t)); 7143 } 7144 7145 /* LINTED: E_CONSTANT_CONDITION */ 7146 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 7147 } 7148 } 7149 7150 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t)); 7151 mp->b_rptr += sizeof (ib_addrs_t); 7152 pktsize -= sizeof (ib_addrs_t); 7153 7154 if (rc_chan) { /* send in RC mode */ 7155 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 7156 ibt_iov_attr_t iov_attr; 7157 uint_t i; 7158 size_t blksize; 7159 uchar_t *bufp; 7160 ibd_rc_tx_largebuf_t *lbufp; 7161 7162 atomic_add_64(&state->rc_xmt_bytes, pktsize); 7163 7164 /* 7165 * Upper layer does Tx checksum, we don't need do any 7166 * checksum here. 7167 */ 7168 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV); 7169 7170 /* 7171 * We only do ibt_map_mem_iov() if the pktsize is above 7172 * the "copy-threshold", and if the number of mp 7173 * fragments is less than the maximum acceptable. 7174 */ 7175 if (pktsize <= state->id_rc_tx_copy_thresh) { 7176 atomic_inc_64(&state->rc_xmt_small_pkt); 7177 /* 7178 * Only process unicast packet in Reliable Connected 7179 * mode. 
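 * Packets at or below id_rc_tx_copy_thresh are simply copied into the
 * swqe's copybuf and the original message freed right away.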
7180 */ 7181 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 7182 node->w_swr.wr_nds = 1; 7183 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 7184 node->w_buftype = IBD_WQE_TXBUF; 7185 7186 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 7187 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7188 blksize = MBLKL(nmp); 7189 bcopy(nmp->b_rptr, bufp, blksize); 7190 bufp += blksize; 7191 } 7192 freemsg(mp); 7193 ASSERT(node->swqe_im_mblk == NULL); 7194 } else { 7195 if ((state->rc_enable_iov_map) && 7196 (nmblks < state->rc_max_sqseg_hiwm)) { 7197 7198 /* do ibt_map_mem_iov() */ 7199 iov_attr.iov_as = NULL; 7200 iov_attr.iov = iov_arr; 7201 iov_attr.iov_buf = NULL; 7202 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 7203 iov_attr.iov_lso_hdr_sz = 0; 7204 iov_attr.iov_flags = IBT_IOV_SLEEP; 7205 7206 i = 0; 7207 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7208 iov_arr[i].iov_len = MBLKL(nmp); 7209 if (iov_arr[i].iov_len != 0) { 7210 iov_arr[i].iov_addr = (caddr_t) 7211 (void *)nmp->b_rptr; 7212 i++; 7213 } 7214 } 7215 iov_attr.iov_list_len = i; 7216 node->w_swr.wr_sgl = node->w_sgl; 7217 7218 ret = ibt_map_mem_iov(state->id_hca_hdl, 7219 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 7220 &node->w_mi_hdl); 7221 if (ret != IBT_SUCCESS) { 7222 atomic_inc_64( 7223 &state->rc_xmt_map_fail_pkt); 7224 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 7225 ") failed, nmblks=%d, real_nmblks" 7226 "=%d, ret=0x%x", nmblks, i, ret); 7227 goto ibd_rc_large_copy; 7228 } 7229 7230 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 7231 node->w_buftype = IBD_WQE_MAPPED; 7232 node->swqe_im_mblk = mp; 7233 } else { 7234 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 7235 ibd_rc_large_copy: 7236 mutex_enter(&state->rc_tx_large_bufs_lock); 7237 if (state->rc_tx_largebuf_nfree == 0) { 7238 state->rc_xmt_buf_short++; 7239 mutex_exit 7240 (&state->rc_tx_large_bufs_lock); 7241 mutex_enter(&state->id_sched_lock); 7242 state->id_sched_needed |= 7243 IBD_RSRC_RC_TX_LARGEBUF; 7244 mutex_exit(&state->id_sched_lock); 7245 dofree = B_FALSE; 7246 rc = B_FALSE; 7247 /* 7248 * If we don't have Tx large bufs, 7249 * return failure. 
node->w_buftype 7250 * should not be IBD_WQE_RC_COPYBUF, 7251 * otherwise it will cause problem 7252 * in ibd_rc_tx_cleanup() 7253 */ 7254 node->w_buftype = IBD_WQE_TXBUF; 7255 goto ibd_send_fail; 7256 } 7257 7258 lbufp = state->rc_tx_largebuf_free_head; 7259 ASSERT(lbufp->lb_buf != NULL); 7260 state->rc_tx_largebuf_free_head = 7261 lbufp->lb_next; 7262 lbufp->lb_next = NULL; 7263 /* Update nfree count */ 7264 state->rc_tx_largebuf_nfree --; 7265 mutex_exit(&state->rc_tx_large_bufs_lock); 7266 bufp = lbufp->lb_buf; 7267 node->w_sgl[0].ds_va = 7268 (ib_vaddr_t)(uintptr_t)bufp; 7269 node->w_sgl[0].ds_key = 7270 state->rc_tx_mr_desc.md_lkey; 7271 node->w_sgl[0].ds_len = pktsize; 7272 node->w_swr.wr_sgl = node->w_sgl; 7273 node->w_swr.wr_nds = 1; 7274 node->w_buftype = IBD_WQE_RC_COPYBUF; 7275 node->w_rc_tx_largebuf = lbufp; 7276 7277 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7278 blksize = MBLKL(nmp); 7279 if (blksize != 0) { 7280 bcopy(nmp->b_rptr, bufp, 7281 blksize); 7282 bufp += blksize; 7283 } 7284 } 7285 freemsg(mp); 7286 ASSERT(node->swqe_im_mblk == NULL); 7287 } 7288 } 7289 7290 node->swqe_next = NULL; 7291 mutex_enter(&rc_chan->tx_post_lock); 7292 if (rc_chan->tx_busy) { 7293 if (rc_chan->tx_head) { 7294 rc_chan->tx_tail->swqe_next = 7295 SWQE_TO_WQE(node); 7296 } else { 7297 rc_chan->tx_head = node; 7298 } 7299 rc_chan->tx_tail = node; 7300 mutex_exit(&rc_chan->tx_post_lock); 7301 } else { 7302 rc_chan->tx_busy = 1; 7303 mutex_exit(&rc_chan->tx_post_lock); 7304 ibd_rc_post_send(rc_chan, node); 7305 } 7306 7307 return (B_TRUE); 7308 } /* send by RC */ 7309 7310 if ((state->id_enable_rc) && (pktsize > state->id_mtu)) { 7311 /* 7312 * Too long pktsize. The packet size from GLD should <= 7313 * state->id_mtu + sizeof (ib_addrs_t) 7314 */ 7315 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) { 7316 ibd_req_t *req; 7317 7318 mutex_enter(&ace->tx_too_big_mutex); 7319 if (ace->tx_too_big_ongoing) { 7320 mutex_exit(&ace->tx_too_big_mutex); 7321 state->rc_xmt_reenter_too_long_pkt++; 7322 dofree = B_TRUE; 7323 } else { 7324 ace->tx_too_big_ongoing = B_TRUE; 7325 mutex_exit(&ace->tx_too_big_mutex); 7326 state->rc_xmt_icmp_too_long_pkt++; 7327 7328 req = kmem_cache_alloc(state->id_req_kmc, 7329 KM_NOSLEEP); 7330 if (req == NULL) { 7331 ibd_print_warn(state, "ibd_send: alloc " 7332 "ibd_req_t fail"); 7333 /* Drop it. */ 7334 dofree = B_TRUE; 7335 } else { 7336 req->rq_ptr = mp; 7337 req->rq_ptr2 = ace; 7338 ibd_queue_work_slot(state, req, 7339 IBD_ASYNC_RC_TOO_BIG); 7340 dofree = B_FALSE; 7341 } 7342 } 7343 } else { 7344 ibd_print_warn(state, "Reliable Connected mode is on. " 7345 "Multicast packet length %d > %d is too long to " 7346 "send packet (%d > %d), drop it", 7347 pktsize, state->id_mtu); 7348 state->rc_xmt_drop_too_long_pkt++; 7349 /* Drop it. */ 7350 dofree = B_TRUE; 7351 } 7352 rc = B_TRUE; 7353 goto ibd_send_fail; 7354 } 7355 7356 atomic_add_64(&state->id_xmt_bytes, pktsize); 7357 atomic_inc_64(&state->id_xmt_pkt); 7358 7359 /* 7360 * Do LSO and checksum related work here. For LSO send, adjust the 7361 * ud destination, the opcode and the LSO header information to the 7362 * work request. 
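 * mac_lso_get() reports whether the stack wants LSO and at what MSS;
 * ibd_setup_lso() then fills in the ud_lso fields of the work request
 * and, when the headers span more than the first mblk, copies them
 * into a separately allocated buffer.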
7363 */ 7364 mac_lso_get(mp, &mss, &lsoflags); 7365 if ((lsoflags & HW_LSO) != HW_LSO) { 7366 node->w_swr.wr_opcode = IBT_WRC_SEND; 7367 lsohdr_sz = 0; 7368 } else { 7369 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 7370 /* 7371 * The routine can only fail if there's no memory; we 7372 * can only drop the packet if this happens 7373 */ 7374 ibd_print_warn(state, 7375 "ibd_send: no memory, lso posting failed"); 7376 dofree = B_TRUE; 7377 rc = B_TRUE; 7378 goto ibd_send_fail; 7379 } 7380 7381 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 7382 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 7383 } 7384 7385 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); 7386 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 7387 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 7388 else 7389 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 7390 7391 /* 7392 * Prepare the sgl for posting; the routine can only fail if there's 7393 * no lso buf available for posting. If this is the case, we should 7394 * probably resched for lso bufs to become available and then try again. 7395 */ 7396 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 7397 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 7398 dofree = B_TRUE; 7399 rc = B_TRUE; 7400 } else { 7401 dofree = B_FALSE; 7402 rc = B_FALSE; 7403 } 7404 goto ibd_send_fail; 7405 } 7406 node->swqe_im_mblk = mp; 7407 7408 /* 7409 * Queue the wqe to hardware; since we can now simply queue a 7410 * post instead of doing it serially, we cannot assume anything 7411 * about the 'node' after ibd_post_send() returns. 7412 */ 7413 node->swqe_next = NULL; 7414 7415 mutex_enter(&state->id_txpost_lock); 7416 if (state->id_tx_busy) { 7417 if (state->id_tx_head) { 7418 state->id_tx_tail->swqe_next = 7419 SWQE_TO_WQE(node); 7420 } else { 7421 state->id_tx_head = node; 7422 } 7423 state->id_tx_tail = node; 7424 mutex_exit(&state->id_txpost_lock); 7425 } else { 7426 state->id_tx_busy = 1; 7427 mutex_exit(&state->id_txpost_lock); 7428 ibd_post_send(state, node); 7429 } 7430 7431 return (B_TRUE); 7432 7433 ibd_send_fail: 7434 if (node && mp) 7435 ibd_free_lsohdr(node, mp); 7436 7437 if (dofree) 7438 freemsg(mp); 7439 7440 if (node != NULL) { 7441 if (rc_chan) { 7442 ibd_rc_tx_cleanup(node); 7443 } else { 7444 ibd_tx_cleanup(state, node); 7445 } 7446 } 7447 7448 return (rc); 7449 } 7450 7451 /* 7452 * GLDv3 entry point for transmitting datagram. 7453 */ 7454 static mblk_t * 7455 ibd_m_tx(void *arg, mblk_t *mp) 7456 { 7457 ibd_state_t *state = (ibd_state_t *)arg; 7458 mblk_t *next; 7459 7460 if (state->id_type == IBD_PORT_DRIVER) { 7461 freemsgchain(mp); 7462 return (NULL); 7463 } 7464 7465 if ((state->id_link_state != LINK_STATE_UP) || 7466 !(state->id_mac_state & IBD_DRV_STARTED)) { 7467 freemsgchain(mp); 7468 mp = NULL; 7469 } 7470 7471 while (mp != NULL) { 7472 next = mp->b_next; 7473 mp->b_next = NULL; 7474 if (ibd_send(state, mp) == B_FALSE) { 7475 /* Send fail */ 7476 mp->b_next = next; 7477 break; 7478 } 7479 mp = next; 7480 } 7481 7482 return (mp); 7483 } 7484 7485 /* 7486 * this handles Tx and Rx completions. With separate CQs, this handles 7487 * only Rx completions. 
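 * (The send CQ has its own handler, ibd_scq_handler(), set up in
 * ibd_start(); ibd_intr() itself only drains the receive CQ via
 * ibd_poll_rcq().)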
7488 */ 7489 static uint_t 7490 ibd_intr(caddr_t arg) 7491 { 7492 ibd_state_t *state = (ibd_state_t *)arg; 7493 7494 ibd_poll_rcq(state, state->id_rcq_hdl); 7495 7496 return (DDI_INTR_CLAIMED); 7497 } 7498 7499 /* 7500 * Poll and fully drain the send cq 7501 */ 7502 static void 7503 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7504 { 7505 ibt_wc_t *wcs = state->id_txwcs; 7506 uint_t numwcs = state->id_txwcs_size; 7507 ibd_wqe_t *wqe; 7508 ibd_swqe_t *head, *tail; 7509 ibt_wc_t *wc; 7510 uint_t num_polled; 7511 int i; 7512 7513 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7514 head = tail = NULL; 7515 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7516 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 7517 if (wc->wc_status != IBT_WC_SUCCESS) { 7518 /* 7519 * Channel being torn down. 7520 */ 7521 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7522 DPRINT(5, "ibd_drain_scq: flush error"); 7523 DPRINT(10, "ibd_drain_scq: Bad " 7524 "status %d", wc->wc_status); 7525 } else { 7526 DPRINT(10, "ibd_drain_scq: " 7527 "unexpected wc_status %d", 7528 wc->wc_status); 7529 } 7530 /* 7531 * Fallthrough to invoke the Tx handler to 7532 * release held resources, e.g., AH refcount. 7533 */ 7534 } 7535 /* 7536 * Add this swqe to the list to be cleaned up. 7537 */ 7538 if (head) 7539 tail->swqe_next = wqe; 7540 else 7541 head = WQE_TO_SWQE(wqe); 7542 tail = WQE_TO_SWQE(wqe); 7543 } 7544 tail->swqe_next = NULL; 7545 ibd_tx_cleanup_list(state, head, tail); 7546 7547 /* 7548 * Resume any blocked transmissions if possible 7549 */ 7550 ibd_resume_transmission(state); 7551 } 7552 } 7553 7554 /* 7555 * Poll and fully drain the receive cq 7556 */ 7557 static void 7558 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7559 { 7560 ibt_wc_t *wcs = state->id_rxwcs; 7561 uint_t numwcs = state->id_rxwcs_size; 7562 ibd_rwqe_t *rwqe; 7563 ibt_wc_t *wc; 7564 uint_t num_polled; 7565 int i; 7566 mblk_t *head, *tail, *mp; 7567 7568 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7569 head = tail = NULL; 7570 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7571 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 7572 if (wc->wc_status != IBT_WC_SUCCESS) { 7573 /* 7574 * Channel being torn down. 7575 */ 7576 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7577 DPRINT(5, "ibd_drain_rcq: " 7578 "expected flushed rwqe"); 7579 } else { 7580 DPRINT(5, "ibd_drain_rcq: " 7581 "unexpected wc_status %d", 7582 wc->wc_status); 7583 } 7584 atomic_inc_32( 7585 &state->id_rx_list.dl_bufs_outstanding); 7586 freemsg(rwqe->rwqe_im_mblk); 7587 continue; 7588 } 7589 mp = ibd_process_rx(state, rwqe, wc); 7590 if (mp == NULL) 7591 continue; 7592 7593 /* 7594 * Add this mp to the list to send to the nw layer. 7595 */ 7596 if (head) 7597 tail->b_next = mp; 7598 else 7599 head = mp; 7600 tail = mp; 7601 } 7602 if (head) 7603 mac_rx(state->id_mh, state->id_rh, head); 7604 7605 /* 7606 * Account for #rwqes polled. 7607 * Post more here, if less than one fourth full. 7608 */ 7609 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 7610 (state->id_ud_num_rwqe / 4)) 7611 ibd_post_recv_intr(state); 7612 } 7613 } 7614 7615 /* 7616 * Common code for interrupt handling as well as for polling 7617 * for all completed wqe's while detaching. 
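* Only one thread drains a CQ at a time: IBD_CQ_POLLING marks the CQ as
* owned, and a concurrent caller merely sets IBD_REDO_CQ_POLLING and
* returns, which makes the owner take another drain pass before it clears
* the busy flag.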
7618 */ 7619 static void 7620 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7621 { 7622 int flag, redo_flag; 7623 int redo = 1; 7624 7625 flag = IBD_CQ_POLLING; 7626 redo_flag = IBD_REDO_CQ_POLLING; 7627 7628 mutex_enter(&state->id_scq_poll_lock); 7629 if (state->id_scq_poll_busy & flag) { 7630 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 7631 state->id_scq_poll_busy |= redo_flag; 7632 mutex_exit(&state->id_scq_poll_lock); 7633 return; 7634 } 7635 state->id_scq_poll_busy |= flag; 7636 mutex_exit(&state->id_scq_poll_lock); 7637 7638 /* 7639 * In some cases (eg detaching), this code can be invoked on 7640 * any cpu after disabling cq notification (thus no concurrency 7641 * exists). Apart from that, the following applies normally: 7642 * Transmit completion handling could be from any cpu if 7643 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 7644 * is interrupt driven. 7645 */ 7646 7647 /* 7648 * Poll and drain the CQ 7649 */ 7650 ibd_drain_scq(state, cq_hdl); 7651 7652 /* 7653 * Enable CQ notifications and redrain the cq to catch any 7654 * completions we might have missed after the ibd_drain_scq() 7655 * above and before the ibt_enable_cq_notify() that follows. 7656 * Finally, service any new requests to poll the cq that 7657 * could've come in after the ibt_enable_cq_notify(). 7658 */ 7659 do { 7660 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 7661 IBT_SUCCESS) { 7662 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7663 } 7664 7665 ibd_drain_scq(state, cq_hdl); 7666 7667 mutex_enter(&state->id_scq_poll_lock); 7668 if (state->id_scq_poll_busy & redo_flag) 7669 state->id_scq_poll_busy &= ~redo_flag; 7670 else { 7671 state->id_scq_poll_busy &= ~flag; 7672 redo = 0; 7673 } 7674 mutex_exit(&state->id_scq_poll_lock); 7675 7676 } while (redo); 7677 } 7678 7679 /* 7680 * Common code for interrupt handling as well as for polling 7681 * for all completed wqe's while detaching. 7682 */ 7683 static void 7684 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 7685 { 7686 int flag, redo_flag; 7687 int redo = 1; 7688 7689 flag = IBD_CQ_POLLING; 7690 redo_flag = IBD_REDO_CQ_POLLING; 7691 7692 mutex_enter(&state->id_rcq_poll_lock); 7693 if (state->id_rcq_poll_busy & flag) { 7694 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 7695 state->id_rcq_poll_busy |= redo_flag; 7696 mutex_exit(&state->id_rcq_poll_lock); 7697 return; 7698 } 7699 state->id_rcq_poll_busy |= flag; 7700 mutex_exit(&state->id_rcq_poll_lock); 7701 7702 /* 7703 * Poll and drain the CQ 7704 */ 7705 ibd_drain_rcq(state, rcq); 7706 7707 /* 7708 * Enable CQ notifications and redrain the cq to catch any 7709 * completions we might have missed after the ibd_drain_cq() 7710 * above and before the ibt_enable_cq_notify() that follows. 7711 * Finally, service any new requests to poll the cq that 7712 * could've come in after the ibt_enable_cq_notify(). 7713 */ 7714 do { 7715 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 7716 IBT_SUCCESS) { 7717 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7718 } 7719 7720 ibd_drain_rcq(state, rcq); 7721 7722 mutex_enter(&state->id_rcq_poll_lock); 7723 if (state->id_rcq_poll_busy & redo_flag) 7724 state->id_rcq_poll_busy &= ~redo_flag; 7725 else { 7726 state->id_rcq_poll_busy &= ~flag; 7727 redo = 0; 7728 } 7729 mutex_exit(&state->id_rcq_poll_lock); 7730 7731 } while (redo); 7732 } 7733 7734 /* 7735 * Unmap the memory area associated with a given swqe. 
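* Only swqes that still hold a memory-IOV handle (w_mi_hdl) from a dynamic
* mapping need this; the handle is released via ibt_unmap_mem_iov() and
* wr_nds is reset to zero.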
7736 */ 7737 void 7738 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 7739 { 7740 ibt_status_t stat; 7741 7742 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 7743 7744 if (swqe->w_mi_hdl) { 7745 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 7746 swqe->w_mi_hdl)) != IBT_SUCCESS) { 7747 DPRINT(10, 7748 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 7749 } 7750 swqe->w_mi_hdl = NULL; 7751 } 7752 swqe->w_swr.wr_nds = 0; 7753 } 7754 7755 void 7756 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 7757 { 7758 /* 7759 * The recycling logic can be eliminated from here 7760 * and put into the async thread if we create another 7761 * list to hold ACE's for unjoined mcg's. 7762 */ 7763 if (DEC_REF_DO_CYCLE(ace)) { 7764 ibd_mce_t *mce; 7765 7766 /* 7767 * Check with the lock taken: we decremented 7768 * reference count without the lock, and some 7769 * transmitter might already have bumped the 7770 * reference count (possible in case of multicast 7771 * disable when we leave the AH on the active 7772 * list). If not still 0, get out, leaving the 7773 * recycle bit intact. 7774 * 7775 * Atomically transition the AH from active 7776 * to free list, and queue a work request to 7777 * leave the group and destroy the mce. No 7778 * transmitter can be looking at the AH or 7779 * the MCE in between, since we have the 7780 * ac_mutex lock. In the SendOnly reap case, 7781 * it is not necessary to hold the ac_mutex 7782 * and recheck the ref count (since the AH was 7783 * taken off the active list), we just do it 7784 * to have uniform processing with the Full 7785 * reap case. 7786 */ 7787 mutex_enter(&state->id_ac_mutex); 7788 mce = ace->ac_mce; 7789 if (GET_REF_CYCLE(ace) == 0) { 7790 CLEAR_REFCYCLE(ace); 7791 /* 7792 * Identify the case of fullmember reap as 7793 * opposed to mcg trap reap. Also, port up 7794 * might set ac_mce to NULL to indicate Tx 7795 * cleanup should do no more than put the 7796 * AH in the free list (see ibd_async_link). 7797 */ 7798 if (mce != NULL) { 7799 ace->ac_mce = NULL; 7800 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 7801 /* 7802 * mc_req was initialized at mce 7803 * creation time. 7804 */ 7805 ibd_queue_work_slot(state, 7806 &mce->mc_req, IBD_ASYNC_REAP); 7807 } 7808 IBD_ACACHE_INSERT_FREE(state, ace); 7809 } 7810 mutex_exit(&state->id_ac_mutex); 7811 } 7812 } 7813 7814 /* 7815 * Common code that deals with clean ups after a successful or 7816 * erroneous transmission attempt. 7817 */ 7818 static void 7819 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 7820 { 7821 ibd_ace_t *ace = swqe->w_ahandle; 7822 7823 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 7824 7825 /* 7826 * If this was a dynamic mapping in ibd_send(), we need to 7827 * unmap here. If this was an lso buffer we'd used for sending, 7828 * we need to release the lso buf to the pool, since the resource 7829 * is scarce. However, if this was simply a normal send using 7830 * the copybuf (present in each swqe), we don't need to release it. 7831 */ 7832 if (swqe->swqe_im_mblk != NULL) { 7833 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7834 ibd_unmap_mem(state, swqe); 7835 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7836 ibd_release_lsobufs(state, 7837 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7838 } 7839 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7840 freemsg(swqe->swqe_im_mblk); 7841 swqe->swqe_im_mblk = NULL; 7842 } 7843 7844 /* 7845 * Drop the reference count on the AH; it can be reused 7846 * now for a different destination if there are no more 7847 * posted sends that will use it. 
This can be eliminated 7848 * if we can always associate each Tx buffer with an AH. 7849 * The ace can be null if we are cleaning up from the 7850 * ibd_send() error path. 7851 */ 7852 if (ace != NULL) { 7853 ibd_dec_ref_ace(state, ace); 7854 } 7855 7856 /* 7857 * Release the send wqe for reuse. 7858 */ 7859 swqe->swqe_next = NULL; 7860 ibd_release_swqe(state, swqe, swqe, 1); 7861 } 7862 7863 static void 7864 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 7865 { 7866 ibd_ace_t *ace; 7867 ibd_swqe_t *swqe; 7868 int n = 0; 7869 7870 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 7871 7872 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 7873 7874 /* 7875 * If this was a dynamic mapping in ibd_send(), we need to 7876 * unmap here. If this was an lso buffer we'd used for sending, 7877 * we need to release the lso buf to the pool, since the 7878 * resource is scarce. However, if this was simply a normal 7879 * send using the copybuf (present in each swqe), we don't need 7880 * to release it. 7881 */ 7882 if (swqe->swqe_im_mblk != NULL) { 7883 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7884 ibd_unmap_mem(state, swqe); 7885 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7886 ibd_release_lsobufs(state, 7887 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7888 } 7889 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7890 freemsg(swqe->swqe_im_mblk); 7891 swqe->swqe_im_mblk = NULL; 7892 } 7893 7894 /* 7895 * Drop the reference count on the AH; it can be reused 7896 * now for a different destination if there are no more 7897 * posted sends that will use it. This can be eliminated 7898 * if we can always associate each Tx buffer with an AH. 7899 * The ace can be null if we are cleaning up from the 7900 * ibd_send() error path. 7901 */ 7902 ace = swqe->w_ahandle; 7903 if (ace != NULL) { 7904 ibd_dec_ref_ace(state, ace); 7905 } 7906 n++; 7907 } 7908 7909 /* 7910 * Release the send wqes for reuse. 7911 */ 7912 ibd_release_swqe(state, head, tail, n); 7913 } 7914 7915 /* 7916 * Processing to be done after receipt of a packet; hand off to GLD 7917 * in the format expected by GLD. The received packet has this 7918 * format: 2b sap :: 00 :: data. 7919 */ 7920 static mblk_t * 7921 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 7922 { 7923 ib_header_info_t *phdr; 7924 mblk_t *mp; 7925 ipoib_hdr_t *ipibp; 7926 ipha_t *iphap; 7927 ip6_t *ip6h; 7928 int len; 7929 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 7930 uint32_t bufs; 7931 7932 /* 7933 * Track number handed to upper layer that need to be returned. 7934 */ 7935 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 7936 7937 /* Never run out of rwqes, use allocb when running low */ 7938 if (bufs >= state->id_rx_bufs_outstanding_limit) { 7939 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 7940 atomic_inc_32(&state->id_rx_allocb); 7941 mp = allocb(pkt_len, BPRI_HI); 7942 if (mp) { 7943 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 7944 ibd_post_recv(state, rwqe); 7945 } else { /* no memory */ 7946 atomic_inc_32(&state->id_rx_allocb_failed); 7947 ibd_post_recv(state, rwqe); 7948 return (NULL); 7949 } 7950 } else { 7951 mp = rwqe->rwqe_im_mblk; 7952 } 7953 7954 7955 /* 7956 * Adjust write pointer depending on how much data came in. 7957 */ 7958 mp->b_wptr = mp->b_rptr + pkt_len; 7959 7960 /* 7961 * Make sure this is NULL or we're in trouble. 
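* (mp is either the rwqe's loaned-up mblk or a freshly allocated copy from
* the path above, so its b_next must not already be set before it is
* chained for mac_rx().)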
7962 */ 7963 if (mp->b_next != NULL) { 7964 ibd_print_warn(state, 7965 "ibd_process_rx: got duplicate mp from rcq?"); 7966 mp->b_next = NULL; 7967 } 7968 7969 /* 7970 * The IB link will deliver one of the IB link layer 7971 * headers, called the Global Routing Header (GRH). The ibd 7972 * driver uses the information in the GRH to build the 7973 * header_info structure and passes it with the datagram 7974 * up to GLDv3. 7975 * If the GRH is not valid, indicate this to GLDv3 by 7976 * setting the VerTcFlow field to 0. 7977 */ 7978 phdr = (ib_header_info_t *)mp->b_rptr; 7979 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 7980 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 7981 7982 /* If it is a loopback packet, just drop it. */ 7983 if (state->id_enable_rc) { 7984 if (bcmp(&phdr->ib_grh.ipoib_sqpn, 7985 &state->rc_macaddr_loopback, 7986 IPOIB_ADDRL) == 0) { 7987 freemsg(mp); 7988 return (NULL); 7989 } 7990 } else { 7991 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 7992 IPOIB_ADDRL) == 0) { 7993 freemsg(mp); 7994 return (NULL); 7995 } 7996 } 7997 7998 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 7999 sizeof (ipoib_mac_t)); 8000 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 8001 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 8002 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 8003 } else { 8004 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 8005 } 8006 } else { 8007 /* 8008 * It cannot be an IBA multicast packet. Must have been 8009 * unicast for us. Just copy the interface address to dst. 8010 */ 8011 phdr->ib_grh.ipoib_vertcflow = 0; 8012 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 8013 sizeof (ipoib_mac_t)); 8014 } 8015 8016 /* 8017 * For ND6 packets, padding is at the front of the source/target 8018 * lladdr. However, the inet6 layer is not aware of it; hence remove 8019 * the padding from such packets. 8020 */ 8021 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 8022 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 8023 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 8024 len = ntohs(ip6h->ip6_plen); 8025 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 8026 /* LINTED: E_CONSTANT_CONDITION */ 8027 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 8028 } 8029 } 8030 8031 /* 8032 * Update statistics 8033 */ 8034 atomic_add_64(&state->id_rcv_bytes, pkt_len); 8035 atomic_inc_64(&state->id_rcv_pkt); 8036 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 8037 atomic_inc_64(&state->id_brd_rcv); 8038 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 8039 atomic_inc_64(&state->id_multi_rcv); 8040 8041 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 8042 /* 8043 * Set receive checksum status in mp. 8044 * Hardware checksumming can be considered valid only if: 8045 * 1. CQE.IP_OK bit is set 8046 * 2. CQE.CKSUM = 0xffff 8047 * 3. IPv6 routing header is not present in the packet 8048 * 4. There are no IP_OPTIONS in the IP header 8049 */ 8050 8051 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 8052 (wc->wc_cksum == 0xFFFF) && 8053 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 8054 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 8055 } 8056 8057 return (mp); 8058 } 8059 8060 /* 8061 * Callback code invoked from STREAMs when the receive data buffer is 8062 * free for recycling.
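* If the driver is still running, the copybuf is re-wrapped with
* desballoc() and the rwqe is posted back to the receive queue; if the
* driver has stopped or the desballoc() fails, the rwqe is freed instead.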
8063 */ 8064 static void 8065 ibd_freemsg_cb(char *arg) 8066 { 8067 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 8068 ibd_state_t *state = rwqe->w_state; 8069 8070 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 8071 8072 /* 8073 * If the driver is stopped, just free the rwqe. 8074 */ 8075 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 8076 DPRINT(6, "ibd_freemsg: wqe being freed"); 8077 rwqe->rwqe_im_mblk = NULL; 8078 ibd_free_rwqe(state, rwqe); 8079 return; 8080 } 8081 8082 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 8083 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 8084 if (rwqe->rwqe_im_mblk == NULL) { 8085 ibd_free_rwqe(state, rwqe); 8086 DPRINT(6, "ibd_freemsg: desballoc failed"); 8087 return; 8088 } 8089 8090 ibd_post_recv(state, rwqe); 8091 } 8092 8093 static uint_t 8094 ibd_tx_recycle(caddr_t arg) 8095 { 8096 ibd_state_t *state = (ibd_state_t *)arg; 8097 8098 /* 8099 * Poll for completed entries 8100 */ 8101 ibd_poll_scq(state, state->id_scq_hdl); 8102 8103 return (DDI_INTR_CLAIMED); 8104 } 8105 8106 #ifdef IBD_LOGGING 8107 static void 8108 ibd_log_init(void) 8109 { 8110 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 8111 ibd_lbuf_ndx = 0; 8112 8113 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 8114 } 8115 8116 static void 8117 ibd_log_fini(void) 8118 { 8119 if (ibd_lbuf) 8120 kmem_free(ibd_lbuf, IBD_LOG_SZ); 8121 ibd_lbuf_ndx = 0; 8122 ibd_lbuf = NULL; 8123 8124 mutex_destroy(&ibd_lbuf_lock); 8125 } 8126 8127 static void 8128 ibd_log(const char *fmt, ...) 8129 { 8130 va_list ap; 8131 uint32_t off; 8132 uint32_t msglen; 8133 char tmpbuf[IBD_DMAX_LINE]; 8134 8135 if (ibd_lbuf == NULL) 8136 return; 8137 8138 va_start(ap, fmt); 8139 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 8140 va_end(ap); 8141 8142 if (msglen >= IBD_DMAX_LINE) 8143 msglen = IBD_DMAX_LINE - 1; 8144 8145 mutex_enter(&ibd_lbuf_lock); 8146 8147 off = ibd_lbuf_ndx; /* current msg should go here */ 8148 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 8149 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 8150 8151 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 8152 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 8153 8154 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 8155 ibd_lbuf_ndx = 0; 8156 8157 mutex_exit(&ibd_lbuf_lock); 8158 8159 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 8160 } 8161 #endif 8162 8163 /* ARGSUSED */ 8164 static int 8165 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8166 int *rvalp) 8167 { 8168 ibd_create_ioctl_t *cmd = karg; 8169 ibd_state_t *state, *port_state, *p; 8170 int i, err, rval = 0; 8171 mac_register_t *macp; 8172 ibt_hca_portinfo_t *pinfop = NULL; 8173 ibt_status_t ibt_status; 8174 uint_t psize, pinfosz; 8175 boolean_t force_create = B_FALSE; 8176 8177 cmd->ibdioc.ioc_status = 0; 8178 8179 if (cmd->ibdioc.ioc_port_inst < 0) { 8180 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8181 return (EINVAL); 8182 } 8183 port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst); 8184 if (port_state == NULL) { 8185 DPRINT(10, "ibd_create_partition: failed to get state %d", 8186 cmd->ibdioc.ioc_port_inst); 8187 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8188 return (EINVAL); 8189 } 8190 8191 /* Limited PKeys not supported */ 8192 if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) { 8193 rval = EINVAL; 8194 goto part_create_return; 8195 } 8196 8197 if (cmd->ioc_force_create == 0) { 8198 /* 8199 * Check if the port pkey table contains the pkey for which 8200 * 
this partition is being created. 8201 */ 8202 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8203 port_state->id_port, &pinfop, &psize, &pinfosz); 8204 8205 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8206 rval = EINVAL; 8207 goto part_create_return; 8208 } 8209 8210 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) { 8211 rval = ENETDOWN; 8212 cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN; 8213 goto part_create_return; 8214 } 8215 8216 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) { 8217 if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) { 8218 break; 8219 } 8220 } 8221 if (i == pinfop->p_pkey_tbl_sz) { 8222 rval = EINVAL; 8223 cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT; 8224 goto part_create_return; 8225 } 8226 } else { 8227 force_create = B_TRUE; 8228 } 8229 8230 mutex_enter(&ibd_objlist_lock); 8231 for (p = ibd_objlist_head; p; p = p->id_next) { 8232 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) && 8233 (p->id_pkey == cmd->ioc_pkey) && 8234 (p->id_plinkid == cmd->ioc_partid)) { 8235 mutex_exit(&ibd_objlist_lock); 8236 rval = EEXIST; 8237 cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS; 8238 goto part_create_return; 8239 } 8240 } 8241 mutex_exit(&ibd_objlist_lock); 8242 8243 state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP); 8244 8245 state->id_type = IBD_PARTITION_OBJ; 8246 8247 state->id_plinkid = cmd->ioc_partid; 8248 state->id_dlinkid = cmd->ibdioc.ioc_linkid; 8249 state->id_port_inst = cmd->ibdioc.ioc_port_inst; 8250 8251 state->id_dip = port_state->id_dip; 8252 state->id_port = port_state->id_port; 8253 state->id_pkey = cmd->ioc_pkey; 8254 state->id_hca_guid = port_state->id_hca_guid; 8255 state->id_port_guid = port_state->id_port_guid; 8256 state->id_force_create = force_create; 8257 8258 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 8259 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 8260 8261 if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) { 8262 rval = EIO; 8263 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE; 8264 goto fail; 8265 } 8266 8267 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 8268 rval = EAGAIN; 8269 goto fail; 8270 } 8271 8272 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 8273 macp->m_dip = port_state->id_dip; 8274 macp->m_instance = (uint_t)-1; 8275 macp->m_driver = state; 8276 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 8277 macp->m_callbacks = &ibd_m_callbacks; 8278 macp->m_min_sdu = 0; 8279 macp->m_multicast_sdu = IBD_DEF_MAX_SDU; 8280 if (state->id_enable_rc) { 8281 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 8282 } else { 8283 macp->m_max_sdu = IBD_DEF_MAX_SDU; 8284 } 8285 macp->m_priv_props = ibd_priv_props; 8286 8287 err = mac_register(macp, &state->id_mh); 8288 mac_free(macp); 8289 8290 if (err != 0) { 8291 DPRINT(10, "ibd_create_partition: mac_register() failed %d", 8292 err); 8293 rval = err; 8294 goto fail; 8295 } 8296 8297 err = dls_devnet_create(state->id_mh, 8298 cmd->ioc_partid, crgetzoneid(credp)); 8299 if (err != 0) { 8300 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed " 8301 "%d", err); 8302 rval = err; 8303 (void) mac_unregister(state->id_mh); 8304 goto fail; 8305 } 8306 8307 /* 8308 * Add the new partition state structure to the list 8309 */ 8310 mutex_enter(&ibd_objlist_lock); 8311 if (ibd_objlist_head) 8312 state->id_next = ibd_objlist_head; 8313 8314 ibd_objlist_head = state; 8315 mutex_exit(&ibd_objlist_lock); 8316 8317 part_create_return: 8318 if (pinfop) { 8319 ibt_free_portinfo(pinfop, pinfosz); 8320 } 8321 return (rval); 8322 8323 fail: 8324 if (pinfop) { 8325 ibt_free_portinfo(pinfop, 
pinfosz); 8326 } 8327 ibd_part_unattach(state); 8328 kmem_free(state, sizeof (ibd_state_t)); 8329 return (rval); 8330 } 8331 8332 /* ARGSUSED */ 8333 static int 8334 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8335 int *rvalp) 8336 { 8337 int err; 8338 datalink_id_t tmpid; 8339 ibd_state_t *node, *prev; 8340 ibd_delete_ioctl_t *cmd = karg; 8341 8342 prev = NULL; 8343 8344 mutex_enter(&ibd_objlist_lock); 8345 node = ibd_objlist_head; 8346 8347 /* Find the ibd state structure corresponding to the partition */ 8348 while (node != NULL) { 8349 if (node->id_plinkid == cmd->ioc_partid) 8350 break; 8351 prev = node; 8352 node = node->id_next; 8353 } 8354 8355 if (node == NULL) { 8356 mutex_exit(&ibd_objlist_lock); 8357 return (ENOENT); 8358 } 8359 8360 if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) { 8361 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed " 8362 "%d", err); 8363 mutex_exit(&ibd_objlist_lock); 8364 return (err); 8365 } 8366 8367 /* 8368 * Call ibd_part_unattach() only after making sure that the instance has 8369 * not been started yet and is also not in late hca init mode. 8370 */ 8371 ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8372 8373 err = 0; 8374 if ((node->id_mac_state & IBD_DRV_STARTED) || 8375 (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) || 8376 (ibd_part_busy(node) != DDI_SUCCESS) || 8377 ((err = mac_disable(node->id_mh)) != 0)) { 8378 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid, 8379 crgetzoneid(credp)); 8380 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8381 mutex_exit(&ibd_objlist_lock); 8382 return (err != 0 ? err : EBUSY); 8383 } 8384 8385 node->id_mac_state |= IBD_DRV_IN_DELETION; 8386 8387 ibd_part_unattach(node); 8388 8389 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8390 8391 /* Remove the partition state structure from the linked list */ 8392 if (prev == NULL) 8393 ibd_objlist_head = node->id_next; 8394 else 8395 prev->id_next = node->id_next; 8396 mutex_exit(&ibd_objlist_lock); 8397 8398 if ((err = mac_unregister(node->id_mh)) != 0) { 8399 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d", 8400 err); 8401 } 8402 8403 cv_destroy(&node->id_macst_cv); 8404 mutex_destroy(&node->id_macst_lock); 8405 8406 kmem_free(node, sizeof (ibd_state_t)); 8407 8408 return (0); 8409 } 8410 8411 /* ARGSUSED */ 8412 static int 8413 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred, 8414 int *rvalp) 8415 { 8416 ibd_ioctl_t cmd; 8417 ibpart_ioctl_t partioc; 8418 ibport_ioctl_t portioc; 8419 #ifdef _MULTI_DATAMODEL 8420 ibport_ioctl32_t portioc32; 8421 #endif 8422 ibd_state_t *state, *port_state; 8423 int size; 8424 ibt_hca_portinfo_t *pinfop = NULL; 8425 ibt_status_t ibt_status; 8426 uint_t psize, pinfosz; 8427 int rval = 0; 8428 8429 size = sizeof (ibd_ioctl_t); 8430 if (ddi_copyin((void *)arg, &cmd, size, mode)) { 8431 return (EFAULT); 8432 } 8433 cmd.ioc_status = 0; 8434 switch (cmd.ioc_info_cmd) { 8435 case IBD_INFO_CMD_IBPART: 8436 size = sizeof (ibpart_ioctl_t); 8437 if (ddi_copyin((void *)arg, &partioc, size, mode)) { 8438 return (EFAULT); 8439 } 8440 8441 mutex_enter(&ibd_objlist_lock); 8442 /* Find the ibd state structure corresponding the partition */ 8443 for (state = ibd_objlist_head; state; state = state->id_next) { 8444 if (state->id_plinkid == cmd.ioc_linkid) { 8445 break; 8446 } 8447 } 8448 8449 if (state == NULL) { 8450 mutex_exit(&ibd_objlist_lock); 8451 return (ENOENT); 8452 } 8453 8454 partioc.ibdioc.ioc_linkid = 
state->id_dlinkid; 8455 partioc.ibdioc.ioc_port_inst = state->id_port_inst; 8456 partioc.ibdioc.ioc_portnum = state->id_port; 8457 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid; 8458 partioc.ibdioc.ioc_portguid = state->id_port_guid; 8459 partioc.ibdioc.ioc_status = 0; 8460 partioc.ioc_partid = state->id_plinkid; 8461 partioc.ioc_pkey = state->id_pkey; 8462 partioc.ioc_force_create = state->id_force_create; 8463 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) { 8464 mutex_exit(&ibd_objlist_lock); 8465 return (EFAULT); 8466 } 8467 mutex_exit(&ibd_objlist_lock); 8468 8469 break; 8470 8471 case IBD_INFO_CMD_IBPORT: 8472 if ((cmd.ioc_port_inst < 0) || ((port_state = 8473 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8474 DPRINT(10, "ibd_create_partition: failed to get" 8475 " state %d", cmd.ioc_port_inst); 8476 size = sizeof (ibd_ioctl_t); 8477 cmd.ioc_status = IBD_INVALID_PORT_INST; 8478 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8479 mode)) { 8480 return (EFAULT); 8481 } 8482 return (EINVAL); 8483 } 8484 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8485 port_state->id_port, &pinfop, &psize, &pinfosz); 8486 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8487 return (EINVAL); 8488 } 8489 #ifdef _MULTI_DATAMODEL 8490 switch (ddi_model_convert_from(mode & FMODELS)) { 8491 case DDI_MODEL_ILP32: { 8492 size = sizeof (ibport_ioctl32_t); 8493 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8494 rval = EFAULT; 8495 goto fail; 8496 } 8497 portioc32.ibdioc.ioc_status = 0; 8498 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8499 portioc32.ibdioc.ioc_hcaguid = 8500 port_state->id_hca_guid; 8501 portioc32.ibdioc.ioc_portguid = 8502 port_state->id_port_guid; 8503 if (portioc32.ioc_pkey_tbl_sz != 8504 pinfop->p_pkey_tbl_sz) { 8505 rval = EINVAL; 8506 size = sizeof (ibd_ioctl_t); 8507 portioc32.ibdioc.ioc_status = 8508 IBD_INVALID_PKEY_TBL_SIZE; 8509 if (ddi_copyout((void *)&portioc32.ibdioc, 8510 (void *)arg, size, mode)) { 8511 rval = EFAULT; 8512 goto fail; 8513 } 8514 goto fail; 8515 } 8516 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8517 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8518 (void *)(uintptr_t)portioc32.ioc_pkeys, size, 8519 mode)) { 8520 rval = EFAULT; 8521 goto fail; 8522 } 8523 size = sizeof (ibport_ioctl32_t); 8524 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8525 mode)) { 8526 rval = EFAULT; 8527 goto fail; 8528 } 8529 break; 8530 } 8531 case DDI_MODEL_NONE: 8532 size = sizeof (ibport_ioctl_t); 8533 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8534 rval = EFAULT; 8535 goto fail; 8536 } 8537 portioc.ibdioc.ioc_status = 0; 8538 portioc.ibdioc.ioc_portnum = port_state->id_port; 8539 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8540 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8541 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8542 rval = EINVAL; 8543 size = sizeof (ibd_ioctl_t); 8544 portioc.ibdioc.ioc_status = 8545 IBD_INVALID_PKEY_TBL_SIZE; 8546 if (ddi_copyout((void *)&portioc.ibdioc, 8547 (void *)arg, size, mode)) { 8548 rval = EFAULT; 8549 goto fail; 8550 } 8551 goto fail; 8552 } 8553 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8554 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8555 (void *)(portioc.ioc_pkeys), size, mode)) { 8556 rval = EFAULT; 8557 goto fail; 8558 } 8559 size = sizeof (ibport_ioctl_t); 8560 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8561 mode)) { 8562 rval = EFAULT; 8563 goto fail; 8564 } 8565 break; 8566 } 8567 #else /* ! 
_MULTI_DATAMODEL */ 8568 size = sizeof (ibport_ioctl_t); 8569 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8570 rval = EFAULT; 8571 goto fail; 8572 } 8573 portioc.ibdioc.ioc_status = 0; 8574 portioc.ibdioc.ioc_portnum = port_state->id_port; 8575 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8576 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8577 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8578 rval = EINVAL; 8579 size = sizeof (ibd_ioctl_t); 8580 portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE; 8581 if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg, 8582 size, mode)) { 8583 rval = EFAULT; 8584 goto fail; 8585 } 8586 goto fail; 8587 } 8588 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8589 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8590 (void *)(portioc.ioc_pkeys), size, mode)) { 8591 rval = EFAULT; 8592 goto fail; 8593 } 8594 size = sizeof (ibport_ioctl_t); 8595 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8596 mode)) { 8597 rval = EFAULT; 8598 goto fail; 8599 } 8600 #endif /* _MULTI_DATAMODEL */ 8601 8602 break; 8603 8604 case IBD_INFO_CMD_PKEYTBLSZ: 8605 if ((cmd.ioc_port_inst < 0) || ((port_state = 8606 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8607 DPRINT(10, "ibd_create_partition: failed to get" 8608 " state %d", cmd.ioc_port_inst); 8609 size = sizeof (ibd_ioctl_t); 8610 cmd.ioc_status = IBD_INVALID_PORT_INST; 8611 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8612 mode)) { 8613 return (EFAULT); 8614 } 8615 return (EINVAL); 8616 } 8617 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8618 port_state->id_port, &pinfop, &psize, &pinfosz); 8619 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8620 return (EINVAL); 8621 } 8622 #ifdef _MULTI_DATAMODEL 8623 switch (ddi_model_convert_from(mode & FMODELS)) { 8624 case DDI_MODEL_ILP32: { 8625 size = sizeof (ibport_ioctl32_t); 8626 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8627 rval = EFAULT; 8628 goto fail; 8629 } 8630 portioc32.ibdioc.ioc_status = 0; 8631 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8632 portioc32.ibdioc.ioc_hcaguid = 8633 port_state->id_hca_guid; 8634 portioc32.ibdioc.ioc_portguid = 8635 port_state->id_port_guid; 8636 portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8637 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8638 mode)) { 8639 rval = EFAULT; 8640 goto fail; 8641 } 8642 break; 8643 } 8644 case DDI_MODEL_NONE: 8645 size = sizeof (ibport_ioctl_t); 8646 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8647 rval = EFAULT; 8648 goto fail; 8649 } 8650 portioc.ibdioc.ioc_status = 0; 8651 portioc.ibdioc.ioc_portnum = port_state->id_port; 8652 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8653 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8654 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8655 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8656 mode)) { 8657 rval = EFAULT; 8658 goto fail; 8659 } 8660 break; 8661 } 8662 #else /* ! 
_MULTI_DATAMODEL */ 8663 size = sizeof (ibport_ioctl_t); 8664 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8665 rval = EFAULT; 8666 goto fail; 8667 } 8668 portioc.ibdioc.ioc_status = 0; 8669 portioc.ibdioc.ioc_portnum = port_state->id_port; 8670 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8671 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8672 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8673 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8674 mode)) { 8675 rval = EFAULT; 8676 goto fail; 8677 } 8678 #endif /* _MULTI_DATAMODEL */ 8679 break; 8680 8681 default: 8682 return (EINVAL); 8683 8684 } /* switch (cmd.ioc_info_cmd) */ 8685 fail: 8686 if (pinfop) { 8687 ibt_free_portinfo(pinfop, pinfosz); 8688 } 8689 return (rval); 8690 } 8691 8692 /* ARGSUSED */ 8693 static void 8694 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl, 8695 ibt_async_code_t code, ibt_async_event_t *event) 8696 { 8697 ibd_state_t *state = (ibd_state_t *)arg; 8698 link_state_t lstate; 8699 8700 switch (code) { 8701 case IBT_EVENT_PORT_UP: 8702 case IBT_ERROR_PORT_DOWN: 8703 if (ibd_get_port_state(state, &lstate) != 0) 8704 break; 8705 8706 if (state->id_link_state != lstate) { 8707 state->id_link_state = lstate; 8708 mac_link_update(state->id_mh, lstate); 8709 } 8710 break; 8711 default: 8712 break; 8713 } 8714 } 8715 8716 static int 8717 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate) 8718 { 8719 ibt_hca_portinfo_t *port_infop; 8720 uint_t psize, port_infosz; 8721 ibt_status_t ret; 8722 8723 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 8724 &port_infop, &psize, &port_infosz); 8725 if ((ret != IBT_SUCCESS) || (psize != 1)) 8726 return (-1); 8727 8728 state->id_sgid = *port_infop->p_sgid_tbl; 8729 state->id_link_speed = ibd_get_portspeed(state); 8730 8731 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) 8732 *lstate = LINK_STATE_UP; 8733 else 8734 *lstate = LINK_STATE_DOWN; 8735 8736 ibt_free_portinfo(port_infop, port_infosz); 8737 return (0); 8738 } 8739 8740 static int 8741 ibd_port_attach(dev_info_t *dip) 8742 { 8743 ibd_state_t *state; 8744 link_state_t lstate; 8745 int instance; 8746 ibt_status_t ret; 8747 8748 /* 8749 * Allocate softstate structure 8750 */ 8751 instance = ddi_get_instance(dip); 8752 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) { 8753 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed"); 8754 return (DDI_FAILURE); 8755 } 8756 8757 state = ddi_get_soft_state(ibd_list, instance); 8758 8759 state->id_dip = dip; 8760 state->id_type = IBD_PORT_DRIVER; 8761 8762 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 8763 "port-number", 0)) == 0) { 8764 DPRINT(10, "ibd_port_attach: invalid port number (%d)", 8765 state->id_port); 8766 return (DDI_FAILURE); 8767 } 8768 if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8769 "hca-guid", 0)) == 0) { 8770 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)", 8771 state->id_hca_guid); 8772 return (DDI_FAILURE); 8773 } 8774 if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8775 "port-guid", 0)) == 0) { 8776 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)", 8777 state->id_port_guid); 8778 return (DDI_FAILURE); 8779 } 8780 8781 /* 8782 * Attach to IBTL 8783 */ 8784 if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state, 8785 &state->id_ibt_hdl)) != IBT_SUCCESS) { 8786 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d", 8787 ret); 8788 goto done; 8789 } 8790 8791 state->id_mac_state |= 
IBD_DRV_IBTL_ATTACH_DONE; 8792 8793 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 8794 &state->id_hca_hdl)) != IBT_SUCCESS) { 8795 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d", 8796 ret); 8797 goto done; 8798 } 8799 state->id_mac_state |= IBD_DRV_HCA_OPENED; 8800 8801 /* Update link status */ 8802 8803 if (ibd_get_port_state(state, &lstate) != 0) { 8804 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d", 8805 ret); 8806 goto done; 8807 } 8808 state->id_link_state = lstate; 8809 /* 8810 * Register ibd interfaces with the Nemo framework 8811 */ 8812 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 8813 DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()"); 8814 goto done; 8815 } 8816 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 8817 8818 mac_link_update(state->id_mh, lstate); 8819 8820 return (DDI_SUCCESS); 8821 done: 8822 (void) ibd_port_unattach(state, dip); 8823 return (DDI_FAILURE); 8824 } 8825 8826 static int 8827 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip) 8828 { 8829 int instance; 8830 uint32_t progress = state->id_mac_state; 8831 ibt_status_t ret; 8832 8833 if (progress & IBD_DRV_MAC_REGISTERED) { 8834 (void) mac_unregister(state->id_mh); 8835 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 8836 } 8837 8838 if (progress & IBD_DRV_HCA_OPENED) { 8839 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 8840 IBT_SUCCESS) { 8841 ibd_print_warn(state, "failed to close " 8842 "HCA device, ret=%d", ret); 8843 } 8844 state->id_hca_hdl = NULL; 8845 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 8846 } 8847 8848 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 8849 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 8850 ibd_print_warn(state, 8851 "ibt_detach() failed, ret=%d", ret); 8852 } 8853 state->id_ibt_hdl = NULL; 8854 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 8855 } 8856 instance = ddi_get_instance(dip); 8857 ddi_soft_state_free(ibd_list, instance); 8858 8859 return (DDI_SUCCESS); 8860 } 8861 8862 ibt_status_t 8863 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr) 8864 { 8865 ibd_state_t *state; 8866 8867 mutex_enter(&ibd_objlist_lock); 8868 8869 /* Find the ibd state structure corresponding the partition */ 8870 for (state = ibd_objlist_head; state; state = state->id_next) { 8871 if (state->id_plinkid == linkid) { 8872 break; 8873 } 8874 } 8875 8876 if (state == NULL) { 8877 mutex_exit(&ibd_objlist_lock); 8878 return (IBT_NO_SUCH_OBJECT); 8879 } 8880 8881 attr->pa_dlinkid = state->id_dlinkid; 8882 attr->pa_plinkid = state->id_plinkid; 8883 attr->pa_port = state->id_port; 8884 attr->pa_hca_guid = state->id_hca_guid; 8885 attr->pa_port_guid = state->id_port_guid; 8886 attr->pa_pkey = state->id_pkey; 8887 8888 mutex_exit(&ibd_objlist_lock); 8889 8890 return (IBT_SUCCESS); 8891 } 8892 8893 ibt_status_t 8894 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts) 8895 { 8896 ibd_state_t *state; 8897 int n = 0; 8898 ibt_part_attr_t *attr; 8899 8900 mutex_enter(&ibd_objlist_lock); 8901 8902 for (state = ibd_objlist_head; state; state = state->id_next) 8903 n++; 8904 8905 *nparts = n; 8906 if (n == 0) { 8907 *attr_list = NULL; 8908 mutex_exit(&ibd_objlist_lock); 8909 return (IBT_SUCCESS); 8910 } 8911 8912 *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP); 8913 attr = *attr_list; 8914 for (state = ibd_objlist_head; state; state = state->id_next) { 8915 #ifdef DEBUG 8916 ASSERT(n > 0); 8917 n--; 8918 #endif 8919 attr->pa_dlinkid = state->id_dlinkid; 8920 attr->pa_plinkid = state->id_plinkid; 
8921 attr->pa_port = state->id_port; 8922 attr->pa_hca_guid = state->id_hca_guid; 8923 attr->pa_port_guid = state->id_port_guid; 8924 attr->pa_pkey = state->id_pkey; 8925 attr++; 8926 } 8927 8928 mutex_exit(&ibd_objlist_lock); 8929 return (IBT_SUCCESS); 8930 } 8931
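/*
 * Usage sketch (not part of the driver): an in-kernel consumer that needs
 * the P_Key associated with a partition datalink could call
 * ibd_get_part_attr() roughly as follows, checking the status before using
 * the attributes. The variable names here are illustrative only.
 *
 *	ibt_part_attr_t attr;
 *	ib_pkey_t pkey;
 *
 *	if (ibd_get_part_attr(linkid, &attr) == IBT_SUCCESS)
 *		pkey = attr.pa_pkey;
 *
 * For ibd_get_all_part_attr(), the caller owns the returned array; the
 * natural counterpart to the kmem_alloc() above would be
 * kmem_free(*attr_list, sizeof (ibt_part_attr_t) * *nparts).
 */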