/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip6.h>		/* for ip6_t */
#include <inet/tcp.h>		/* for tcph_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

#include <sys/priv_names.h>
#include <sys/dls.h>
#include <sys/dld_ioc.h>
#include <sys/policy.h>
#include <sys/ibpart.h>
#include <sys/file.h>

/*
 * The write-up below includes details on the following:
 * 1. The dladm administrative model.
 * 2. Late HCA initialization feature.
 * 3. Brussels support and its implications for the current architecture.
 *
 * 1. The dladm administrative model.
 * ------------------------------------------
 * With the dladm model, ibnex will create one ibd instance per port. These
 * instances will be created independently of the port state.
 *
 * The ibd driver is two-faceted: one side of it works as the port driver and
 * the other as the partition object driver.
 *
 * The port instance is a child of the HCA, and will have an entry in devfs.
 * A DDI attach only happens for the port driver, and its attach is
 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
 * handled in ibd_port_unattach().
 *
 * The partition object is only a registrant to the mac layer via mac_register()
 * and does not have an entry in the device tree. There is no DDI softstate
 * managed by the DDI framework for the partition objects.
 * However, the state is managed inside the ibd driver, and every partition
 * object hangs off the "ibd_objlist_head".
 *
 * The partition object first comes into existence when a user runs the
 * 'create-part' subcommand of dladm. This is like invoking the attach entry
 * point of the partition object. The partition object goes away with the
 * 'delete-part' subcommand of dladm. This is like invoking the detach entry
 * point of the partition object.
 *
 * The create-part and delete-part subcommands result in dld ioctls that end up
 * calling ibd_create_partition() and ibd_delete_partition() respectively.
 * These ioctls are registered with the dld layer in _init() via a call to
 * dld_ioc_register().
 *
 * The port instance by itself cannot be plumbed. It is only the partition
 * objects that can be plumbed; they alone participate in I/O, not the
 * port driver.
 *
 * There are some info ioctls supported in ibd which are used by dladm(1M) to
 * display useful information. The info entry point for ibd is
 * ibd_get_partition_info().
 *
 * 2. Late HCA initialization feature.
 * ------------------------------------
 * As mentioned in section 1, the user creates the partition objects via
 * dladm(1M). It is possible that:
 * a) The physical port itself is down and the SM cannot be reached.
 * b) The PKEY specified by the user has not been created in the SM yet.
 * c) An IPoIB broadcast group for the specified PKEY is not present.
 *
 * In all of the above cases, complete initialization of the partition object
 * is not possible. However, the new model allows the creation of partition
 * objects even in such cases but will defer the initialization for later.
 * When such a partition object is plumbed, the link state will be displayed as
 * "down".
 * The driver, at this point, is listening for events that herald the
 * availability of resources -
 * i) LINK_UP when the link becomes available
 * ii) PORT_CHANGE when the PKEY has been created
 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
 * created
 * via ibd_async_handler() for events i) and ii), and via
 * ibd_snet_notices_handler() for iii).
 * The driver handles these events (as and when they arrive) and completes the
 * initialization of the partition object, transitioning it to a usable state.
 *
 * 3. Brussels support and its implications for the current architecture.
 * ---------------------------------------------------------------------
 * The Brussels support introduces two new interfaces to the ibd driver -
 * ibd_m_getprop() and ibd_m_setprop().
 * These interfaces allow setting and retrieval of certain properties.
 * Some of them are public properties while most others are private properties
 * meant to be used by developers. Tuning the latter kind can cause
 * performance issues and should not be done without understanding the
 * implications. All properties are specific to an instance of either the
 * partition object or the port driver.
 *
 * The public properties are: mtu and linkmode.
 * mtu is a read-only property.
 * linkmode can take two values - UD and CM.
 *
 * Changing the linkmode requires some bookkeeping in the driver. The
 * capabilities need to be re-reported to the mac layer. This is done by
 * calling mac_capab_update(). The maxsdu is updated by calling
 * mac_maxsdu_update().
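 *
 * For example (an illustrative sketch only; the partition link name below is
 * hypothetical and the exact property/value syntax is as documented in
 * dladm(1M)), an administrator could switch a partition datalink between the
 * two modes with something like:
 *	# dladm set-linkprop -p linkmode=cm p8001.ibd0
 *	# dladm set-linkprop -p linkmode=ud p8001.ibd0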
 * The private properties retain their values across the change of linkmode.
 * NOTE:
 * - The port driver does not support any property apart from mtu.
 * - All other properties are only meant for the partition object.
 * - The properties cannot be set when an instance is plumbed. The
 *   instance has to be unplumbed to effect any setting.
 */

/*
 * Driver wide tunables
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *	The softintr mechanism allows ibd to avoid event queue overflows if
 *	the receive/completion handlers are expected to be expensive. These
 *	are enabled by default.
 *
 * ibd_log_sz
 *	This specifies the size of the ibd log buffer in bytes. The buffer is
 *	allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;

#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#ifdef IBD_LOGGING
#define	IBD_LOG_SZ			ibd_log_sz
#endif

/* Post IBD_RX_POST_CNT receive work requests at a time. */
#define	IBD_RX_POST_CNT			8

/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
#define	IBD_LOG_RX_POST			4

/* Minimum number of receive work requests driver needs to always have */
#define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)

/*
 * LSO parameters
 */
#define	IBD_LSO_MAXLEN			65536
#define	IBD_LSO_BUFSZ			8192

/*
 * Async operation states
 */
#define	IBD_OP_NOTSTARTED		0
#define	IBD_OP_ONGOING			1
#define	IBD_OP_COMPLETED		2
#define	IBD_OP_ERRORED			3
#define	IBD_OP_ROUTERED			4

/*
 * State of IBD driver initialization during attach/m_start
 */
#define	IBD_DRV_STATE_INITIALIZED	0x000001
#define	IBD_DRV_RXINTR_ADDED		0x000002
#define	IBD_DRV_TXINTR_ADDED		0x000004
#define	IBD_DRV_IBTL_ATTACH_DONE	0x000008
#define	IBD_DRV_HCA_OPENED		0x000010
#define	IBD_DRV_PD_ALLOCD		0x000020
#define	IBD_DRV_MAC_REGISTERED		0x000040
#define	IBD_DRV_PORT_DETAILS_OBTAINED	0x000080
#define	IBD_DRV_BCAST_GROUP_FOUND	0x000100
#define	IBD_DRV_ACACHE_INITIALIZED	0x000200
#define	IBD_DRV_CQS_ALLOCD		0x000400
#define	IBD_DRV_UD_CHANNEL_SETUP	0x000800
#define	IBD_DRV_TXLIST_ALLOCD		0x001000
#define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x002000
#define	IBD_DRV_RXLIST_ALLOCD		0x004000
#define	IBD_DRV_BCAST_GROUP_JOINED	0x008000
#define	IBD_DRV_ASYNC_THR_CREATED	0x010000
#define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x020000
#define	IBD_DRV_SM_NOTICES_REGISTERED	0x040000
#define	IBD_DRV_STARTED			0x080000
#define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
#define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
#define	IBD_DRV_RC_LISTEN		0x400000
#ifdef DEBUG
#define	IBD_DRV_RC_PRIVATE_STATE	0x800000
#endif
#define	IBD_DRV_IN_DELETION		0x1000000
#define	IBD_DRV_IN_LATE_HCA_INIT	0x2000000
#define	IBD_DRV_REQ_LIST_INITED		0x4000000

/*
 * Start/stop in-progress flags; note that restart must always remain
 * the OR of start and stop flag values.
252 */ 253 #define IBD_DRV_START_IN_PROGRESS 0x10000000 254 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 255 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 256 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS 257 258 /* 259 * Miscellaneous constants 260 */ 261 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 262 #define IBD_DEF_MAX_SDU 2044 263 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE) 264 #define IBD_DEF_RC_MAX_SDU 65520 265 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE) 266 #define IBD_DEFAULT_QKEY 0xB1B 267 #ifdef IBD_LOGGING 268 #define IBD_DMAX_LINE 100 269 #endif 270 271 /* 272 * Enumerations for link states 273 */ 274 typedef enum { 275 IBD_LINK_DOWN, 276 IBD_LINK_UP, 277 IBD_LINK_UP_ABSENT 278 } ibd_link_op_t; 279 280 /* 281 * Driver State Pointer 282 */ 283 void *ibd_list; 284 285 /* 286 * Driver Global Data 287 */ 288 ibd_global_state_t ibd_gstate; 289 290 /* 291 * Partition object list 292 */ 293 ibd_state_t *ibd_objlist_head = NULL; 294 kmutex_t ibd_objlist_lock; 295 296 /* 297 * Logging 298 */ 299 #ifdef IBD_LOGGING 300 kmutex_t ibd_lbuf_lock; 301 uint8_t *ibd_lbuf; 302 uint32_t ibd_lbuf_ndx; 303 #endif 304 305 /* 306 * Required system entry points 307 */ 308 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 309 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 310 311 /* 312 * Required driver entry points for GLDv3 313 */ 314 static int ibd_m_stat(void *, uint_t, uint64_t *); 315 static int ibd_m_start(void *); 316 static void ibd_m_stop(void *); 317 static int ibd_m_promisc(void *, boolean_t); 318 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 319 static int ibd_m_unicst(void *, const uint8_t *); 320 static mblk_t *ibd_m_tx(void *, mblk_t *); 321 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 322 323 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 324 const void *); 325 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 326 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t, 327 mac_prop_info_handle_t); 328 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t, 329 const void *); 330 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *); 331 332 /* 333 * Private driver entry points for GLDv3 334 */ 335 336 /* 337 * Initialization 338 */ 339 static int ibd_state_init(ibd_state_t *, dev_info_t *); 340 static int ibd_init_txlist(ibd_state_t *); 341 static int ibd_init_rxlist(ibd_state_t *); 342 static int ibd_acache_init(ibd_state_t *); 343 #ifdef IBD_LOGGING 344 static void ibd_log_init(void); 345 #endif 346 347 /* 348 * Termination/cleanup 349 */ 350 static void ibd_state_fini(ibd_state_t *); 351 static void ibd_fini_txlist(ibd_state_t *); 352 static void ibd_fini_rxlist(ibd_state_t *); 353 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 354 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 355 static void ibd_acache_fini(ibd_state_t *); 356 #ifdef IBD_LOGGING 357 static void ibd_log_fini(void); 358 #endif 359 360 /* 361 * Allocation/acquire/map routines 362 */ 363 static int ibd_alloc_tx_copybufs(ibd_state_t *); 364 static int ibd_alloc_rx_copybufs(ibd_state_t *); 365 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 366 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 367 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 368 uint32_t *); 369 370 /* 371 * Free/release/unmap routines 372 */ 373 static void ibd_free_rwqe(ibd_state_t *, 
ibd_rwqe_t *); 374 static void ibd_free_tx_copybufs(ibd_state_t *); 375 static void ibd_free_rx_copybufs(ibd_state_t *); 376 static void ibd_free_rx_rsrcs(ibd_state_t *); 377 static void ibd_free_tx_lsobufs(ibd_state_t *); 378 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 379 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 380 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 381 382 /* 383 * Handlers/callback routines 384 */ 385 static uint_t ibd_intr(caddr_t); 386 static uint_t ibd_tx_recycle(caddr_t); 387 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 388 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 389 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 390 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 391 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 392 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 393 static void ibd_freemsg_cb(char *); 394 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 395 ibt_async_event_t *); 396 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 397 ibt_async_event_t *); 398 static void ibd_snet_notices_handler(void *, ib_gid_t, 399 ibt_subnet_event_code_t, ibt_subnet_event_t *); 400 401 /* 402 * Send/receive routines 403 */ 404 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 405 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 406 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 407 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 408 409 /* 410 * Threads 411 */ 412 static void ibd_async_work(ibd_state_t *); 413 414 /* 415 * Async tasks 416 */ 417 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 418 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 419 static void ibd_async_setprom(ibd_state_t *); 420 static void ibd_async_unsetprom(ibd_state_t *); 421 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 422 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 423 static void ibd_async_txsched(ibd_state_t *); 424 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 425 426 /* 427 * Async task helpers 428 */ 429 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 430 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 431 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 432 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 433 ipoib_mac_t *, ipoib_mac_t *); 434 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 435 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 436 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 437 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 438 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 439 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 440 static uint64_t ibd_get_portspeed(ibd_state_t *); 441 static boolean_t ibd_async_safe(ibd_state_t *); 442 static void ibd_async_done(ibd_state_t *); 443 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 444 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 445 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 446 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 447 448 /* 449 * Helpers for attach/start routines 450 */ 451 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 452 static int ibd_record_capab(ibd_state_t *); 453 static int 
ibd_get_port_details(ibd_state_t *); 454 static int ibd_alloc_cqs(ibd_state_t *); 455 static int ibd_setup_ud_channel(ibd_state_t *); 456 static int ibd_start(ibd_state_t *); 457 static int ibd_undo_start(ibd_state_t *, link_state_t); 458 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 459 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 460 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip); 461 static int ibd_part_unattach(ibd_state_t *state); 462 static int ibd_port_attach(dev_info_t *); 463 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip); 464 static int ibd_get_port_state(ibd_state_t *, link_state_t *); 465 466 467 /* 468 * Miscellaneous helpers 469 */ 470 static int ibd_sched_poll(ibd_state_t *, int, int); 471 static void ibd_resume_transmission(ibd_state_t *); 472 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 473 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 474 static void *list_get_head(list_t *); 475 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 476 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 477 478 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *); 479 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *); 480 481 #ifdef IBD_LOGGING 482 static void ibd_log(const char *, ...); 483 #endif 484 485 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 486 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 487 488 /* Module Driver Info */ 489 static struct modldrv ibd_modldrv = { 490 &mod_driverops, /* This one is a driver */ 491 "InfiniBand GLDv3 Driver", /* short description */ 492 &ibd_dev_ops /* driver specific ops */ 493 }; 494 495 /* Module Linkage */ 496 static struct modlinkage ibd_modlinkage = { 497 MODREV_1, (void *)&ibd_modldrv, NULL 498 }; 499 500 /* 501 * Module (static) info passed to IBTL during ibt_attach 502 */ 503 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 504 IBTI_V_CURR, 505 IBT_NETWORK, 506 ibd_async_handler, 507 NULL, 508 "IBPART" 509 }; 510 511 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = { 512 IBTI_V_CURR, 513 IBT_NETWORK, 514 ibdpd_async_handler, 515 NULL, 516 "IPIB" 517 }; 518 519 /* 520 * GLDv3 entry points 521 */ 522 #define IBD_M_CALLBACK_FLAGS \ 523 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) 524 525 static mac_callbacks_t ibd_m_callbacks = { 526 IBD_M_CALLBACK_FLAGS, 527 ibd_m_stat, 528 ibd_m_start, 529 ibd_m_stop, 530 ibd_m_promisc, 531 ibd_m_multicst, 532 ibd_m_unicst, 533 ibd_m_tx, 534 NULL, 535 NULL, 536 ibd_m_getcapab, 537 NULL, 538 NULL, 539 ibd_m_setprop, 540 ibd_m_getprop, 541 ibd_m_propinfo 542 }; 543 544 /* Private properties */ 545 char *ibd_priv_props[] = { 546 "_ibd_broadcast_group", 547 "_ibd_coalesce_completions", 548 "_ibd_create_broadcast_group", 549 "_ibd_hash_size", 550 "_ibd_lso_enable", 551 "_ibd_num_ah", 552 "_ibd_num_lso_bufs", 553 "_ibd_rc_enable_srq", 554 "_ibd_rc_num_rwqe", 555 "_ibd_rc_num_srq", 556 "_ibd_rc_num_swqe", 557 "_ibd_rc_rx_comp_count", 558 "_ibd_rc_rx_comp_usec", 559 "_ibd_rc_rx_copy_thresh", 560 "_ibd_rc_rx_rwqe_thresh", 561 "_ibd_rc_tx_comp_count", 562 "_ibd_rc_tx_comp_usec", 563 "_ibd_rc_tx_copy_thresh", 564 "_ibd_ud_num_rwqe", 565 "_ibd_ud_num_swqe", 566 "_ibd_ud_rx_comp_count", 567 "_ibd_ud_rx_comp_usec", 568 "_ibd_ud_tx_comp_count", 569 "_ibd_ud_tx_comp_usec", 570 "_ibd_ud_tx_copy_thresh", 571 NULL 572 }; 573 574 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *); 575 static int 
ibd_delete_partition(void *, intptr_t, int, cred_t *, int *); 576 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *); 577 578 static dld_ioc_info_t ibd_dld_ioctl_list[] = { 579 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t), 580 ibd_create_partition, secpolicy_dl_config}, 581 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t), 582 ibd_delete_partition, secpolicy_dl_config}, 583 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t), 584 ibd_get_partition_info, NULL} 585 }; 586 587 /* 588 * Fill/clear <scope> and <p_key> in multicast/broadcast address 589 */ 590 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 591 { \ 592 *(uint32_t *)((char *)(maddr) + 4) |= \ 593 htonl((uint32_t)(scope) << 16); \ 594 *(uint32_t *)((char *)(maddr) + 8) |= \ 595 htonl((uint32_t)(pkey) << 16); \ 596 } 597 598 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 599 { \ 600 *(uint32_t *)((char *)(maddr) + 4) &= \ 601 htonl(~((uint32_t)0xF << 16)); \ 602 *(uint32_t *)((char *)(maddr) + 8) &= \ 603 htonl(~((uint32_t)0xFFFF << 16)); \ 604 } 605 606 /* 607 * Rudimentary debugging support 608 */ 609 #ifdef DEBUG 610 int ibd_debuglevel = 100; 611 void 612 debug_print(int l, char *fmt, ...) 613 { 614 va_list ap; 615 616 if (l < ibd_debuglevel) 617 return; 618 va_start(ap, fmt); 619 vcmn_err(CE_CONT, fmt, ap); 620 va_end(ap); 621 } 622 #endif 623 624 /* 625 * Common routine to print warning messages; adds in hca guid, port number 626 * and pkey to be able to identify the IBA interface. 627 */ 628 void 629 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 630 { 631 ib_guid_t hca_guid; 632 char ibd_print_buf[256]; 633 int len; 634 va_list ap; 635 636 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 637 0, "hca-guid", 0); 638 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 639 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 640 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 641 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 642 va_start(ap, fmt); 643 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 644 fmt, ap); 645 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 646 va_end(ap); 647 } 648 649 /* 650 * Warlock directives 651 */ 652 653 /* 654 * id_lso_lock 655 * 656 * state->id_lso->bkt_nfree may be accessed without a lock to 657 * determine the threshold at which we have to ask the nw layer 658 * to resume transmission (see ibd_resume_transmission()). 
659 */ 660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 661 ibd_state_t::id_lso)) 662 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 663 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 664 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 665 666 /* 667 * id_scq_poll_lock 668 */ 669 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 670 ibd_state_t::id_scq_poll_busy)) 671 672 /* 673 * id_txpost_lock 674 */ 675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 676 ibd_state_t::id_tx_head)) 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 678 ibd_state_t::id_tx_busy)) 679 680 /* 681 * id_acache_req_lock 682 */ 683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 684 ibd_state_t::id_acache_req_cv)) 685 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 686 ibd_state_t::id_req_list)) 687 _NOTE(SCHEME_PROTECTS_DATA("atomic", 688 ibd_acache_s::ac_ref)) 689 690 /* 691 * id_ac_mutex 692 * 693 * This mutex is actually supposed to protect id_ah_op as well, 694 * but this path of the code isn't clean (see update of id_ah_op 695 * in ibd_async_acache(), immediately after the call to 696 * ibd_async_mcache()). For now, we'll skip this check by 697 * declaring that id_ah_op is protected by some internal scheme 698 * that warlock isn't aware of. 699 */ 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 701 ibd_state_t::id_ah_active)) 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 703 ibd_state_t::id_ah_free)) 704 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 705 ibd_state_t::id_ah_addr)) 706 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 707 ibd_state_t::id_ah_op)) 708 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 709 ibd_state_t::id_ah_error)) 710 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 711 ibd_state_t::id_ac_hot_ace)) 712 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 713 714 /* 715 * id_mc_mutex 716 */ 717 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 718 ibd_state_t::id_mc_full)) 719 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 720 ibd_state_t::id_mc_non)) 721 722 /* 723 * id_trap_lock 724 */ 725 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 726 ibd_state_t::id_trap_cv)) 727 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 728 ibd_state_t::id_trap_stop)) 729 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 730 ibd_state_t::id_trap_inprog)) 731 732 /* 733 * id_prom_op 734 */ 735 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 736 ibd_state_t::id_prom_op)) 737 738 /* 739 * id_sched_lock 740 */ 741 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 742 ibd_state_t::id_sched_needed)) 743 744 /* 745 * id_link_mutex 746 */ 747 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 748 ibd_state_t::id_link_state)) 749 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 750 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 751 ibd_state_t::id_link_speed)) 752 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 753 754 /* 755 * id_tx_list.dl_mutex 756 */ 757 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 758 ibd_state_t::id_tx_list.dl_head)) 759 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 760 ibd_state_t::id_tx_list.dl_pending_sends)) 761 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 762 ibd_state_t::id_tx_list.dl_cnt)) 763 764 /* 765 * id_rx_list.dl_mutex 766 */ 767 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 768 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 769 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 770 ibd_state_t::id_rx_list.dl_cnt)) 771 772 773 /* 774 * Items protected by atomic updates 775 */ 776 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 777 ibd_state_s::id_brd_rcv 778 ibd_state_s::id_brd_xmt 779 ibd_state_s::id_multi_rcv 780 ibd_state_s::id_multi_xmt 781 ibd_state_s::id_num_intrs 782 ibd_state_s::id_rcv_bytes 783 ibd_state_s::id_rcv_pkt 784 ibd_state_s::id_rx_post_queue_index 785 ibd_state_s::id_tx_short 786 ibd_state_s::id_xmt_bytes 787 ibd_state_s::id_xmt_pkt 788 ibd_state_s::rc_rcv_trans_byte 789 ibd_state_s::rc_rcv_trans_pkt 790 ibd_state_s::rc_rcv_copy_byte 791 ibd_state_s::rc_rcv_copy_pkt 792 ibd_state_s::rc_xmt_bytes 793 ibd_state_s::rc_xmt_small_pkt 794 ibd_state_s::rc_xmt_fragmented_pkt 795 ibd_state_s::rc_xmt_map_fail_pkt 796 ibd_state_s::rc_xmt_map_succ_pkt)) 797 798 /* 799 * Non-mutex protection schemes for data elements. Almost all of 800 * these are non-shared items. 801 */ 802 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 803 callb_cpr 804 ib_gid_s 805 ib_header_info 806 ibd_acache_rq 807 ibd_acache_s::ac_mce 808 ibd_acache_s::ac_chan 809 ibd_mcache::mc_fullreap 810 ibd_mcache::mc_jstate 811 ibd_mcache::mc_req 812 ibd_rwqe_s 813 ibd_swqe_s 814 ibd_wqe_s 815 ibt_wr_ds_s::ds_va 816 ibt_wr_lso_s 817 ipoib_mac::ipoib_qpn 818 mac_capab_lso_s 819 msgb::b_next 820 msgb::b_cont 821 msgb::b_rptr 822 msgb::b_wptr 823 ibd_state_s::id_bgroup_created 824 ibd_state_s::id_mac_state 825 ibd_state_s::id_mtu 826 ibd_state_s::id_ud_num_rwqe 827 ibd_state_s::id_ud_num_swqe 828 ibd_state_s::id_qpnum 829 ibd_state_s::id_rcq_hdl 830 ibd_state_s::id_rx_buf_sz 831 ibd_state_s::id_rx_bufs 832 ibd_state_s::id_rx_mr_hdl 833 ibd_state_s::id_rx_wqes 834 ibd_state_s::id_rxwcs 835 ibd_state_s::id_rxwcs_size 836 ibd_state_s::id_rx_nqueues 837 ibd_state_s::id_rx_queues 838 ibd_state_s::id_scope 839 ibd_state_s::id_scq_hdl 840 ibd_state_s::id_tx_buf_sz 841 ibd_state_s::id_tx_bufs 842 ibd_state_s::id_tx_mr_hdl 843 ibd_state_s::id_tx_rel_list.dl_cnt 844 ibd_state_s::id_tx_wqes 845 ibd_state_s::id_txwcs 846 ibd_state_s::id_txwcs_size 847 ibd_state_s::rc_listen_hdl 848 ibd_state_s::rc_listen_hdl_OFED_interop 849 ibd_state_s::rc_srq_size 850 ibd_state_s::rc_srq_rwqes 851 ibd_state_s::rc_srq_rx_bufs 852 ibd_state_s::rc_srq_rx_mr_hdl 853 ibd_state_s::rc_tx_largebuf_desc_base 854 ibd_state_s::rc_tx_mr_bufs 855 ibd_state_s::rc_tx_mr_hdl 856 ipha_s 857 icmph_s 858 ibt_path_info_s::pi_sid 859 ibd_rc_chan_s::ace 860 ibd_rc_chan_s::chan_hdl 861 ibd_rc_chan_s::state 862 ibd_rc_chan_s::chan_state 863 ibd_rc_chan_s::is_tx_chan 864 ibd_rc_chan_s::rcq_hdl 865 ibd_rc_chan_s::rcq_size 866 ibd_rc_chan_s::scq_hdl 867 ibd_rc_chan_s::scq_size 868 ibd_rc_chan_s::requester_gid 869 ibd_rc_chan_s::requester_pkey 870 ibd_rc_chan_s::rx_bufs 871 ibd_rc_chan_s::rx_mr_hdl 872 ibd_rc_chan_s::rx_rwqes 873 ibd_rc_chan_s::tx_wqes 874 ibd_rc_chan_s::tx_mr_bufs 875 ibd_rc_chan_s::tx_mr_hdl 876 ibd_rc_chan_s::tx_rel_list.dl_cnt 877 ibd_rc_chan_s::tx_trans_error_cnt 878 ibd_rc_tx_largebuf_s::lb_buf 879 ibd_rc_msg_hello_s 880 ibt_cm_return_args_s)) 881 882 /* 883 * ibd_rc_chan_s::next is protected by two mutexes: 884 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex 885 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. 
 */
_NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
    ibd_rc_chan_s::next))

/*
 * ibd_state_s.rc_tx_large_bufs_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_free_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_nfree))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_rc_tx_largebuf_s::lb_next))

/*
 * ibd_acache_s.tx_too_big_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
    ibd_acache_s::tx_too_big_ongoing))

/*
 * tx_wqe_list.dl_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_cnt))

/*
 * ibd_state_s.rc_ace_recycle_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
    ibd_state_s::rc_ace_recycle))

/*
 * rc_srq_rwqe_list.dl_mutex
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_cnt))

/*
 * Non-mutex protection schemes for data elements. They are counters
 * for problem diagnosis and do not need to be protected.
 */
_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    ibd_state_s::rc_rcv_alloc_fail
    ibd_state_s::rc_rcq_invoke
    ibd_state_s::rc_rcq_err
    ibd_state_s::rc_ace_not_found
    ibd_state_s::rc_xmt_drop_too_long_pkt
    ibd_state_s::rc_xmt_icmp_too_long_pkt
    ibd_state_s::rc_xmt_reenter_too_long_pkt
    ibd_state_s::rc_swqe_short
    ibd_state_s::rc_swqe_mac_update
    ibd_state_s::rc_xmt_buf_short
    ibd_state_s::rc_xmt_buf_mac_update
    ibd_state_s::rc_scq_no_swqe
    ibd_state_s::rc_scq_no_largebuf
    ibd_state_s::rc_scq_invoke
    ibd_state_s::rc_conn_succ
    ibd_state_s::rc_conn_fail
    ibd_state_s::rc_null_conn
    ibd_state_s::rc_no_estab_conn
    ibd_state_s::rc_act_close
    ibd_state_s::rc_pas_close
    ibd_state_s::rc_delay_ace_recycle
    ibd_state_s::rc_act_close_simultaneous
    ibd_state_s::rc_reset_cnt))

#ifdef DEBUG
/*
 * Non-mutex protection schemes for data elements. They are counters
 * for problem diagnosis and do not need to be protected.
963 */ 964 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 965 ibd_state_s::rc_rwqe_short 966 ibd_rc_stat_s::rc_rcv_trans_byte 967 ibd_rc_stat_s::rc_rcv_trans_pkt 968 ibd_rc_stat_s::rc_rcv_copy_byte 969 ibd_rc_stat_s::rc_rcv_copy_pkt 970 ibd_rc_stat_s::rc_rcv_alloc_fail 971 ibd_rc_stat_s::rc_rcq_invoke 972 ibd_rc_stat_s::rc_rcq_err 973 ibd_rc_stat_s::rc_scq_invoke 974 ibd_rc_stat_s::rc_rwqe_short 975 ibd_rc_stat_s::rc_xmt_bytes 976 ibd_rc_stat_s::rc_xmt_small_pkt 977 ibd_rc_stat_s::rc_xmt_fragmented_pkt 978 ibd_rc_stat_s::rc_xmt_map_fail_pkt 979 ibd_rc_stat_s::rc_xmt_map_succ_pkt 980 ibd_rc_stat_s::rc_ace_not_found 981 ibd_rc_stat_s::rc_scq_no_swqe 982 ibd_rc_stat_s::rc_scq_no_largebuf 983 ibd_rc_stat_s::rc_swqe_short 984 ibd_rc_stat_s::rc_swqe_mac_update 985 ibd_rc_stat_s::rc_xmt_buf_short 986 ibd_rc_stat_s::rc_xmt_buf_mac_update 987 ibd_rc_stat_s::rc_conn_succ 988 ibd_rc_stat_s::rc_conn_fail 989 ibd_rc_stat_s::rc_null_conn 990 ibd_rc_stat_s::rc_no_estab_conn 991 ibd_rc_stat_s::rc_act_close 992 ibd_rc_stat_s::rc_pas_close 993 ibd_rc_stat_s::rc_delay_ace_recycle 994 ibd_rc_stat_s::rc_act_close_simultaneous 995 ibd_rc_stat_s::rc_reset_cnt)) 996 #endif 997 998 int 999 _init() 1000 { 1001 int status; 1002 1003 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 1004 PAGESIZE), 0); 1005 if (status != 0) { 1006 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 1007 return (status); 1008 } 1009 1010 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL); 1011 1012 mac_init_ops(&ibd_dev_ops, "ibp"); 1013 status = mod_install(&ibd_modlinkage); 1014 if (status != 0) { 1015 DPRINT(10, "_init:failed in mod_install()"); 1016 ddi_soft_state_fini(&ibd_list); 1017 mac_fini_ops(&ibd_dev_ops); 1018 return (status); 1019 } 1020 1021 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); 1022 mutex_enter(&ibd_gstate.ig_mutex); 1023 ibd_gstate.ig_ibt_hdl = NULL; 1024 ibd_gstate.ig_ibt_hdl_ref_cnt = 0; 1025 ibd_gstate.ig_service_list = NULL; 1026 mutex_exit(&ibd_gstate.ig_mutex); 1027 1028 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list, 1029 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) { 1030 return (EIO); 1031 } 1032 1033 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr); 1034 1035 #ifdef IBD_LOGGING 1036 ibd_log_init(); 1037 #endif 1038 return (0); 1039 } 1040 1041 int 1042 _info(struct modinfo *modinfop) 1043 { 1044 return (mod_info(&ibd_modlinkage, modinfop)); 1045 } 1046 1047 int 1048 _fini() 1049 { 1050 int status; 1051 1052 status = mod_remove(&ibd_modlinkage); 1053 if (status != 0) 1054 return (status); 1055 1056 ibt_unregister_part_attr_cb(); 1057 1058 mac_fini_ops(&ibd_dev_ops); 1059 mutex_destroy(&ibd_objlist_lock); 1060 ddi_soft_state_fini(&ibd_list); 1061 mutex_destroy(&ibd_gstate.ig_mutex); 1062 #ifdef IBD_LOGGING 1063 ibd_log_fini(); 1064 #endif 1065 return (0); 1066 } 1067 1068 /* 1069 * Convert the GID part of the mac address from network byte order 1070 * to host order. 1071 */ 1072 static void 1073 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 1074 { 1075 ib_sn_prefix_t nbopref; 1076 ib_guid_t nboguid; 1077 1078 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 1079 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 1080 dgid->gid_prefix = b2h64(nbopref); 1081 dgid->gid_guid = b2h64(nboguid); 1082 } 1083 1084 /* 1085 * Create the IPoIB address in network byte order from host order inputs. 
1086 */ 1087 static void 1088 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 1089 ib_guid_t guid) 1090 { 1091 ib_sn_prefix_t nbopref; 1092 ib_guid_t nboguid; 1093 1094 mac->ipoib_qpn = htonl(qpn); 1095 nbopref = h2b64(prefix); 1096 nboguid = h2b64(guid); 1097 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 1098 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 1099 } 1100 1101 /* 1102 * Send to the appropriate all-routers group when the IBA multicast group 1103 * does not exist, based on whether the target group is v4 or v6. 1104 */ 1105 static boolean_t 1106 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 1107 ipoib_mac_t *rmac) 1108 { 1109 boolean_t retval = B_TRUE; 1110 uint32_t adjscope = state->id_scope << 16; 1111 uint32_t topword; 1112 1113 /* 1114 * Copy the first 4 bytes in without assuming any alignment of 1115 * input mac address; this will have IPoIB signature, flags and 1116 * scope bits. 1117 */ 1118 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 1119 topword = ntohl(topword); 1120 1121 /* 1122 * Generate proper address for IPv4/v6, adding in the Pkey properly. 1123 */ 1124 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 1125 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 1126 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 1127 ((uint32_t)(state->id_pkey << 16))), 1128 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 1129 else 1130 /* 1131 * Does not have proper bits in the mgid address. 1132 */ 1133 retval = B_FALSE; 1134 1135 return (retval); 1136 } 1137 1138 /* 1139 * Membership states for different mcg's are tracked by two lists: 1140 * the "non" list is used for promiscuous mode, when all mcg traffic 1141 * needs to be inspected. This type of membership is never used for 1142 * transmission, so there can not be an AH in the active list 1143 * corresponding to a member in this list. This list does not need 1144 * any protection, since all operations are performed by the async 1145 * thread. 1146 * 1147 * "Full" and "SendOnly" membership is tracked using a single list, 1148 * the "full" list. This is because this single list can then be 1149 * searched during transmit to a multicast group (if an AH for the 1150 * mcg is not found in the active list), since at least one type 1151 * of membership must be present before initiating the transmit. 1152 * This list is also emptied during driver detach, since sendonly 1153 * membership acquired during transmit is dropped at detach time 1154 * along with ipv4 broadcast full membership. Insert/deletes to 1155 * this list are done only by the async thread, but it is also 1156 * searched in program context (see multicast disable case), thus 1157 * the id_mc_mutex protects the list. The driver detach path also 1158 * deconstructs the "full" list, but it ensures that the async 1159 * thread will not be accessing the list (by blocking out mcg 1160 * trap handling and making sure no more Tx reaping will happen). 1161 * 1162 * Currently, an IBA attach is done in the SendOnly case too, 1163 * although this is not required. 
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&state->id_mc_full, mce)
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&state->id_mc_non, mce)
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&state->id_mc_full, mce)
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&state->id_mc_non, mce)

static void *
list_get_head(list_t *list)
{
	list_node_t *lhead = list_head(list);

	if (lhead != NULL)
		list_remove(list, lhead);
	return (lhead);
}

/*
 * This is always guaranteed to be able to queue the work.
 */
void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
	/* Initialize request */
	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
	ptr->rq_op = op;

	/*
	 * Queue provided slot onto request pool.
	 */
	mutex_enter(&state->id_acache_req_lock);
	list_insert_tail(&state->id_req_list, ptr);

	/* Go, fetch, async thread */
	cv_signal(&state->id_acache_req_cv);
	mutex_exit(&state->id_acache_req_lock);
}

/*
 * Main body of the per interface async thread.
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * If we are in late HCA initialization mode, do not
			 * process any async request other than TRAP. TRAP
			 * is used for indicating creation of a broadcast
			 * group, in which case we need to join/create the
			 * group.
			 */
			if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
			    (ptr->rq_op != IBD_ASYNC_TRAP)) {
				goto free_req_and_continue;
			}

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
			case IBD_ASYNC_GETAH:
				ibd_async_acache(state, &ptr->rq_mac);
				break;
			case IBD_ASYNC_JOIN:
			case IBD_ASYNC_LEAVE:
				ibd_async_multicast(state,
				    ptr->rq_gid, ptr->rq_op);
				break;
			case IBD_ASYNC_PROMON:
				ibd_async_setprom(state);
				break;
			case IBD_ASYNC_PROMOFF:
				ibd_async_unsetprom(state);
				break;
			case IBD_ASYNC_REAP:
				ibd_async_reap_group(state,
				    ptr->rq_ptr, ptr->rq_gid,
				    IB_MC_JSTATE_FULL);
				/*
				 * The req buf is contained in the mce
				 * structure, so we do not need
				 * to free it here.
				 */
				ptr = NULL;
				break;
			case IBD_ASYNC_TRAP:
				ibd_async_trap(state, ptr);
				break;
			case IBD_ASYNC_SCHED:
				ibd_async_txsched(state);
				break;
			case IBD_ASYNC_LINK:
				ibd_async_link(state, ptr);
				break;
			case IBD_ASYNC_EXIT:
				mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
				CALLB_CPR_EXIT(&cprinfo);
#else
				mutex_exit(&state->id_acache_req_lock);
#endif
				return;
			case IBD_ASYNC_RC_TOO_BIG:
				ibd_async_rc_process_too_big(state,
				    ptr);
				break;
			case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
				ibd_async_rc_close_act_chan(state, ptr);
				break;
			case IBD_ASYNC_RC_RECYCLE_ACE:
				ibd_async_rc_recycle_ace(state, ptr);
				break;
			}
free_req_and_continue:
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (--state->id_trap_inprog == 0)
		cv_signal(&state->id_trap_cv);
	mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into the bucket.
 * ibd_hash_key_cmp: Compares two keys; returns 0 if they are equal, else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
	ulong_t ptraddr = (ulong_t)key;
	uint_t hval;

	/*
	 * If the input address is 4 byte aligned, we can just dereference
	 * it. This is most common, since IP will send in a 4 byte aligned
	 * IP header, which implies the 24 byte IPoIB pseudo header will be
	 * 4 byte aligned too.
	 */
	if ((ptraddr & 3) == 0)
		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);

	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
	return (hval);
}

static int
ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
		return (0);
	else
		return (1);
}

/*
 * Initialize all the per interface caches and lists; AH cache,
 * MCG list etc.
1394 */ 1395 static int 1396 ibd_acache_init(ibd_state_t *state) 1397 { 1398 ibd_ace_t *ce; 1399 int i; 1400 1401 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1402 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1403 mutex_enter(&state->id_ac_mutex); 1404 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1405 offsetof(ibd_ace_t, ac_list)); 1406 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1407 offsetof(ibd_ace_t, ac_list)); 1408 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1409 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor, 1410 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1411 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1412 offsetof(ibd_mce_t, mc_list)); 1413 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1414 offsetof(ibd_mce_t, mc_list)); 1415 state->id_ac_hot_ace = NULL; 1416 1417 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1418 state->id_num_ah, KM_SLEEP); 1419 for (i = 0; i < state->id_num_ah; i++, ce++) { 1420 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1421 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1422 mutex_exit(&state->id_ac_mutex); 1423 ibd_acache_fini(state); 1424 return (DDI_FAILURE); 1425 } else { 1426 CLEAR_REFCYCLE(ce); 1427 ce->ac_mce = NULL; 1428 mutex_init(&ce->tx_too_big_mutex, NULL, 1429 MUTEX_DRIVER, NULL); 1430 IBD_ACACHE_INSERT_FREE(state, ce); 1431 } 1432 } 1433 mutex_exit(&state->id_ac_mutex); 1434 return (DDI_SUCCESS); 1435 } 1436 1437 static void 1438 ibd_acache_fini(ibd_state_t *state) 1439 { 1440 ibd_ace_t *ptr; 1441 1442 mutex_enter(&state->id_ac_mutex); 1443 1444 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1445 ASSERT(GET_REF(ptr) == 0); 1446 mutex_destroy(&ptr->tx_too_big_mutex); 1447 (void) ibt_free_ud_dest(ptr->ac_dest); 1448 } 1449 1450 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1451 ASSERT(GET_REF(ptr) == 0); 1452 mutex_destroy(&ptr->tx_too_big_mutex); 1453 (void) ibt_free_ud_dest(ptr->ac_dest); 1454 } 1455 1456 list_destroy(&state->id_ah_free); 1457 list_destroy(&state->id_ah_active); 1458 list_destroy(&state->id_mc_full); 1459 list_destroy(&state->id_mc_non); 1460 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah); 1461 mutex_exit(&state->id_ac_mutex); 1462 mutex_destroy(&state->id_ac_mutex); 1463 mutex_destroy(&state->id_mc_mutex); 1464 } 1465 1466 /* 1467 * Search AH active hash list for a cached path to input destination. 1468 * If we are "just looking", hold == F. When we are in the Tx path, 1469 * we set hold == T to grab a reference on the AH so that it can not 1470 * be recycled to a new destination while the Tx request is posted. 1471 */ 1472 ibd_ace_t * 1473 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1474 { 1475 ibd_ace_t *ptr; 1476 1477 ASSERT(mutex_owned(&state->id_ac_mutex)); 1478 1479 /* 1480 * Do hash search. 1481 */ 1482 if (mod_hash_find(state->id_ah_active_hash, 1483 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1484 if (hold) 1485 INC_REF(ptr, num); 1486 return (ptr); 1487 } 1488 return (NULL); 1489 } 1490 1491 /* 1492 * This is called by the tx side; if an initialized AH is found in 1493 * the active list, it is locked down and can be used; if no entry 1494 * is found, an async request is queued to do path resolution. 
1495 */ 1496 static ibd_ace_t * 1497 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1498 { 1499 ibd_ace_t *ptr; 1500 ibd_req_t *req; 1501 1502 /* 1503 * Only attempt to print when we can; in the mdt pattr case, the 1504 * address is not aligned properly. 1505 */ 1506 if (((ulong_t)mac & 3) == 0) { 1507 DPRINT(4, 1508 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1509 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1510 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1511 htonl(mac->ipoib_gidsuff[1])); 1512 } 1513 1514 mutex_enter(&state->id_ac_mutex); 1515 1516 if (((ptr = state->id_ac_hot_ace) != NULL) && 1517 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1518 INC_REF(ptr, numwqe); 1519 mutex_exit(&state->id_ac_mutex); 1520 return (ptr); 1521 } 1522 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1523 state->id_ac_hot_ace = ptr; 1524 mutex_exit(&state->id_ac_mutex); 1525 return (ptr); 1526 } 1527 1528 /* 1529 * Implementation of a single outstanding async request; if 1530 * the operation is not started yet, queue a request and move 1531 * to ongoing state. Remember in id_ah_addr for which address 1532 * we are queueing the request, in case we need to flag an error; 1533 * Any further requests, for the same or different address, until 1534 * the operation completes, is sent back to GLDv3 to be retried. 1535 * The async thread will update id_ah_op with an error indication 1536 * or will set it to indicate the next look up can start; either 1537 * way, it will mac_tx_update() so that all blocked requests come 1538 * back here. 1539 */ 1540 *err = EAGAIN; 1541 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1542 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1543 if (req != NULL) { 1544 /* 1545 * We did not even find the entry; queue a request 1546 * for it. 1547 */ 1548 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1549 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1550 state->id_ah_op = IBD_OP_ONGOING; 1551 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1552 } 1553 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1554 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1555 /* 1556 * Check the status of the pathrecord lookup request 1557 * we had queued before. 1558 */ 1559 if (state->id_ah_op == IBD_OP_ERRORED) { 1560 *err = EFAULT; 1561 state->id_ah_error++; 1562 } else { 1563 /* 1564 * IBD_OP_ROUTERED case: We need to send to the 1565 * all-router MCG. If we can find the AH for 1566 * the mcg, the Tx will be attempted. If we 1567 * do not find the AH, we return NORESOURCES 1568 * to retry. 1569 */ 1570 ipoib_mac_t routermac; 1571 1572 (void) ibd_get_allroutergroup(state, mac, &routermac); 1573 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1574 numwqe); 1575 } 1576 state->id_ah_op = IBD_OP_NOTSTARTED; 1577 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1578 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1579 /* 1580 * This case can happen when we get a higher band 1581 * packet. The easiest way is to reset the state machine 1582 * to accommodate the higher priority packet. 1583 */ 1584 state->id_ah_op = IBD_OP_NOTSTARTED; 1585 } 1586 mutex_exit(&state->id_ac_mutex); 1587 1588 return (ptr); 1589 } 1590 1591 /* 1592 * Grab a not-currently-in-use AH/PathRecord from the active 1593 * list to recycle to a new destination. Only the async thread 1594 * executes this code. 
1595 */ 1596 static ibd_ace_t * 1597 ibd_acache_get_unref(ibd_state_t *state) 1598 { 1599 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1600 boolean_t try_rc_chan_recycle = B_FALSE; 1601 1602 ASSERT(mutex_owned(&state->id_ac_mutex)); 1603 1604 /* 1605 * Do plain linear search. 1606 */ 1607 while (ptr != NULL) { 1608 /* 1609 * Note that it is possible that the "cycle" bit 1610 * is set on the AH w/o any reference count. The 1611 * mcg must have been deleted, and the tx cleanup 1612 * just decremented the reference count to 0, but 1613 * hasn't gotten around to grabbing the id_ac_mutex 1614 * to move the AH into the free list. 1615 */ 1616 if (GET_REF(ptr) == 0) { 1617 if (ptr->ac_chan != NULL) { 1618 ASSERT(state->id_enable_rc == B_TRUE); 1619 if (!try_rc_chan_recycle) { 1620 try_rc_chan_recycle = B_TRUE; 1621 ibd_rc_signal_ace_recycle(state, ptr); 1622 } 1623 } else { 1624 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1625 break; 1626 } 1627 } 1628 ptr = list_prev(&state->id_ah_active, ptr); 1629 } 1630 return (ptr); 1631 } 1632 1633 /* 1634 * Invoked to clean up AH from active list in case of multicast 1635 * disable and to handle sendonly memberships during mcg traps. 1636 * And for port up processing for multicast and unicast AHs. 1637 * Normally, the AH is taken off the active list, and put into 1638 * the free list to be recycled for a new destination. In case 1639 * Tx requests on the AH have not completed yet, the AH is marked 1640 * for reaping (which will put the AH on the free list) once the Tx's 1641 * complete; in this case, depending on the "force" input, we take 1642 * out the AH from the active list right now, or leave it also for 1643 * the reap operation. Returns TRUE if the AH is taken off the active 1644 * list (and either put into the free list right now, or arranged for 1645 * later), FALSE otherwise. 1646 */ 1647 boolean_t 1648 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1649 { 1650 ibd_ace_t *acactive; 1651 boolean_t ret = B_TRUE; 1652 1653 ASSERT(mutex_owned(&state->id_ac_mutex)); 1654 1655 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1656 1657 /* 1658 * Note that the AH might already have the cycle bit set 1659 * on it; this might happen if sequences of multicast 1660 * enables and disables are coming so fast, that posted 1661 * Tx's to the mcg have not completed yet, and the cycle 1662 * bit is set successively by each multicast disable. 1663 */ 1664 if (SET_CYCLE_IF_REF(acactive)) { 1665 if (!force) { 1666 /* 1667 * The ace is kept on the active list, further 1668 * Tx's can still grab a reference on it; the 1669 * ace is reaped when all pending Tx's 1670 * referencing the AH complete. 1671 */ 1672 ret = B_FALSE; 1673 } else { 1674 /* 1675 * In the mcg trap case, we always pull the 1676 * AH from the active list. And also the port 1677 * up multi/unicast case. 1678 */ 1679 ASSERT(acactive->ac_chan == NULL); 1680 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1681 acactive->ac_mce = NULL; 1682 } 1683 } else { 1684 /* 1685 * Determined the ref count is 0, thus reclaim 1686 * immediately after pulling out the ace from 1687 * the active list. 1688 */ 1689 ASSERT(acactive->ac_chan == NULL); 1690 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1691 acactive->ac_mce = NULL; 1692 IBD_ACACHE_INSERT_FREE(state, acactive); 1693 } 1694 1695 } 1696 return (ret); 1697 } 1698 1699 /* 1700 * Helper function for async path record lookup. 
If we are trying to 1701 * Tx to a MCG, check our membership, possibly trying to join the 1702 * group if required. If that fails, try to send the packet to the 1703 * all router group (indicated by the redirect output), pointing 1704 * the input mac address to the router mcg address. 1705 */ 1706 static ibd_mce_t * 1707 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1708 { 1709 ib_gid_t mgid; 1710 ibd_mce_t *mce; 1711 ipoib_mac_t routermac; 1712 1713 *redirect = B_FALSE; 1714 ibd_n2h_gid(mac, &mgid); 1715 1716 /* 1717 * Check the FullMember+SendOnlyNonMember list. 1718 * Since we are the only one who manipulates the 1719 * id_mc_full list, no locks are needed. 1720 */ 1721 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1722 if (mce != NULL) { 1723 DPRINT(4, "ibd_async_mcache : already joined to group"); 1724 return (mce); 1725 } 1726 1727 /* 1728 * Not found; try to join(SendOnlyNonMember) and attach. 1729 */ 1730 DPRINT(4, "ibd_async_mcache : not joined to group"); 1731 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1732 NULL) { 1733 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1734 return (mce); 1735 } 1736 1737 /* 1738 * MCGroup not present; try to join the all-router group. If 1739 * any of the following steps succeed, we will be redirecting 1740 * to the all router group. 1741 */ 1742 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1743 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1744 return (NULL); 1745 *redirect = B_TRUE; 1746 ibd_n2h_gid(&routermac, &mgid); 1747 bcopy(&routermac, mac, IPOIB_ADDRL); 1748 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1749 mgid.gid_prefix, mgid.gid_guid); 1750 1751 /* 1752 * Are we already joined to the router group? 1753 */ 1754 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1755 DPRINT(4, "ibd_async_mcache : using already joined router" 1756 "group\n"); 1757 return (mce); 1758 } 1759 1760 /* 1761 * Can we join(SendOnlyNonMember) the router group? 1762 */ 1763 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1764 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1765 NULL) { 1766 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1767 return (mce); 1768 } 1769 1770 return (NULL); 1771 } 1772 1773 /* 1774 * Async path record lookup code. 1775 */ 1776 static void 1777 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1778 { 1779 ibd_ace_t *ce; 1780 ibd_mce_t *mce = NULL; 1781 ibt_path_attr_t path_attr; 1782 ibt_path_info_t path_info; 1783 ib_gid_t destgid; 1784 char ret = IBD_OP_NOTSTARTED; 1785 1786 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1787 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1788 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1789 htonl(mac->ipoib_gidsuff[1])); 1790 1791 /* 1792 * Check whether we are trying to transmit to a MCG. 1793 * In that case, we need to make sure we are a member of 1794 * the MCG. 1795 */ 1796 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1797 boolean_t redirected; 1798 1799 /* 1800 * If we can not find or join the group or even 1801 * redirect, error out. 1802 */ 1803 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1804 NULL) { 1805 state->id_ah_op = IBD_OP_ERRORED; 1806 return; 1807 } 1808 1809 /* 1810 * If we got redirected, we need to determine whether 1811 * the AH for the new mcg is in the cache already, and 1812 * not pull it in then; otherwise proceed to get the 1813 * path for the new mcg. 
There is no guarantee that 1814 * if the AH is currently in the cache, it will still be 1815 * there when we look in ibd_acache_lookup(), but that's 1816 * okay, we will come back here. 1817 */ 1818 if (redirected) { 1819 ret = IBD_OP_ROUTERED; 1820 DPRINT(4, "ibd_async_acache : redirected to " 1821 "%08X:%08X:%08X:%08X:%08X", 1822 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1823 htonl(mac->ipoib_gidpref[1]), 1824 htonl(mac->ipoib_gidsuff[0]), 1825 htonl(mac->ipoib_gidsuff[1])); 1826 1827 mutex_enter(&state->id_ac_mutex); 1828 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1829 state->id_ah_op = IBD_OP_ROUTERED; 1830 mutex_exit(&state->id_ac_mutex); 1831 DPRINT(4, "ibd_async_acache : router AH found"); 1832 return; 1833 } 1834 mutex_exit(&state->id_ac_mutex); 1835 } 1836 } 1837 1838 /* 1839 * Get an AH from the free list. 1840 */ 1841 mutex_enter(&state->id_ac_mutex); 1842 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1843 /* 1844 * No free ones; try to grab an unreferenced active 1845 * one. Maybe we need to make the active list LRU, 1846 * but that will create more work for Tx callbacks. 1847 * Is there a way of not having to pull out the 1848 * entry from the active list, but just indicate it 1849 * is being recycled? Yes, but that creates one more 1850 * check in the fast lookup path. 1851 */ 1852 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1853 /* 1854 * Pretty serious shortage now. 1855 */ 1856 state->id_ah_op = IBD_OP_NOTSTARTED; 1857 mutex_exit(&state->id_ac_mutex); 1858 DPRINT(10, "ibd_async_acache : failed to find AH " 1859 "slot\n"); 1860 return; 1861 } 1862 /* 1863 * We could check whether ac_mce points to a SendOnly 1864 * member and drop that membership now. Or do it lazily 1865 * at detach time. 1866 */ 1867 ce->ac_mce = NULL; 1868 } 1869 mutex_exit(&state->id_ac_mutex); 1870 ASSERT(ce->ac_mce == NULL); 1871 1872 /* 1873 * Update the entry. 1874 */ 1875 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1876 1877 bzero(&path_info, sizeof (path_info)); 1878 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1879 path_attr.pa_sgid = state->id_sgid; 1880 path_attr.pa_num_dgids = 1; 1881 ibd_n2h_gid(&ce->ac_mac, &destgid); 1882 path_attr.pa_dgids = &destgid; 1883 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1884 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1885 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1886 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1887 goto error; 1888 } 1889 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1890 ntohl(ce->ac_mac.ipoib_qpn), 1891 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1892 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1893 goto error; 1894 } 1895 1896 /* 1897 * mce is set whenever an AH is being associated with a 1898 * MCG; this will come in handy when we leave the MCG. The 1899 * lock protects Tx fastpath from scanning the active list. 
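	 *
	 * As an illustration of that later use (taken from the port-up
	 * processing in ibd_async_link() further down; sketch only):
	 *
	 *	mce = pace->ac_mce;
	 *	...
	 *	if ((mce != NULL) && (mce->mc_fullreap))
	 *		ibd_async_reap_group(state, mce,
	 *		    mce->mc_info.mc_adds_vect.av_dgid, mce->mc_jstate);
	 *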
1900 */ 1901 if (mce != NULL) 1902 ce->ac_mce = mce; 1903 1904 /* 1905 * initiate a RC mode connection for unicast address 1906 */ 1907 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && 1908 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { 1909 ASSERT(ce->ac_chan == NULL); 1910 DPRINT(10, "ibd_async_acache: call " 1911 "ibd_rc_try_connect(ace=%p)", ce); 1912 ibd_rc_try_connect(state, ce, &path_info); 1913 if (ce->ac_chan == NULL) { 1914 DPRINT(10, "ibd_async_acache: fail to setup RC" 1915 " channel"); 1916 state->rc_conn_fail++; 1917 goto error; 1918 } 1919 } 1920 1921 mutex_enter(&state->id_ac_mutex); 1922 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1923 state->id_ah_op = ret; 1924 mutex_exit(&state->id_ac_mutex); 1925 return; 1926 error: 1927 /* 1928 * We might want to drop SendOnly membership here if we 1929 * joined above. The lock protects Tx callbacks inserting 1930 * into the free list. 1931 */ 1932 mutex_enter(&state->id_ac_mutex); 1933 state->id_ah_op = IBD_OP_ERRORED; 1934 IBD_ACACHE_INSERT_FREE(state, ce); 1935 mutex_exit(&state->id_ac_mutex); 1936 } 1937 1938 /* 1939 * While restoring port's presence on the subnet on a port up, it is possible 1940 * that the port goes down again. 1941 */ 1942 static void 1943 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1944 { 1945 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1946 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1947 LINK_STATE_UP; 1948 ibd_mce_t *mce, *pmce; 1949 ibd_ace_t *ace, *pace; 1950 1951 DPRINT(10, "ibd_async_link(): %d", opcode); 1952 1953 /* 1954 * On a link up, revalidate the link speed/width. No point doing 1955 * this on a link down, since we will be unable to do SA operations, 1956 * defaulting to the lowest speed. Also notice that we update our 1957 * notion of speed before calling mac_link_update(), which will do 1958 * necessary higher level notifications for speed changes. 1959 */ 1960 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1961 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1962 state->id_link_speed = ibd_get_portspeed(state); 1963 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1964 } 1965 1966 /* 1967 * Do all the work required to establish our presence on 1968 * the subnet. 1969 */ 1970 if (opcode == IBD_LINK_UP_ABSENT) { 1971 /* 1972 * If in promiscuous mode ... 1973 */ 1974 if (state->id_prom_op == IBD_OP_COMPLETED) { 1975 /* 1976 * Drop all nonmembership. 1977 */ 1978 ibd_async_unsetprom(state); 1979 1980 /* 1981 * Then, try to regain nonmembership to all mcg's. 1982 */ 1983 ibd_async_setprom(state); 1984 1985 } 1986 1987 /* 1988 * Drop all sendonly membership (which also gets rid of the 1989 * AHs); try to reacquire all full membership. 1990 */ 1991 mce = list_head(&state->id_mc_full); 1992 while ((pmce = mce) != NULL) { 1993 mce = list_next(&state->id_mc_full, mce); 1994 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1995 ibd_leave_group(state, 1996 pmce->mc_info.mc_adds_vect.av_dgid, 1997 IB_MC_JSTATE_SEND_ONLY_NON); 1998 else 1999 ibd_reacquire_group(state, pmce); 2000 } 2001 2002 /* 2003 * Recycle all active AHs to free list (and if there are 2004 * pending posts, make sure they will go into the free list 2005 * once the Tx's complete). Grab the lock to prevent 2006 * concurrent Tx's as well as Tx cleanups. 
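		 *
		 * Per entry, the loop below boils down to (summary sketch
		 * of the code that follows, no additional behavior):
		 *
		 *	if (RC channel attached and in ACT_ESTAB state)
		 *		take a reference, pull the ace out, and hand
		 *		it to the async RC close path
		 *	else
		 *		ibd_acache_recycle(state, &pace->ac_mac, B_TRUE)
		 *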
2007 */ 2008 mutex_enter(&state->id_ac_mutex); 2009 ace = list_head(&state->id_ah_active); 2010 while ((pace = ace) != NULL) { 2011 boolean_t cycled; 2012 2013 ace = list_next(&state->id_ah_active, ace); 2014 mce = pace->ac_mce; 2015 if (pace->ac_chan != NULL) { 2016 ASSERT(mce == NULL); 2017 ASSERT(state->id_enable_rc == B_TRUE); 2018 if (pace->ac_chan->chan_state == 2019 IBD_RC_STATE_ACT_ESTAB) { 2020 INC_REF(pace, 1); 2021 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 2022 pace->ac_chan->chan_state = 2023 IBD_RC_STATE_ACT_CLOSING; 2024 ibd_rc_signal_act_close(state, pace); 2025 } else { 2026 state->rc_act_close_simultaneous++; 2027 DPRINT(40, "ibd_async_link: other " 2028 "thread is closing it, ace=%p, " 2029 "ac_chan=%p, chan_state=%d", 2030 pace, pace->ac_chan, 2031 pace->ac_chan->chan_state); 2032 } 2033 } else { 2034 cycled = ibd_acache_recycle(state, 2035 &pace->ac_mac, B_TRUE); 2036 } 2037 /* 2038 * If this is for an mcg, it must be for a fullmember, 2039 * since we got rid of send-only members above when 2040 * processing the mce list. 2041 */ 2042 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2043 IB_MC_JSTATE_FULL))); 2044 2045 /* 2046 * Check if the fullmember mce needs to be torn down, 2047 * ie whether the DLPI disable has already been done. 2048 * If so, do some of the work of tx_cleanup, namely 2049 * causing leave (which will fail), detach and 2050 * mce-freeing. tx_cleanup will put the AH into free 2051 * list. The reason to duplicate some of this 2052 * tx_cleanup work is because we want to delete the 2053 * AH right now instead of waiting for tx_cleanup, to 2054 * force subsequent Tx's to reacquire an AH. 2055 */ 2056 if ((mce != NULL) && (mce->mc_fullreap)) 2057 ibd_async_reap_group(state, mce, 2058 mce->mc_info.mc_adds_vect.av_dgid, 2059 mce->mc_jstate); 2060 } 2061 mutex_exit(&state->id_ac_mutex); 2062 } 2063 2064 /* 2065 * mac handle is guaranteed to exist since driver does ibt_close_hca() 2066 * (which stops further events from being delivered) before 2067 * mac_unregister(). At this point, it is guaranteed that mac_register 2068 * has already been done. 2069 */ 2070 mutex_enter(&state->id_link_mutex); 2071 state->id_link_state = lstate; 2072 mac_link_update(state->id_mh, lstate); 2073 mutex_exit(&state->id_link_mutex); 2074 2075 ibd_async_done(state); 2076 } 2077 2078 /* 2079 * Check the pkey table to see if we can find the pkey we're looking for. 2080 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 2081 * failure. 2082 */ 2083 static int 2084 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 2085 uint16_t *pkix) 2086 { 2087 uint16_t ndx; 2088 2089 ASSERT(pkix != NULL); 2090 2091 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 2092 if (pkey_tbl[ndx] == pkey) { 2093 *pkix = ndx; 2094 return (0); 2095 } 2096 } 2097 return (-1); 2098 } 2099 2100 /* 2101 * Late HCA Initialization: 2102 * If plumb had succeeded without the availability of an active port or the 2103 * pkey, and either of their availability is now being indicated via PORT_UP 2104 * or PORT_CHANGE respectively, try a start of the interface. 2105 * 2106 * Normal Operation: 2107 * When the link is notified up, we need to do a few things, based 2108 * on the port's current p_init_type_reply claiming a reinit has been 2109 * done or not. The reinit steps are: 2110 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2111 * the old Pkey and GID0 are correct. 2112 * 2. Register for mcg traps (already done by ibmf). 2113 * 3. 
If PreservePresenceReply indicates the SM has restored port's presence 2114 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2115 * 4. Give up all sendonly memberships. 2116 * 5. Acquire all full memberships. 2117 * 6. In promiscuous mode, acquire all non memberships. 2118 * 7. Recycle all AHs to free list. 2119 */ 2120 static void 2121 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2122 { 2123 ibt_hca_portinfo_t *port_infop = NULL; 2124 ibt_status_t ibt_status; 2125 uint_t psize, port_infosz; 2126 ibd_link_op_t opcode; 2127 ibd_req_t *req; 2128 link_state_t new_link_state = LINK_STATE_UP; 2129 uint8_t itreply; 2130 uint16_t pkix; 2131 int ret; 2132 2133 /* 2134 * Let's not race with a plumb or an unplumb; if we detect a 2135 * pkey relocation event later on here, we may have to restart. 2136 */ 2137 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2138 2139 mutex_enter(&state->id_link_mutex); 2140 2141 /* 2142 * If the link state is unknown, a plumb has not yet been attempted 2143 * on the interface. Nothing to do. 2144 */ 2145 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2146 mutex_exit(&state->id_link_mutex); 2147 goto link_mod_return; 2148 } 2149 2150 /* 2151 * If link state is down because of plumb failure, and we are not in 2152 * late HCA init, and we were not successfully plumbed, nothing to do. 2153 */ 2154 if ((state->id_link_state == LINK_STATE_DOWN) && 2155 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) && 2156 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) { 2157 mutex_exit(&state->id_link_mutex); 2158 goto link_mod_return; 2159 } 2160 2161 /* 2162 * If this routine was called in response to a port down event, 2163 * we just need to see if this should be informed. 2164 */ 2165 if (code == IBT_ERROR_PORT_DOWN) { 2166 new_link_state = LINK_STATE_DOWN; 2167 goto update_link_state; 2168 } 2169 2170 /* 2171 * If it's not a port down event we've received, try to get the port 2172 * attributes first. If we fail here, the port is as good as down. 2173 * Otherwise, if the link went down by the time the handler gets 2174 * here, give up - we cannot even validate the pkey/gid since those 2175 * are not valid and this is as bad as a port down anyway. 2176 */ 2177 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2178 &port_infop, &psize, &port_infosz); 2179 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2180 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2181 new_link_state = LINK_STATE_DOWN; 2182 goto update_link_state; 2183 } 2184 2185 /* 2186 * If in the previous attempt, the pkey was not found either due to the 2187 * port state being down, or due to it's absence in the pkey table, 2188 * look for it now and try to start the interface. 2189 */ 2190 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) { 2191 mutex_exit(&state->id_link_mutex); 2192 if ((ret = ibd_start(state)) != 0) { 2193 DPRINT(10, "ibd_linkmod: cannot start from late HCA " 2194 "init, ret=%d", ret); 2195 } 2196 ibt_free_portinfo(port_infop, port_infosz); 2197 goto link_mod_return; 2198 } 2199 2200 /* 2201 * Check the SM InitTypeReply flags. If both NoLoadReply and 2202 * PreserveContentReply are 0, we don't know anything about the 2203 * data loaded into the port attributes, so we need to verify 2204 * if gid0 and pkey are still valid. 
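	 *
	 * The possible outcomes of that verification, as coded below
	 * (summary only):
	 *
	 *	subnet prefix of gid0 changed      -> treat as link down
	 *	pkey still at the old pkey index   -> link up, nothing to redo
	 *	pkey found at a different index    -> full undo_start/start
	 *	                                      restart of the interface
	 *	pkey no longer in the pkey table   -> treat as link down
	 *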
2205 */ 2206 itreply = port_infop->p_init_type_reply; 2207 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2208 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2209 /* 2210 * Check to see if the subnet part of GID0 has changed. If 2211 * not, check the simple case first to see if the pkey 2212 * index is the same as before; finally check to see if the 2213 * pkey has been relocated to a different index in the table. 2214 */ 2215 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2216 if (bcmp(port_infop->p_sgid_tbl, 2217 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2218 2219 new_link_state = LINK_STATE_DOWN; 2220 2221 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2222 state->id_pkey) { 2223 2224 new_link_state = LINK_STATE_UP; 2225 2226 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2227 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2228 2229 ibt_free_portinfo(port_infop, port_infosz); 2230 mutex_exit(&state->id_link_mutex); 2231 2232 /* 2233 * Currently a restart is required if our pkey has moved 2234 * in the pkey table. If we get the ibt_recycle_ud() to 2235 * work as documented (expected), we may be able to 2236 * avoid a complete restart. Note that we've already 2237 * marked both the start and stop 'in-progress' flags, 2238 * so it is ok to go ahead and do this restart. 2239 */ 2240 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2241 if ((ret = ibd_start(state)) != 0) { 2242 DPRINT(10, "ibd_restart: cannot restart, " 2243 "ret=%d", ret); 2244 } 2245 2246 goto link_mod_return; 2247 } else { 2248 new_link_state = LINK_STATE_DOWN; 2249 } 2250 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2251 } 2252 2253 update_link_state: 2254 if (port_infop) { 2255 ibt_free_portinfo(port_infop, port_infosz); 2256 } 2257 2258 /* 2259 * If we're reporting a link up, check InitTypeReply to see if 2260 * the SM has ensured that the port's presence in mcg, traps, 2261 * etc. is intact. 2262 */ 2263 if (new_link_state == LINK_STATE_DOWN) { 2264 opcode = IBD_LINK_DOWN; 2265 } else { 2266 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2267 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2268 opcode = IBD_LINK_UP; 2269 } else { 2270 opcode = IBD_LINK_UP_ABSENT; 2271 } 2272 } 2273 2274 /* 2275 * If the old state is the same as the new state, and the SM indicated 2276 * no change in the port parameters, nothing to do. 2277 */ 2278 if ((state->id_link_state == new_link_state) && (opcode != 2279 IBD_LINK_UP_ABSENT)) { 2280 mutex_exit(&state->id_link_mutex); 2281 goto link_mod_return; 2282 } 2283 2284 /* 2285 * Ok, so there was a link state change; see if it's safe to ask 2286 * the async thread to do the work 2287 */ 2288 if (!ibd_async_safe(state)) { 2289 state->id_link_state = new_link_state; 2290 mutex_exit(&state->id_link_mutex); 2291 goto link_mod_return; 2292 } 2293 2294 mutex_exit(&state->id_link_mutex); 2295 2296 /* 2297 * Queue up a request for ibd_async_link() to handle this link 2298 * state change event 2299 */ 2300 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2301 req->rq_ptr = (void *)opcode; 2302 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2303 2304 link_mod_return: 2305 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2306 } 2307 2308 /* 2309 * For the port up/down events, IBTL guarantees there will not be concurrent 2310 * invocations of the handler. 
IBTL might coalesce link transition events, 2311 * and not invoke the handler for _each_ up/down transition, but it will 2312 * invoke the handler with last known state 2313 */ 2314 static void 2315 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2316 ibt_async_code_t code, ibt_async_event_t *event) 2317 { 2318 ibd_state_t *state = (ibd_state_t *)clnt_private; 2319 2320 switch (code) { 2321 case IBT_ERROR_CATASTROPHIC_CHAN: 2322 ibd_print_warn(state, "catastrophic channel error"); 2323 break; 2324 case IBT_ERROR_CQ: 2325 ibd_print_warn(state, "completion queue error"); 2326 break; 2327 case IBT_PORT_CHANGE_EVENT: 2328 /* 2329 * Events will be delivered to all instances that have 2330 * done ibt_open_hca() but not yet done ibt_close_hca(). 2331 * Only need to do work for our port; IBTF will deliver 2332 * events for other ports on the hca we have ibt_open_hca'ed 2333 * too. Note that id_port is initialized in ibd_attach() 2334 * before we do an ibt_open_hca() in ibd_attach(). 2335 */ 2336 ASSERT(state->id_hca_hdl == hca_hdl); 2337 if (state->id_port != event->ev_port) 2338 break; 2339 2340 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2341 IBT_PORT_CHANGE_PKEY) { 2342 ibd_link_mod(state, code); 2343 } 2344 break; 2345 case IBT_ERROR_PORT_DOWN: 2346 case IBT_CLNT_REREG_EVENT: 2347 case IBT_EVENT_PORT_UP: 2348 /* 2349 * Events will be delivered to all instances that have 2350 * done ibt_open_hca() but not yet done ibt_close_hca(). 2351 * Only need to do work for our port; IBTF will deliver 2352 * events for other ports on the hca we have ibt_open_hca'ed 2353 * too. Note that id_port is initialized in ibd_attach() 2354 * before we do an ibt_open_hca() in ibd_attach(). 2355 */ 2356 ASSERT(state->id_hca_hdl == hca_hdl); 2357 if (state->id_port != event->ev_port) 2358 break; 2359 2360 ibd_link_mod(state, code); 2361 break; 2362 2363 case IBT_HCA_ATTACH_EVENT: 2364 case IBT_HCA_DETACH_EVENT: 2365 /* 2366 * When a new card is plugged to the system, attach_event is 2367 * invoked. Additionally, a cfgadm needs to be run to make the 2368 * card known to the system, and an ifconfig needs to be run to 2369 * plumb up any ibd interfaces on the card. In the case of card 2370 * unplug, a cfgadm is run that will trigger any RCM scripts to 2371 * unplumb the ibd interfaces on the card; when the card is 2372 * actually unplugged, the detach_event is invoked; 2373 * additionally, if any ibd instances are still active on the 2374 * card (eg there were no associated RCM scripts), driver's 2375 * detach routine is invoked. 2376 */ 2377 break; 2378 default: 2379 break; 2380 } 2381 } 2382 2383 static int 2384 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2385 { 2386 mac_register_t *macp; 2387 int ret; 2388 2389 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2390 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2391 return (DDI_FAILURE); 2392 } 2393 2394 /* 2395 * Note that when we register with mac during attach, we don't 2396 * have the id_macaddr yet, so we'll simply be registering a 2397 * zero macaddr that we'll overwrite later during plumb (in 2398 * ibd_m_start()). Similar is the case with id_mtu - we'll 2399 * update the mac layer with the correct mtu during plumb. 
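	 *
	 * The plumb-time update is presumably along these lines (the
	 * actual call site is in the start path, not shown here;
	 * mac_unicst_update() and mac_maxsdu_update() are the GLDv3
	 * interfaces for refreshing these values):
	 *
	 *	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
	 *	mac_maxsdu_update(state->id_mh, state->id_mtu);
	 *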
2400 */ 2401 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2402 macp->m_driver = state; 2403 macp->m_dip = dip; 2404 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2405 macp->m_callbacks = &ibd_m_callbacks; 2406 macp->m_min_sdu = 0; 2407 if (state->id_type == IBD_PORT_DRIVER) { 2408 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 2409 } else if (state->id_enable_rc) { 2410 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2411 } else { 2412 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2413 } 2414 macp->m_priv_props = ibd_priv_props; 2415 2416 /* 2417 * Register ourselves with the GLDv3 interface 2418 */ 2419 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2420 mac_free(macp); 2421 DPRINT(10, 2422 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2423 return (DDI_FAILURE); 2424 } 2425 2426 mac_free(macp); 2427 return (DDI_SUCCESS); 2428 } 2429 2430 static int 2431 ibd_record_capab(ibd_state_t *state) 2432 { 2433 ibt_hca_attr_t hca_attrs; 2434 ibt_status_t ibt_status; 2435 2436 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2437 2438 /* 2439 * Query the HCA and fetch its attributes 2440 */ 2441 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2442 ASSERT(ibt_status == IBT_SUCCESS); 2443 2444 /* 2445 * 1. Set the Hardware Checksum capability. Currently we only consider 2446 * full checksum offload. 2447 */ 2448 if (state->id_enable_rc) { 2449 state->id_hwcksum_capab = 0; 2450 } else { 2451 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2452 == IBT_HCA_CKSUM_FULL) { 2453 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2454 } 2455 } 2456 2457 /* 2458 * 2. Set LSO policy, capability and maximum length 2459 */ 2460 if (state->id_enable_rc) { 2461 state->id_lso_capable = B_FALSE; 2462 state->id_lso_maxlen = 0; 2463 } else { 2464 if (hca_attrs.hca_max_lso_size > 0) { 2465 state->id_lso_capable = B_TRUE; 2466 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2467 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2468 else 2469 state->id_lso_maxlen = 2470 hca_attrs.hca_max_lso_size; 2471 } else { 2472 state->id_lso_capable = B_FALSE; 2473 state->id_lso_maxlen = 0; 2474 } 2475 } 2476 2477 /* 2478 * 3. Set Reserved L_Key capability 2479 */ 2480 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2481 state->id_hca_res_lkey_capab = 1; 2482 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2483 state->rc_enable_iov_map = B_TRUE; 2484 } else { 2485 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2486 state->rc_enable_iov_map = B_FALSE; 2487 } 2488 2489 /* 2490 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2491 * size information is provided by the hca 2492 */ 2493 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2494 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2495 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2496 } else { 2497 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2498 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2499 } 2500 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2501 state->id_max_sqseg = IBD_MAX_SQSEG; 2502 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2503 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2504 state->id_max_sqseg, IBD_MAX_SQSEG); 2505 } 2506 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2507 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2508 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2509 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2510 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2511 } 2512 2513 /* 2514 * Translating the virtual address regions into physical regions 2515 * for using the Reserved LKey feature results in a wr sgl that 2516 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2517 * we'll fix a high-water mark (65%) for when we should stop. 2518 */ 2519 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2520 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2521 2522 /* 2523 * 5. Set number of recv and send wqes after checking hca maximum 2524 * channel size. Store the max channel size in the state so that it 2525 * can be referred to when the swqe/rwqe change is requested via 2526 * dladm. 2527 */ 2528 2529 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz; 2530 2531 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe) 2532 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz; 2533 2534 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe - 2535 IBD_RWQE_MIN; 2536 2537 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe) 2538 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz; 2539 2540 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2541 2542 return (DDI_SUCCESS); 2543 } 2544 2545 static int 2546 ibd_part_unattach(ibd_state_t *state) 2547 { 2548 uint32_t progress = state->id_mac_state; 2549 ibt_status_t ret; 2550 2551 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2552 cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n"); 2553 return (DDI_FAILURE); 2554 } 2555 2556 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2557 cmn_err(CE_CONT, "ibd_detach: failed: srq bufs outstanding\n"); 2558 return (DDI_FAILURE); 2559 } 2560 2561 /* make sure rx resources are freed */ 2562 ibd_free_rx_rsrcs(state); 2563 2564 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 2565 ASSERT(state->id_enable_rc); 2566 ibd_rc_fini_srq_list(state); 2567 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 2568 } 2569 2570 if (progress & IBD_DRV_MAC_REGISTERED) { 2571 (void) mac_unregister(state->id_mh); 2572 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2573 } 2574 2575 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 2576 /* 2577 * No new async requests will be posted since the device 2578 * link state has been marked as unknown; completion handlers 2579 * have been turned off, so Tx handler will not cause any 2580 * more IBD_ASYNC_REAP requests. 2581 * 2582 * Queue a request for the async thread to exit, which will 2583 * be serviced after any pending ones. 
This can take a while, 2584 * specially if the SM is unreachable, since IBMF will slowly 2585 * timeout each SM request issued by the async thread. Reap 2586 * the thread before continuing on, we do not want it to be 2587 * lingering in modunloaded code. 2588 */ 2589 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 2590 thread_join(state->id_async_thrid); 2591 2592 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 2593 } 2594 2595 if (progress & IBD_DRV_REQ_LIST_INITED) { 2596 list_destroy(&state->id_req_list); 2597 mutex_destroy(&state->id_acache_req_lock); 2598 cv_destroy(&state->id_acache_req_cv); 2599 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED; 2600 } 2601 2602 if (progress & IBD_DRV_PD_ALLOCD) { 2603 if ((ret = ibt_free_pd(state->id_hca_hdl, 2604 state->id_pd_hdl)) != IBT_SUCCESS) { 2605 ibd_print_warn(state, "failed to free " 2606 "protection domain, ret=%d", ret); 2607 } 2608 state->id_pd_hdl = NULL; 2609 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2610 } 2611 2612 if (progress & IBD_DRV_HCA_OPENED) { 2613 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2614 IBT_SUCCESS) { 2615 ibd_print_warn(state, "failed to close " 2616 "HCA device, ret=%d", ret); 2617 } 2618 state->id_hca_hdl = NULL; 2619 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2620 } 2621 2622 mutex_enter(&ibd_gstate.ig_mutex); 2623 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2624 if ((ret = ibt_detach(state->id_ibt_hdl)) != 2625 IBT_SUCCESS) { 2626 ibd_print_warn(state, 2627 "ibt_detach() failed, ret=%d", ret); 2628 } 2629 state->id_ibt_hdl = NULL; 2630 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2631 ibd_gstate.ig_ibt_hdl_ref_cnt--; 2632 } 2633 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && 2634 (ibd_gstate.ig_ibt_hdl != NULL)) { 2635 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != 2636 IBT_SUCCESS) { 2637 ibd_print_warn(state, "ibt_detach(): global " 2638 "failed, ret=%d", ret); 2639 } 2640 ibd_gstate.ig_ibt_hdl = NULL; 2641 } 2642 mutex_exit(&ibd_gstate.ig_mutex); 2643 2644 if (progress & IBD_DRV_TXINTR_ADDED) { 2645 ddi_remove_softintr(state->id_tx); 2646 state->id_tx = NULL; 2647 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2648 } 2649 2650 if (progress & IBD_DRV_RXINTR_ADDED) { 2651 ddi_remove_softintr(state->id_rx); 2652 state->id_rx = NULL; 2653 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2654 } 2655 2656 #ifdef DEBUG 2657 if (progress & IBD_DRV_RC_PRIVATE_STATE) { 2658 kstat_delete(state->rc_ksp); 2659 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2660 } 2661 #endif 2662 2663 if (progress & IBD_DRV_STATE_INITIALIZED) { 2664 ibd_state_fini(state); 2665 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2666 } 2667 2668 return (DDI_SUCCESS); 2669 } 2670 2671 int 2672 ibd_part_attach(ibd_state_t *state, dev_info_t *dip) 2673 { 2674 ibt_status_t ret; 2675 int rv; 2676 kthread_t *kht; 2677 2678 /* 2679 * Initialize mutexes and condition variables 2680 */ 2681 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2682 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2683 return (DDI_FAILURE); 2684 } 2685 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2686 2687 /* 2688 * Allocate rx,tx softintr 2689 */ 2690 if (ibd_rx_softintr == 1) { 2691 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2692 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2693 DPRINT(10, "ibd_attach: failed in " 2694 "ddi_add_softintr(id_rx), ret=%d", rv); 2695 return (DDI_FAILURE); 2696 } 2697 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2698 } 2699 if (ibd_tx_softintr == 1) { 2700 
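		/*
		 * ibd_tx_recycle is the Tx-side counterpart of ibd_intr
		 * above; the soft interrupt is presumably triggered once
		 * send resources have been reclaimed by the completion
		 * handler, so that transmits held back for lack of swqes
		 * can be resumed.
		 */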
if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2701 NULL, NULL, ibd_tx_recycle, 2702 (caddr_t)state)) != DDI_SUCCESS) { 2703 DPRINT(10, "ibd_attach: failed in " 2704 "ddi_add_softintr(id_tx), ret=%d", rv); 2705 return (DDI_FAILURE); 2706 } 2707 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2708 } 2709 2710 /* 2711 * Attach to IBTL 2712 */ 2713 mutex_enter(&ibd_gstate.ig_mutex); 2714 if (ibd_gstate.ig_ibt_hdl == NULL) { 2715 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2716 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { 2717 DPRINT(10, "ibd_attach: global: failed in " 2718 "ibt_attach(), ret=%d", ret); 2719 mutex_exit(&ibd_gstate.ig_mutex); 2720 return (DDI_FAILURE); 2721 } 2722 } 2723 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2724 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2725 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2726 mutex_exit(&ibd_gstate.ig_mutex); 2727 return (DDI_FAILURE); 2728 } 2729 ibd_gstate.ig_ibt_hdl_ref_cnt++; 2730 mutex_exit(&ibd_gstate.ig_mutex); 2731 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2732 2733 /* 2734 * Open the HCA 2735 */ 2736 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 2737 &state->id_hca_hdl)) != IBT_SUCCESS) { 2738 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2739 return (DDI_FAILURE); 2740 } 2741 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2742 2743 #ifdef DEBUG 2744 /* Initialize Driver Counters for Reliable Connected Mode */ 2745 if (state->id_enable_rc) { 2746 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2747 DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats"); 2748 return (DDI_FAILURE); 2749 } 2750 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2751 } 2752 #endif 2753 2754 /* 2755 * Record capabilities 2756 */ 2757 (void) ibd_record_capab(state); 2758 2759 /* 2760 * Allocate a protection domain on the HCA 2761 */ 2762 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2763 &state->id_pd_hdl)) != IBT_SUCCESS) { 2764 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2765 return (DDI_FAILURE); 2766 } 2767 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2768 2769 2770 /* 2771 * We need to initialise the req_list that is required for the 2772 * operation of the async_thread. 2773 */ 2774 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 2775 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 2776 list_create(&state->id_req_list, sizeof (ibd_req_t), 2777 offsetof(ibd_req_t, rq_list)); 2778 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED; 2779 2780 /* 2781 * Create the async thread; thread_create never fails. 2782 */ 2783 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 2784 TS_RUN, minclsyspri); 2785 state->id_async_thrid = kht->t_did; 2786 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 2787 2788 return (DDI_SUCCESS); 2789 } 2790 2791 /* 2792 * Attach device to the IO framework. 2793 */ 2794 static int 2795 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2796 { 2797 int ret; 2798 2799 switch (cmd) { 2800 case DDI_ATTACH: 2801 ret = ibd_port_attach(dip); 2802 break; 2803 default: 2804 ret = DDI_FAILURE; 2805 break; 2806 } 2807 return (ret); 2808 } 2809 2810 /* 2811 * Detach device from the IO framework. 
2812 */ 2813 static int 2814 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2815 { 2816 ibd_state_t *state; 2817 int instance; 2818 2819 /* 2820 * IBD doesn't support suspend/resume 2821 */ 2822 if (cmd != DDI_DETACH) 2823 return (DDI_FAILURE); 2824 2825 /* 2826 * Get the instance softstate 2827 */ 2828 instance = ddi_get_instance(dip); 2829 state = ddi_get_soft_state(ibd_list, instance); 2830 2831 /* 2832 * Release all resources we're holding still. Note that if we'd 2833 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2834 * so far, we should find all the flags we need in id_mac_state. 2835 */ 2836 return (ibd_port_unattach(state, dip)); 2837 } 2838 2839 /* 2840 * Pre ibt_attach() driver initialization 2841 */ 2842 static int 2843 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2844 { 2845 char buf[64]; 2846 2847 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2848 state->id_link_state = LINK_STATE_UNKNOWN; 2849 2850 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2851 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2852 state->id_trap_stop = B_TRUE; 2853 state->id_trap_inprog = 0; 2854 2855 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2856 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2857 state->id_dip = dip; 2858 2859 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2860 2861 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2862 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2863 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2864 state->id_tx_busy = 0; 2865 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2866 2867 state->id_rx_list.dl_bufs_outstanding = 0; 2868 state->id_rx_list.dl_cnt = 0; 2869 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2870 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2871 (void) sprintf(buf, "ibd_req%d_%x", ddi_get_instance(dip), 2872 state->id_pkey); 2873 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2874 0, NULL, NULL, NULL, NULL, NULL, 0); 2875 2876 /* For Reliable Connected Mode */ 2877 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2878 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2879 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2880 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2881 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2882 MUTEX_DRIVER, NULL); 2883 2884 /* 2885 * Make the default link mode as RC. If this fails during connection 2886 * setup, the link mode is automatically transitioned to UD. 2887 * Also set the RC MTU. 
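	 *
	 * For reference (gathered from elsewhere in this file; summary
	 * only): in RC mode the SDU advertised to mac is derived from
	 * rc_mtu, while in UD mode id_mtu is later replaced by the
	 * broadcast group MTU discovered in ibd_find_bgroup():
	 *
	 *	max_sdu = state->rc_mtu - IPOIB_HDRSIZE    (RC mode)
	 *	id_mtu  = 128 << mc_mtu                    (UD, from the mcg;
	 *	                                            must not exceed
	 *	                                            the port default)
	 *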
2888 */ 2889 state->id_enable_rc = IBD_DEF_LINK_MODE; 2890 state->rc_mtu = IBD_DEF_RC_MAX_MTU; 2891 state->id_mtu = IBD_DEF_MAX_MTU; 2892 2893 /* Iniatialize all tunables to default */ 2894 state->id_lso_policy = IBD_DEF_LSO_POLICY; 2895 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS; 2896 state->id_num_ah = IBD_DEF_NUM_AH; 2897 state->id_hash_size = IBD_DEF_HASH_SIZE; 2898 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP; 2899 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS; 2900 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT; 2901 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC; 2902 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT; 2903 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC; 2904 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT; 2905 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC; 2906 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT; 2907 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC; 2908 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH; 2909 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH; 2910 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH; 2911 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE; 2912 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE; 2913 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE; 2914 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE; 2915 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ; 2916 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ; 2917 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH; 2918 2919 return (DDI_SUCCESS); 2920 } 2921 2922 /* 2923 * Post ibt_detach() driver deconstruction 2924 */ 2925 static void 2926 ibd_state_fini(ibd_state_t *state) 2927 { 2928 kmem_cache_destroy(state->id_req_kmc); 2929 2930 mutex_destroy(&state->id_rx_list.dl_mutex); 2931 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2932 2933 mutex_destroy(&state->id_txpost_lock); 2934 mutex_destroy(&state->id_tx_list.dl_mutex); 2935 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2936 mutex_destroy(&state->id_lso_lock); 2937 2938 mutex_destroy(&state->id_sched_lock); 2939 mutex_destroy(&state->id_scq_poll_lock); 2940 mutex_destroy(&state->id_rcq_poll_lock); 2941 2942 cv_destroy(&state->id_trap_cv); 2943 mutex_destroy(&state->id_trap_lock); 2944 mutex_destroy(&state->id_link_mutex); 2945 2946 /* For Reliable Connected Mode */ 2947 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2948 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2949 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2950 mutex_destroy(&state->rc_tx_large_bufs_lock); 2951 mutex_destroy(&state->rc_rx_lock); 2952 } 2953 2954 /* 2955 * Fetch link speed from SA for snmp ifspeed reporting. 2956 */ 2957 static uint64_t 2958 ibd_get_portspeed(ibd_state_t *state) 2959 { 2960 int ret; 2961 ibt_path_info_t path; 2962 ibt_path_attr_t path_attr; 2963 uint8_t num_paths; 2964 uint64_t ifspeed; 2965 2966 /* 2967 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2968 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2969 * 2000000000. Start with that as default. 2970 */ 2971 ifspeed = 2000000000; 2972 2973 bzero(&path_attr, sizeof (path_attr)); 2974 2975 /* 2976 * Get the port speed from Loopback path information. 
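	 *
	 * Worked example using the srate table below: a 4X QDR link
	 * reports IBT_SRATE_40 (40 Gbps signalling), which maps to a
	 * multiplier of 16, so ifspeed = 2000000000 * 16, i.e. the
	 * 32 Gbps data rate that remains after 8b10b encoding.
	 *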
2977 */ 2978 path_attr.pa_dgids = &state->id_sgid; 2979 path_attr.pa_num_dgids = 1; 2980 path_attr.pa_sgid = state->id_sgid; 2981 2982 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2983 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2984 goto earlydone; 2985 2986 if (num_paths < 1) 2987 goto earlydone; 2988 2989 /* 2990 * In case SA does not return an expected value, report the default 2991 * speed as 1X. 2992 */ 2993 ret = 1; 2994 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2995 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2996 ret = 1; 2997 break; 2998 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2999 ret = 4; 3000 break; 3001 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 3002 ret = 12; 3003 break; 3004 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 3005 ret = 2; 3006 break; 3007 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 3008 ret = 8; 3009 break; 3010 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 3011 ret = 16; 3012 break; 3013 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 3014 ret = 24; 3015 break; 3016 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 3017 ret = 32; 3018 break; 3019 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 3020 ret = 48; 3021 break; 3022 } 3023 3024 ifspeed *= ret; 3025 3026 earlydone: 3027 return (ifspeed); 3028 } 3029 3030 /* 3031 * Search input mcg list (id_mc_full or id_mc_non) for an entry 3032 * representing the input mcg mgid. 3033 */ 3034 static ibd_mce_t * 3035 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 3036 { 3037 ibd_mce_t *ptr = list_head(mlist); 3038 3039 /* 3040 * Do plain linear search. 3041 */ 3042 while (ptr != NULL) { 3043 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 3044 sizeof (ib_gid_t)) == 0) 3045 return (ptr); 3046 ptr = list_next(mlist, ptr); 3047 } 3048 return (NULL); 3049 } 3050 3051 /* 3052 * Execute IBA JOIN. 3053 */ 3054 static ibt_status_t 3055 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 3056 { 3057 ibt_mcg_attr_t mcg_attr; 3058 3059 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3060 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 3061 mcg_attr.mc_mgid = mgid; 3062 mcg_attr.mc_join_state = mce->mc_jstate; 3063 mcg_attr.mc_scope = state->id_scope; 3064 mcg_attr.mc_pkey = state->id_pkey; 3065 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 3066 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 3067 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 3068 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 3069 NULL, NULL)); 3070 } 3071 3072 /* 3073 * This code JOINs the port in the proper way (depending on the join 3074 * state) so that IBA fabric will forward mcg packets to/from the port. 3075 * It also attaches the QPN to the mcg so it can receive those mcg 3076 * packets. This code makes sure not to attach the mcg to the QP if 3077 * that has been previously done due to the mcg being joined with a 3078 * different join state, even though this is not required by SWG_0216, 3079 * refid 3610. 3080 */ 3081 static ibd_mce_t * 3082 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3083 { 3084 ibt_status_t ibt_status; 3085 ibd_mce_t *mce, *tmce, *omce = NULL; 3086 boolean_t do_attach = B_TRUE; 3087 3088 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 3089 jstate, mgid.gid_prefix, mgid.gid_guid); 3090 3091 /* 3092 * For enable_multicast Full member joins, we need to do some 3093 * extra work. 
If there is already an mce on the list that 3094 * indicates full membership, that means the membership has 3095 * not yet been dropped (since the disable_multicast was issued) 3096 * because there are pending Tx's to the mcg; in that case, just 3097 * mark the mce not to be reaped when the Tx completion queues 3098 * an async reap operation. 3099 * 3100 * If there is already an mce on the list indicating sendonly 3101 * membership, try to promote to full membership. Be careful 3102 * not to deallocate the old mce, since there might be an AH 3103 * pointing to it; instead, update the old mce with new data 3104 * that tracks the full membership. 3105 */ 3106 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 3107 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 3108 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 3109 ASSERT(omce->mc_fullreap); 3110 omce->mc_fullreap = B_FALSE; 3111 return (omce); 3112 } else { 3113 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3114 } 3115 } 3116 3117 /* 3118 * Allocate the ibd_mce_t to track this JOIN. 3119 */ 3120 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 3121 mce->mc_fullreap = B_FALSE; 3122 mce->mc_jstate = jstate; 3123 3124 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 3125 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 3126 ibt_status); 3127 kmem_free(mce, sizeof (ibd_mce_t)); 3128 return (NULL); 3129 } 3130 3131 /* 3132 * Is an IBA attach required? Not if the interface is already joined 3133 * to the mcg in a different appropriate join state. 3134 */ 3135 if (jstate == IB_MC_JSTATE_NON) { 3136 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3137 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3138 do_attach = B_FALSE; 3139 } else if (jstate == IB_MC_JSTATE_FULL) { 3140 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3141 do_attach = B_FALSE; 3142 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3143 do_attach = B_FALSE; 3144 } 3145 3146 if (do_attach) { 3147 /* 3148 * Do the IBA attach. 3149 */ 3150 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 3151 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 3152 &mce->mc_info)) != IBT_SUCCESS) { 3153 DPRINT(10, "ibd_join_group : failed qp attachment " 3154 "%d\n", ibt_status); 3155 /* 3156 * NOTE that we should probably preserve the join info 3157 * in the list and later try to leave again at detach 3158 * time. 3159 */ 3160 (void) ibt_leave_mcg(state->id_sgid, mgid, 3161 state->id_sgid, jstate); 3162 kmem_free(mce, sizeof (ibd_mce_t)); 3163 return (NULL); 3164 } 3165 } 3166 3167 /* 3168 * Insert the ibd_mce_t in the proper list. 3169 */ 3170 if (jstate == IB_MC_JSTATE_NON) { 3171 IBD_MCACHE_INSERT_NON(state, mce); 3172 } else { 3173 /* 3174 * Set up the mc_req fields used for reaping the 3175 * mcg in case of delayed tx completion (see 3176 * ibd_tx_cleanup()). Also done for sendonly join in 3177 * case we are promoted to fullmembership later and 3178 * keep using the same mce. 3179 */ 3180 mce->mc_req.rq_gid = mgid; 3181 mce->mc_req.rq_ptr = mce; 3182 /* 3183 * Check whether this is the case of trying to join 3184 * full member, and we were already joined send only. 3185 * We try to drop our SendOnly membership, but it is 3186 * possible that the mcg does not exist anymore (and 3187 * the subnet trap never reached us), so the leave 3188 * operation might fail. 
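	 *
	 * Placement summary (restating the logic above and below, no new
	 * behavior): NON joins go on the id_mc_non list, while FULL and
	 * SendOnlyNonMember joins go on the id_mc_full list; the QP is
	 * attached to the mcg only when it is not already attached by
	 * way of the other join state.
	 *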
3189 */ 3190 if (omce != NULL) { 3191 (void) ibt_leave_mcg(state->id_sgid, mgid, 3192 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 3193 omce->mc_jstate = IB_MC_JSTATE_FULL; 3194 bcopy(&mce->mc_info, &omce->mc_info, 3195 sizeof (ibt_mcg_info_t)); 3196 kmem_free(mce, sizeof (ibd_mce_t)); 3197 return (omce); 3198 } 3199 mutex_enter(&state->id_mc_mutex); 3200 IBD_MCACHE_INSERT_FULL(state, mce); 3201 mutex_exit(&state->id_mc_mutex); 3202 } 3203 3204 return (mce); 3205 } 3206 3207 /* 3208 * Called during port up event handling to attempt to reacquire full 3209 * membership to an mcg. Stripped down version of ibd_join_group(). 3210 * Note that it is possible that the mcg might have gone away, and 3211 * gets recreated at this point. 3212 */ 3213 static void 3214 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 3215 { 3216 ib_gid_t mgid; 3217 3218 /* 3219 * If the mc_fullreap flag is set, or this join fails, a subsequent 3220 * reap/leave is going to try to leave the group. We could prevent 3221 * that by adding a boolean flag into ibd_mce_t, if required. 3222 */ 3223 if (mce->mc_fullreap) 3224 return; 3225 3226 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3227 3228 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 3229 mgid.gid_guid); 3230 3231 /* While reacquiring, leave and then join the MCG */ 3232 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, 3233 mce->mc_jstate); 3234 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 3235 ibd_print_warn(state, "Failure on port up to rejoin " 3236 "multicast gid %016llx:%016llx", 3237 (u_longlong_t)mgid.gid_prefix, 3238 (u_longlong_t)mgid.gid_guid); 3239 } 3240 3241 /* 3242 * This code handles delayed Tx completion cleanups for mcg's to which 3243 * disable_multicast has been issued, regular mcg related cleanups during 3244 * disable_multicast, disable_promiscuous and mcg traps, as well as 3245 * cleanups during driver detach time. Depending on the join state, 3246 * it deletes the mce from the appropriate list and issues the IBA 3247 * leave/detach; except in the disable_multicast case when the mce 3248 * is left on the active list for a subsequent Tx completion cleanup. 3249 */ 3250 static void 3251 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3252 uint8_t jstate) 3253 { 3254 ibd_mce_t *tmce; 3255 boolean_t do_detach = B_TRUE; 3256 3257 /* 3258 * Before detaching, we must check whether the other list 3259 * contains the mcg; if we detach blindly, the consumer 3260 * who set up the other list will also stop receiving 3261 * traffic. 3262 */ 3263 if (jstate == IB_MC_JSTATE_FULL) { 3264 /* 3265 * The following check is only relevant while coming 3266 * from the Tx completion path in the reap case. 
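		 *
		 * (mc_fullreap may legitimately be clear here if the group
		 * was re-enabled before the delayed reap ran; ibd_join_group()
		 * clears it when it reuses an existing full-member mce, and
		 * the stale reap request is then simply ignored.)
		 *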
3267 */ 3268 if (!mce->mc_fullreap) 3269 return; 3270 mutex_enter(&state->id_mc_mutex); 3271 IBD_MCACHE_PULLOUT_FULL(state, mce); 3272 mutex_exit(&state->id_mc_mutex); 3273 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3274 do_detach = B_FALSE; 3275 } else if (jstate == IB_MC_JSTATE_NON) { 3276 IBD_MCACHE_PULLOUT_NON(state, mce); 3277 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3278 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3279 do_detach = B_FALSE; 3280 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3281 mutex_enter(&state->id_mc_mutex); 3282 IBD_MCACHE_PULLOUT_FULL(state, mce); 3283 mutex_exit(&state->id_mc_mutex); 3284 do_detach = B_FALSE; 3285 } 3286 3287 /* 3288 * If we are reacting to a mcg trap and leaving our sendonly or 3289 * non membership, the mcg is possibly already gone, so attempting 3290 * to leave might fail. On the other hand, we must try to leave 3291 * anyway, since this might be a trap from long ago, and we could 3292 * have potentially sendonly joined to a recent incarnation of 3293 * the mcg and are about to loose track of this information. 3294 */ 3295 if (do_detach) { 3296 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3297 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3298 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3299 } 3300 3301 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3302 kmem_free(mce, sizeof (ibd_mce_t)); 3303 } 3304 3305 /* 3306 * Async code executed due to multicast and promiscuous disable requests 3307 * and mcg trap handling; also executed during driver detach. Mostly, a 3308 * leave and detach is done; except for the fullmember case when Tx 3309 * requests are pending, whence arrangements are made for subsequent 3310 * cleanup on Tx completion. 3311 */ 3312 static void 3313 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3314 { 3315 ipoib_mac_t mcmac; 3316 boolean_t recycled; 3317 ibd_mce_t *mce; 3318 3319 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3320 jstate, mgid.gid_prefix, mgid.gid_guid); 3321 3322 if (jstate == IB_MC_JSTATE_NON) { 3323 recycled = B_TRUE; 3324 mce = IBD_MCACHE_FIND_NON(state, mgid); 3325 /* 3326 * In case we are handling a mcg trap, we might not find 3327 * the mcg in the non list. 3328 */ 3329 if (mce == NULL) { 3330 return; 3331 } 3332 } else { 3333 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3334 3335 /* 3336 * In case we are handling a mcg trap, make sure the trap 3337 * is not arriving late; if we have an mce that indicates 3338 * that we are already a fullmember, that would be a clear 3339 * indication that the trap arrived late (ie, is for a 3340 * previous incarnation of the mcg). 3341 */ 3342 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3343 if ((mce == NULL) || (mce->mc_jstate == 3344 IB_MC_JSTATE_FULL)) { 3345 return; 3346 } 3347 } else { 3348 ASSERT(jstate == IB_MC_JSTATE_FULL); 3349 3350 /* 3351 * If join group failed, mce will be NULL here. 3352 * This is because in GLDv3 driver, set multicast 3353 * will always return success. 3354 */ 3355 if (mce == NULL) { 3356 return; 3357 } 3358 3359 mce->mc_fullreap = B_TRUE; 3360 } 3361 3362 /* 3363 * If no pending Tx's remain that reference the AH 3364 * for the mcg, recycle it from active to free list. 
3365 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3366 * so the last completing Tx will cause an async reap 3367 * operation to be invoked, at which time we will drop our 3368 * membership to the mcg so that the pending Tx's complete 3369 * successfully. Refer to comments on "AH and MCE active 3370 * list manipulation" at top of this file. The lock protects 3371 * against Tx fast path and Tx cleanup code. 3372 */ 3373 mutex_enter(&state->id_ac_mutex); 3374 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3375 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3376 IB_MC_JSTATE_SEND_ONLY_NON)); 3377 mutex_exit(&state->id_ac_mutex); 3378 } 3379 3380 if (recycled) { 3381 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3382 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3383 ibd_async_reap_group(state, mce, mgid, jstate); 3384 } 3385 } 3386 3387 /* 3388 * Find the broadcast address as defined by IPoIB; implicitly 3389 * determines the IBA scope, mtu, tclass etc of the link the 3390 * interface is going to be a member of. 3391 */ 3392 static ibt_status_t 3393 ibd_find_bgroup(ibd_state_t *state) 3394 { 3395 ibt_mcg_attr_t mcg_attr; 3396 uint_t numg; 3397 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3398 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3399 IB_MC_SCOPE_GLOBAL }; 3400 int i, mcgmtu; 3401 boolean_t found = B_FALSE; 3402 int ret; 3403 ibt_mcg_info_t mcg_info; 3404 3405 state->id_bgroup_created = B_FALSE; 3406 state->id_bgroup_present = B_FALSE; 3407 3408 query_bcast_grp: 3409 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3410 mcg_attr.mc_pkey = state->id_pkey; 3411 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3412 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3413 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3414 3415 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3416 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3417 3418 /* 3419 * Look for the IPoIB broadcast group. 3420 */ 3421 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3422 state->id_mgid.gid_prefix = 3423 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3424 ((uint64_t)state->id_scope << 48) | 3425 ((uint32_t)(state->id_pkey << 16))); 3426 mcg_attr.mc_mgid = state->id_mgid; 3427 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3428 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3429 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3430 found = B_TRUE; 3431 break; 3432 } 3433 } 3434 3435 if (!found) { 3436 if (state->id_create_broadcast_group) { 3437 /* 3438 * If we created the broadcast group, but failed to 3439 * find it, we can't do anything except leave the 3440 * one we created and return failure. 3441 */ 3442 if (state->id_bgroup_created) { 3443 ibd_print_warn(state, "IPoIB broadcast group " 3444 "absent. 
Unable to query after create."); 3445 goto find_bgroup_fail; 3446 } 3447 3448 /* 3449 * Create the ipoib broadcast group if it didn't exist 3450 */ 3451 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3452 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3453 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3454 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3455 mcg_attr.mc_pkey = state->id_pkey; 3456 mcg_attr.mc_flow = 0; 3457 mcg_attr.mc_sl = 0; 3458 mcg_attr.mc_tclass = 0; 3459 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3460 state->id_mgid.gid_prefix = 3461 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3462 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3463 ((uint32_t)(state->id_pkey << 16))); 3464 mcg_attr.mc_mgid = state->id_mgid; 3465 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3466 3467 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3468 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3469 ibd_print_warn(state, "IPoIB broadcast group " 3470 "absent, create failed: ret = %d\n", ret); 3471 state->id_bgroup_created = B_FALSE; 3472 return (IBT_FAILURE); 3473 } 3474 state->id_bgroup_created = B_TRUE; 3475 goto query_bcast_grp; 3476 } else { 3477 ibd_print_warn(state, "IPoIB broadcast group absent"); 3478 return (IBT_FAILURE); 3479 } 3480 } 3481 3482 /* 3483 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3484 */ 3485 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3486 if (state->id_mtu < mcgmtu) { 3487 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3488 "greater than port's maximum MTU %d", mcgmtu, 3489 state->id_mtu); 3490 ibt_free_mcg_info(state->id_mcinfo, 1); 3491 goto find_bgroup_fail; 3492 } 3493 state->id_mtu = mcgmtu; 3494 state->id_bgroup_present = B_TRUE; 3495 3496 return (IBT_SUCCESS); 3497 3498 find_bgroup_fail: 3499 if (state->id_bgroup_created) { 3500 (void) ibt_leave_mcg(state->id_sgid, 3501 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3502 IB_MC_JSTATE_FULL); 3503 } 3504 3505 return (IBT_FAILURE); 3506 } 3507 3508 static int 3509 ibd_alloc_tx_copybufs(ibd_state_t *state) 3510 { 3511 ibt_mr_attr_t mem_attr; 3512 3513 /* 3514 * Allocate one big chunk for all regular tx copy bufs 3515 */ 3516 state->id_tx_buf_sz = state->id_mtu; 3517 if (state->id_lso_policy && state->id_lso_capable && 3518 (state->id_ud_tx_copy_thresh > state->id_mtu)) { 3519 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh; 3520 } 3521 3522 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe * 3523 state->id_tx_buf_sz, KM_SLEEP); 3524 3525 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe * 3526 sizeof (ibd_swqe_t), KM_SLEEP); 3527 3528 /* 3529 * Do one memory registration on the entire txbuf area 3530 */ 3531 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3532 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz; 3533 mem_attr.mr_as = NULL; 3534 mem_attr.mr_flags = IBT_MR_SLEEP; 3535 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3536 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3537 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3538 kmem_free(state->id_tx_wqes, 3539 state->id_ud_num_swqe * sizeof (ibd_swqe_t)); 3540 kmem_free(state->id_tx_bufs, 3541 state->id_ud_num_swqe * state->id_tx_buf_sz); 3542 state->id_tx_bufs = NULL; 3543 return (DDI_FAILURE); 3544 } 3545 3546 return (DDI_SUCCESS); 3547 } 3548 3549 static int 3550 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3551 { 3552 ibt_mr_attr_t mem_attr; 3553 ibd_lsobuf_t *buflist; 3554 ibd_lsobuf_t *lbufp; 3555 ibd_lsobuf_t *tail; 3556 ibd_lsobkt_t *bktp; 3557 uint8_t 
*membase; 3558 uint8_t *memp; 3559 uint_t memsz; 3560 int i; 3561 3562 /* 3563 * Allocate the lso bucket 3564 */ 3565 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3566 3567 /* 3568 * Allocate the entire lso memory and register it 3569 */ 3570 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ; 3571 membase = kmem_zalloc(memsz, KM_SLEEP); 3572 3573 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3574 mem_attr.mr_len = memsz; 3575 mem_attr.mr_as = NULL; 3576 mem_attr.mr_flags = IBT_MR_SLEEP; 3577 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3578 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3579 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3580 kmem_free(membase, memsz); 3581 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3582 return (DDI_FAILURE); 3583 } 3584 3585 mutex_enter(&state->id_lso_lock); 3586 3587 /* 3588 * Now allocate the buflist. Note that the elements in the buflist and 3589 * the buffers in the lso memory have a permanent 1-1 relation, so we 3590 * can always derive the address of a buflist entry from the address of 3591 * an lso buffer. 3592 */ 3593 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t), 3594 KM_SLEEP); 3595 3596 /* 3597 * Set up the lso buf chain 3598 */ 3599 memp = membase; 3600 lbufp = buflist; 3601 for (i = 0; i < state->id_num_lso_bufs; i++) { 3602 lbufp->lb_isfree = 1; 3603 lbufp->lb_buf = memp; 3604 lbufp->lb_next = lbufp + 1; 3605 3606 tail = lbufp; 3607 3608 memp += IBD_LSO_BUFSZ; 3609 lbufp++; 3610 } 3611 tail->lb_next = NULL; 3612 3613 /* 3614 * Set up the LSO buffer information in ibd state 3615 */ 3616 bktp->bkt_bufl = buflist; 3617 bktp->bkt_free_head = buflist; 3618 bktp->bkt_mem = membase; 3619 bktp->bkt_nelem = state->id_num_lso_bufs; 3620 bktp->bkt_nfree = bktp->bkt_nelem; 3621 3622 state->id_lso = bktp; 3623 mutex_exit(&state->id_lso_lock); 3624 3625 return (DDI_SUCCESS); 3626 } 3627 3628 /* 3629 * Statically allocate Tx buffer list(s). 
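 *
 * A short road map, drawn from the code below: ibd_alloc_tx_copybufs()
 * carves out the copy buffers and the swqe array; if the LSO policy and
 * capability are both set, ibd_alloc_tx_lsobufs() sets up the separate LSO
 * bucket, and a failure there merely clears id_lso_capable rather than
 * failing the whole init. Each of the id_ud_num_swqe swqes is then wired to
 * its slice of the single registered region (swqe i uses the copy buffer at
 * id_tx_bufs + i * id_tx_buf_sz) and pushed onto the id_tx_list free list.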
3630 */ 3631 static int 3632 ibd_init_txlist(ibd_state_t *state) 3633 { 3634 ibd_swqe_t *swqe; 3635 ibt_lkey_t lkey; 3636 int i; 3637 uint_t len; 3638 uint8_t *bufaddr; 3639 3640 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3641 return (DDI_FAILURE); 3642 3643 if (state->id_lso_policy && state->id_lso_capable) { 3644 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3645 state->id_lso_capable = B_FALSE; 3646 } 3647 3648 mutex_enter(&state->id_tx_list.dl_mutex); 3649 state->id_tx_list.dl_head = NULL; 3650 state->id_tx_list.dl_pending_sends = B_FALSE; 3651 state->id_tx_list.dl_cnt = 0; 3652 mutex_exit(&state->id_tx_list.dl_mutex); 3653 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3654 state->id_tx_rel_list.dl_head = NULL; 3655 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3656 state->id_tx_rel_list.dl_cnt = 0; 3657 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3658 3659 /* 3660 * Allocate and setup the swqe list 3661 */ 3662 lkey = state->id_tx_mr_desc.md_lkey; 3663 bufaddr = state->id_tx_bufs; 3664 len = state->id_tx_buf_sz; 3665 swqe = state->id_tx_wqes; 3666 mutex_enter(&state->id_tx_list.dl_mutex); 3667 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) { 3668 swqe->swqe_next = NULL; 3669 swqe->swqe_im_mblk = NULL; 3670 3671 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3672 bufaddr; 3673 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3674 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3675 3676 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3677 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3678 swqe->w_swr.wr_trans = IBT_UD_SRV; 3679 3680 /* These are set in send */ 3681 swqe->w_swr.wr_nds = 0; 3682 swqe->w_swr.wr_sgl = NULL; 3683 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3684 3685 /* add to list */ 3686 state->id_tx_list.dl_cnt++; 3687 swqe->swqe_next = state->id_tx_list.dl_head; 3688 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3689 } 3690 mutex_exit(&state->id_tx_list.dl_mutex); 3691 3692 return (DDI_SUCCESS); 3693 } 3694 3695 static int 3696 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3697 uint32_t *nds_p) 3698 { 3699 ibd_lsobkt_t *bktp; 3700 ibd_lsobuf_t *lbufp; 3701 ibd_lsobuf_t *nextp; 3702 ibt_lkey_t lso_lkey; 3703 uint_t frag_sz; 3704 uint_t num_needed; 3705 int i; 3706 3707 ASSERT(sgl_p != NULL); 3708 ASSERT(nds_p != NULL); 3709 ASSERT(req_sz != 0); 3710 3711 /* 3712 * Determine how many bufs we'd need for the size requested 3713 */ 3714 num_needed = req_sz / IBD_LSO_BUFSZ; 3715 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3716 num_needed++; 3717 3718 mutex_enter(&state->id_lso_lock); 3719 3720 /* 3721 * If we don't have enough lso bufs, return failure 3722 */ 3723 ASSERT(state->id_lso != NULL); 3724 bktp = state->id_lso; 3725 if (bktp->bkt_nfree < num_needed) { 3726 mutex_exit(&state->id_lso_lock); 3727 return (-1); 3728 } 3729 3730 /* 3731 * Pick the first 'num_needed' bufs from the free list 3732 */ 3733 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3734 lbufp = bktp->bkt_free_head; 3735 for (i = 0; i < num_needed; i++) { 3736 ASSERT(lbufp->lb_isfree != 0); 3737 ASSERT(lbufp->lb_buf != NULL); 3738 3739 nextp = lbufp->lb_next; 3740 3741 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3742 sgl_p[i].ds_key = lso_lkey; 3743 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3744 3745 lbufp->lb_isfree = 0; 3746 lbufp->lb_next = NULL; 3747 3748 lbufp = nextp; 3749 } 3750 bktp->bkt_free_head = lbufp; 3751 3752 /* 3753 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3754 * to adjust the last sgl entry's length. 
Since we know we need atleast 3755 * one, the i-1 use below is ok. 3756 */ 3757 if (frag_sz) { 3758 sgl_p[i-1].ds_len = frag_sz; 3759 } 3760 3761 /* 3762 * Update nfree count and return 3763 */ 3764 bktp->bkt_nfree -= num_needed; 3765 3766 mutex_exit(&state->id_lso_lock); 3767 3768 *nds_p = num_needed; 3769 3770 return (0); 3771 } 3772 3773 static void 3774 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3775 { 3776 ibd_lsobkt_t *bktp; 3777 ibd_lsobuf_t *lbufp; 3778 uint8_t *lso_mem_end; 3779 uint_t ndx; 3780 int i; 3781 3782 mutex_enter(&state->id_lso_lock); 3783 3784 bktp = state->id_lso; 3785 ASSERT(bktp != NULL); 3786 3787 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3788 for (i = 0; i < nds; i++) { 3789 uint8_t *va; 3790 3791 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3792 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3793 3794 /* 3795 * Figure out the buflist element this sgl buffer corresponds 3796 * to and put it back at the head 3797 */ 3798 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3799 lbufp = bktp->bkt_bufl + ndx; 3800 3801 ASSERT(lbufp->lb_isfree == 0); 3802 ASSERT(lbufp->lb_buf == va); 3803 3804 lbufp->lb_isfree = 1; 3805 lbufp->lb_next = bktp->bkt_free_head; 3806 bktp->bkt_free_head = lbufp; 3807 } 3808 bktp->bkt_nfree += nds; 3809 3810 mutex_exit(&state->id_lso_lock); 3811 } 3812 3813 static void 3814 ibd_free_tx_copybufs(ibd_state_t *state) 3815 { 3816 /* 3817 * Unregister txbuf mr 3818 */ 3819 if (ibt_deregister_mr(state->id_hca_hdl, 3820 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3821 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3822 } 3823 state->id_tx_mr_hdl = NULL; 3824 3825 /* 3826 * Free txbuf memory 3827 */ 3828 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * 3829 sizeof (ibd_swqe_t)); 3830 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * 3831 state->id_tx_buf_sz); 3832 state->id_tx_wqes = NULL; 3833 state->id_tx_bufs = NULL; 3834 } 3835 3836 static void 3837 ibd_free_tx_lsobufs(ibd_state_t *state) 3838 { 3839 ibd_lsobkt_t *bktp; 3840 3841 mutex_enter(&state->id_lso_lock); 3842 3843 if ((bktp = state->id_lso) == NULL) { 3844 mutex_exit(&state->id_lso_lock); 3845 return; 3846 } 3847 3848 /* 3849 * First, free the buflist 3850 */ 3851 ASSERT(bktp->bkt_bufl != NULL); 3852 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3853 3854 /* 3855 * Unregister the LSO memory and free it 3856 */ 3857 ASSERT(bktp->bkt_mr_hdl != NULL); 3858 if (ibt_deregister_mr(state->id_hca_hdl, 3859 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3860 DPRINT(10, 3861 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3862 } 3863 ASSERT(bktp->bkt_mem); 3864 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3865 3866 /* 3867 * Finally free the bucket 3868 */ 3869 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3870 state->id_lso = NULL; 3871 3872 mutex_exit(&state->id_lso_lock); 3873 } 3874 3875 /* 3876 * Free the statically allocated Tx buffer list. 
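 *
 * Note (from the code below): both id_tx_list and id_tx_rel_list are reset
 * with their dl_mutex locks held, taking id_tx_list.dl_mutex before
 * id_tx_rel_list.dl_mutex; the LSO bucket is then torn down before the copy
 * buffers, i.e. in the reverse of the allocation order used by
 * ibd_init_txlist().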
3877 */ 3878 static void 3879 ibd_fini_txlist(ibd_state_t *state) 3880 { 3881 /* 3882 * Free the allocated swqes 3883 */ 3884 mutex_enter(&state->id_tx_list.dl_mutex); 3885 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3886 state->id_tx_list.dl_head = NULL; 3887 state->id_tx_list.dl_pending_sends = B_FALSE; 3888 state->id_tx_list.dl_cnt = 0; 3889 state->id_tx_rel_list.dl_head = NULL; 3890 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3891 state->id_tx_rel_list.dl_cnt = 0; 3892 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3893 mutex_exit(&state->id_tx_list.dl_mutex); 3894 3895 ibd_free_tx_lsobufs(state); 3896 ibd_free_tx_copybufs(state); 3897 } 3898 3899 /* 3900 * post a list of rwqes, NULL terminated. 3901 */ 3902 static void 3903 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3904 { 3905 uint_t i; 3906 uint_t num_posted; 3907 ibt_status_t ibt_status; 3908 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3909 3910 while (rwqe) { 3911 /* Post up to IBD_RX_POST_CNT receive work requests */ 3912 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3913 wrs[i] = rwqe->w_rwr; 3914 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3915 if (rwqe == NULL) { 3916 i++; 3917 break; 3918 } 3919 } 3920 3921 /* 3922 * If posting fails for some reason, we'll never receive 3923 * completion intimation, so we'll need to cleanup. But 3924 * we need to make sure we don't clean up nodes whose 3925 * wrs have been successfully posted. We assume that the 3926 * hca driver returns on the first failure to post and 3927 * therefore the first 'num_posted' entries don't need 3928 * cleanup here. 3929 */ 3930 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3931 3932 num_posted = 0; 3933 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3934 &num_posted); 3935 if (ibt_status != IBT_SUCCESS) { 3936 /* This cannot happen unless the device has an error. */ 3937 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3938 "posting multiple wrs failed: " 3939 "requested=%d, done=%d, ret=%d", 3940 IBD_RX_POST_CNT, num_posted, ibt_status); 3941 atomic_add_32(&state->id_rx_list.dl_cnt, 3942 num_posted - i); 3943 } 3944 } 3945 } 3946 3947 /* 3948 * Grab a list of rwqes from the array of lists, and post the list. 3949 */ 3950 static void 3951 ibd_post_recv_intr(ibd_state_t *state) 3952 { 3953 ibd_rx_queue_t *rxp; 3954 ibd_rwqe_t *list; 3955 3956 /* rotate through the rx_queue array, expecting an adequate number */ 3957 state->id_rx_post_queue_index = 3958 (state->id_rx_post_queue_index + 1) & 3959 (state->id_rx_nqueues - 1); 3960 3961 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3962 mutex_enter(&rxp->rx_post_lock); 3963 list = WQE_TO_RWQE(rxp->rx_head); 3964 rxp->rx_head = NULL; 3965 rxp->rx_cnt = 0; 3966 mutex_exit(&rxp->rx_post_lock); 3967 ibd_post_recv_list(state, list); 3968 } 3969 3970 /* macro explained below */ 3971 #define RX_QUEUE_HASH(rwqe) \ 3972 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3973 3974 /* 3975 * Add a rwqe to one of the the Rx lists. If the list is large enough 3976 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3977 * 3978 * Note: one of 2^N lists is chosen via a hash. This is done 3979 * because using one list is contentious. If the first list is busy 3980 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3981 * 3982 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3983 * even distribution of mapping rwqes to the 2^N queues. 
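 *
 * A worked example (hypothetical numbers): with id_rx_nqueues == 16, an
 * rwqe at address 0x30002a6400 hashes to (0x30002a6400 >> 8) & 0xf == 4,
 * i.e. queue 4. On a mutex_tryenter() failure the code below rehashes
 * (rwqe + 16); since that is pointer arithmetic, the address used for the
 * hash moves by 16 * sizeof (ibd_rwqe_t) bytes, which normally lands on a
 * different queue.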
3984 */ 3985 static void 3986 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3987 { 3988 ibd_rx_queue_t *rxp; 3989 3990 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3991 3992 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3993 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3994 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3995 mutex_enter(&rxp->rx_post_lock); 3996 } 3997 rwqe->rwqe_next = rxp->rx_head; 3998 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 3999 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 4000 4001 /* only call ibt_post_recv() every Nth time through here */ 4002 if ((active & (state->id_rx_nqueues - 1)) == 0) { 4003 rxp->rx_head = NULL; 4004 rxp->rx_cnt = 0; 4005 mutex_exit(&rxp->rx_post_lock); 4006 ibd_post_recv_list(state, rwqe); 4007 return; 4008 } 4009 } 4010 rxp->rx_head = RWQE_TO_WQE(rwqe); 4011 mutex_exit(&rxp->rx_post_lock); 4012 } 4013 4014 static int 4015 ibd_alloc_rx_copybufs(ibd_state_t *state) 4016 { 4017 ibt_mr_attr_t mem_attr; 4018 int i; 4019 4020 /* 4021 * Allocate one big chunk for all regular rx copy bufs 4022 */ 4023 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 4024 4025 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe * 4026 state->id_rx_buf_sz, KM_SLEEP); 4027 4028 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe * 4029 sizeof (ibd_rwqe_t), KM_SLEEP); 4030 4031 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 4032 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 4033 sizeof (ibd_rx_queue_t), KM_SLEEP); 4034 for (i = 0; i < state->id_rx_nqueues; i++) { 4035 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4036 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 4037 } 4038 4039 /* 4040 * Do one memory registration on the entire rxbuf area 4041 */ 4042 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 4043 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz; 4044 mem_attr.mr_as = NULL; 4045 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 4046 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 4047 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 4048 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 4049 kmem_free(state->id_rx_wqes, 4050 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); 4051 kmem_free(state->id_rx_bufs, 4052 state->id_ud_num_rwqe * state->id_rx_buf_sz); 4053 state->id_rx_bufs = NULL; 4054 state->id_rx_wqes = NULL; 4055 return (DDI_FAILURE); 4056 } 4057 4058 return (DDI_SUCCESS); 4059 } 4060 4061 /* 4062 * Allocate the statically allocated Rx buffer list. 4063 */ 4064 static int 4065 ibd_init_rxlist(ibd_state_t *state) 4066 { 4067 ibd_rwqe_t *rwqe, *next; 4068 ibd_wqe_t *list; 4069 ibt_lkey_t lkey; 4070 int i; 4071 uint_t len; 4072 uint8_t *bufaddr; 4073 4074 mutex_enter(&state->id_rx_free_list.dl_mutex); 4075 if (state->id_rx_free_list.dl_head != NULL) { 4076 /* rx rsrcs were never freed. 
Just repost them */ 4077 len = state->id_rx_buf_sz; 4078 list = state->id_rx_free_list.dl_head; 4079 state->id_rx_free_list.dl_head = NULL; 4080 state->id_rx_free_list.dl_cnt = 0; 4081 mutex_exit(&state->id_rx_free_list.dl_mutex); 4082 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4083 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4084 if ((rwqe->rwqe_im_mblk = desballoc( 4085 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 4086 &rwqe->w_freemsg_cb)) == NULL) { 4087 /* allow freemsg_cb to free the rwqes */ 4088 if (atomic_dec_32_nv(&state->id_running) != 0) { 4089 cmn_err(CE_WARN, "ibd_init_rxlist: " 4090 "id_running was not 1\n"); 4091 } 4092 DPRINT(10, "ibd_init_rxlist : " 4093 "failed in desballoc()"); 4094 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4095 rwqe = next) { 4096 next = WQE_TO_RWQE(rwqe->rwqe_next); 4097 if (rwqe->rwqe_im_mblk) { 4098 atomic_inc_32(&state-> 4099 id_rx_list. 4100 dl_bufs_outstanding); 4101 freemsg(rwqe->rwqe_im_mblk); 4102 } else 4103 ibd_free_rwqe(state, rwqe); 4104 } 4105 atomic_inc_32(&state->id_running); 4106 return (DDI_FAILURE); 4107 } 4108 } 4109 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4110 return (DDI_SUCCESS); 4111 } 4112 mutex_exit(&state->id_rx_free_list.dl_mutex); 4113 4114 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 4115 return (DDI_FAILURE); 4116 4117 /* 4118 * Allocate and setup the rwqe list 4119 */ 4120 len = state->id_rx_buf_sz; 4121 lkey = state->id_rx_mr_desc.md_lkey; 4122 rwqe = state->id_rx_wqes; 4123 bufaddr = state->id_rx_bufs; 4124 list = NULL; 4125 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) { 4126 rwqe->w_state = state; 4127 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 4128 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 4129 4130 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 4131 4132 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 4133 &rwqe->w_freemsg_cb)) == NULL) { 4134 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 4135 /* allow freemsg_cb to free the rwqes */ 4136 if (atomic_dec_32_nv(&state->id_running) != 0) { 4137 cmn_err(CE_WARN, "ibd_init_rxlist: " 4138 "id_running was not 1\n"); 4139 } 4140 DPRINT(10, "ibd_init_rxlist : " 4141 "failed in desballoc()"); 4142 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4143 rwqe = next) { 4144 next = WQE_TO_RWQE(rwqe->rwqe_next); 4145 freemsg(rwqe->rwqe_im_mblk); 4146 } 4147 atomic_inc_32(&state->id_running); 4148 4149 /* remove reference to free'd rwqes */ 4150 mutex_enter(&state->id_rx_free_list.dl_mutex); 4151 state->id_rx_free_list.dl_head = NULL; 4152 state->id_rx_free_list.dl_cnt = 0; 4153 mutex_exit(&state->id_rx_free_list.dl_mutex); 4154 4155 ibd_fini_rxlist(state); 4156 return (DDI_FAILURE); 4157 } 4158 4159 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 4160 rwqe->rwqe_copybuf.ic_sgl.ds_va = 4161 (ib_vaddr_t)(uintptr_t)bufaddr; 4162 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 4163 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 4164 rwqe->w_rwr.wr_nds = 1; 4165 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 4166 4167 rwqe->rwqe_next = list; 4168 list = RWQE_TO_WQE(rwqe); 4169 } 4170 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4171 4172 return (DDI_SUCCESS); 4173 } 4174 4175 static void 4176 ibd_free_rx_copybufs(ibd_state_t *state) 4177 { 4178 int i; 4179 4180 /* 4181 * Unregister rxbuf mr 4182 */ 4183 if (ibt_deregister_mr(state->id_hca_hdl, 4184 state->id_rx_mr_hdl) != IBT_SUCCESS) { 4185 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 4186 } 4187 state->id_rx_mr_hdl = NULL; 4188 4189 /* 4190 * Free rxbuf memory 4191 */ 4192 for (i = 0; i < 
state->id_rx_nqueues; i++) { 4193 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4194 mutex_destroy(&rxp->rx_post_lock); 4195 } 4196 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 4197 sizeof (ibd_rx_queue_t)); 4198 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * 4199 sizeof (ibd_rwqe_t)); 4200 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * 4201 state->id_rx_buf_sz); 4202 state->id_rx_queues = NULL; 4203 state->id_rx_wqes = NULL; 4204 state->id_rx_bufs = NULL; 4205 } 4206 4207 static void 4208 ibd_free_rx_rsrcs(ibd_state_t *state) 4209 { 4210 mutex_enter(&state->id_rx_free_list.dl_mutex); 4211 if (state->id_rx_free_list.dl_head == NULL) { 4212 /* already freed */ 4213 mutex_exit(&state->id_rx_free_list.dl_mutex); 4214 return; 4215 } 4216 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe); 4217 ibd_free_rx_copybufs(state); 4218 state->id_rx_free_list.dl_cnt = 0; 4219 state->id_rx_free_list.dl_head = NULL; 4220 mutex_exit(&state->id_rx_free_list.dl_mutex); 4221 } 4222 4223 /* 4224 * Free the statically allocated Rx buffer list. 4225 */ 4226 static void 4227 ibd_fini_rxlist(ibd_state_t *state) 4228 { 4229 ibd_rwqe_t *rwqe; 4230 int i; 4231 4232 /* run through the rx_queue's, calling freemsg() */ 4233 for (i = 0; i < state->id_rx_nqueues; i++) { 4234 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4235 mutex_enter(&rxp->rx_post_lock); 4236 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 4237 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4238 freemsg(rwqe->rwqe_im_mblk); 4239 rxp->rx_cnt--; 4240 } 4241 rxp->rx_head = NULL; 4242 mutex_exit(&rxp->rx_post_lock); 4243 } 4244 4245 /* cannot free rx resources unless gld returned everything */ 4246 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 4247 ibd_free_rx_rsrcs(state); 4248 } 4249 4250 /* 4251 * Free an allocated recv wqe. 4252 */ 4253 /* ARGSUSED */ 4254 static void 4255 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 4256 { 4257 /* 4258 * desballoc() failed (no memory). 4259 * 4260 * This rwqe is placed on a free list so that it 4261 * can be reinstated when memory is available. 4262 * 4263 * NOTE: no code currently exists to reinstate 4264 * these "lost" rwqes. 4265 */ 4266 mutex_enter(&state->id_rx_free_list.dl_mutex); 4267 state->id_rx_free_list.dl_cnt++; 4268 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4269 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4270 mutex_exit(&state->id_rx_free_list.dl_mutex); 4271 } 4272 4273 /* 4274 * IBA Rx completion queue handler. Guaranteed to be single 4275 * threaded and nonreentrant for this CQ. 4276 */ 4277 /* ARGSUSED */ 4278 static void 4279 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4280 { 4281 ibd_state_t *state = (ibd_state_t *)arg; 4282 4283 atomic_inc_64(&state->id_num_intrs); 4284 4285 if (ibd_rx_softintr == 1) { 4286 mutex_enter(&state->id_rcq_poll_lock); 4287 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4288 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4289 mutex_exit(&state->id_rcq_poll_lock); 4290 return; 4291 } else { 4292 mutex_exit(&state->id_rcq_poll_lock); 4293 ddi_trigger_softintr(state->id_rx); 4294 } 4295 } else 4296 (void) ibd_intr((caddr_t)state); 4297 } 4298 4299 /* 4300 * CQ handler for Tx completions, when the Tx CQ is in 4301 * interrupt driven mode. 
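 * The logic mirrors ibd_rcq_handler() above: with ibd_tx_softintr set, the
 * handler checks id_scq_poll_busy under id_scq_poll_lock; if a poll is
 * already in progress it only sets IBD_REDO_CQ_POLLING so the poller makes
 * another pass, otherwise it triggers the Tx soft interrupt. With soft
 * interrupts disabled, ibd_tx_recycle() is called directly.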
4302 */ 4303 /* ARGSUSED */ 4304 static void 4305 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4306 { 4307 ibd_state_t *state = (ibd_state_t *)arg; 4308 4309 atomic_inc_64(&state->id_num_intrs); 4310 4311 if (ibd_tx_softintr == 1) { 4312 mutex_enter(&state->id_scq_poll_lock); 4313 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4314 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4315 mutex_exit(&state->id_scq_poll_lock); 4316 return; 4317 } else { 4318 mutex_exit(&state->id_scq_poll_lock); 4319 ddi_trigger_softintr(state->id_tx); 4320 } 4321 } else 4322 (void) ibd_tx_recycle((caddr_t)state); 4323 } 4324 4325 /* 4326 * Multicast group create/delete trap handler. These will be delivered 4327 * on a kernel thread (handling can thus block) and can be invoked 4328 * concurrently. The handler can be invoked anytime after it is 4329 * registered and before ibt_detach(). 4330 */ 4331 /* ARGSUSED */ 4332 static void 4333 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4334 ibt_subnet_event_t *event) 4335 { 4336 ibd_state_t *state = (ibd_state_t *)arg; 4337 ibd_req_t *req; 4338 4339 /* 4340 * The trap handler will get invoked once for every event for 4341 * every port. The input "gid" is the GID0 of the port the 4342 * trap came in on; we just need to act on traps that came 4343 * to our port, meaning the port on which the ipoib interface 4344 * resides. Since ipoib uses GID0 of the port, we just match 4345 * the gids to check whether we need to handle the trap. 4346 */ 4347 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4348 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4349 return; 4350 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4351 4352 DPRINT(10, "ibd_notices_handler : %d\n", code); 4353 4354 switch (code) { 4355 case IBT_SM_EVENT_UNAVAILABLE: 4356 /* 4357 * If we are in promiscuous mode or have 4358 * sendnonmembers, we need to print a warning 4359 * message right now. Else, just store the 4360 * information, print when we enter promiscuous 4361 * mode or attempt nonmember send. We might 4362 * also want to stop caching sendnonmember. 4363 */ 4364 ibd_print_warn(state, "IBA multicast support " 4365 "degraded due to unavailability of multicast " 4366 "traps"); 4367 break; 4368 case IBT_SM_EVENT_AVAILABLE: 4369 /* 4370 * If we printed a warning message above or 4371 * while trying to nonmember send or get into 4372 * promiscuous mode, print an okay message. 4373 */ 4374 ibd_print_warn(state, "IBA multicast support " 4375 "restored due to availability of multicast " 4376 "traps"); 4377 break; 4378 case IBT_SM_EVENT_MCG_CREATED: 4379 case IBT_SM_EVENT_MCG_DELETED: 4380 /* 4381 * If it is a "deleted" event and we are in late hca 4382 * init, nothing to do. 4383 */ 4384 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4385 IBD_DRV_IN_LATE_HCA_INIT) && (code == 4386 IBT_SM_EVENT_MCG_DELETED)) { 4387 break; 4388 } 4389 /* 4390 * Common processing of creation/deletion traps. 4391 * First check if the instance is being 4392 * [de]initialized; back off then, without doing 4393 * anything more, since we are not sure if the 4394 * async thread is around, or whether we might 4395 * be racing with the detach code in ibd_m_stop() 4396 * that scans the mcg list. 
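 * If it is safe to proceed, the trap is not handled inline; it is packaged
 * as an IBD_ASYNC_TRAP request and queued for the async thread, which
 * processes it in ibd_async_trap() below.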
4397 */ 4398 if (!ibd_async_safe(state)) 4399 return; 4400 4401 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4402 req->rq_gid = event->sm_notice_gid; 4403 req->rq_ptr = (void *)code; 4404 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4405 break; 4406 } 4407 } 4408 4409 static void 4410 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4411 { 4412 ib_gid_t mgid = req->rq_gid; 4413 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4414 int ret; 4415 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff; 4416 4417 DPRINT(10, "ibd_async_trap : %d\n", code); 4418 4419 /* 4420 * Check if we have already joined the IPoIB broadcast group for our 4421 * PKEY. If joined, perform the rest of the operation. 4422 * Else, the interface is not initialised. Do the initialisation here 4423 * by calling ibd_start() and return. 4424 */ 4425 4426 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4427 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) && 4428 (code == IBT_SM_EVENT_MCG_CREATED)) { 4429 /* 4430 * If we are in late HCA init and a notification for the 4431 * creation of a MCG came in, check if it is the IPoIB MCG for 4432 * this pkey. If not, return. 4433 */ 4434 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey != 4435 state->id_pkey)) { 4436 ibd_async_done(state); 4437 return; 4438 } 4439 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4440 /* 4441 * Check if there is still a necessity to start the interface. 4442 * It is possible that the user attempted unplumb at just about 4443 * the same time, and if unplumb succeeded, we have nothing to 4444 * do. 4445 */ 4446 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4447 IBD_DRV_IN_LATE_HCA_INIT) && 4448 ((ret = ibd_start(state)) != 0)) { 4449 DPRINT(10, "ibd_async_trap: cannot start from late HCA " 4450 "init, ret=%d", ret); 4451 } 4452 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4453 ibd_async_done(state); 4454 return; 4455 } 4456 4457 /* 4458 * Atomically search the nonmember and sendonlymember lists and 4459 * delete. 4460 */ 4461 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4462 4463 if (state->id_prom_op == IBD_OP_COMPLETED) { 4464 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4465 4466 /* 4467 * If in promiscuous mode, try to join/attach to the new 4468 * mcg. Given the unreliable out-of-order mode of trap 4469 * delivery, we can never be sure whether it is a problem 4470 * if the join fails. Thus, we warn the admin of a failure 4471 * if this was a creation trap. Note that the trap might 4472 * actually be reporting a long past event, and the mcg 4473 * might already have been deleted, thus we might be warning 4474 * in vain. 4475 */ 4476 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4477 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4478 ibd_print_warn(state, "IBA promiscuous mode missed " 4479 "new multicast gid %016llx:%016llx", 4480 (u_longlong_t)mgid.gid_prefix, 4481 (u_longlong_t)mgid.gid_guid); 4482 } 4483 4484 /* 4485 * Free the request slot allocated by the subnet event thread. 4486 */ 4487 ibd_async_done(state); 4488 } 4489 4490 /* 4491 * GLDv3 entry point to get capabilities. 
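 *
 * As the code below shows, only partition objects report capabilities; a
 * port driver instance always returns B_FALSE. Two capabilities can be
 * reported: MAC_CAPAB_HCKSUM (full checksum only, and only when the HCA
 * offers IBT_HCA_CKSUM_FULL) and MAC_CAPAB_LSO (which additionally requires
 * the LSO policy and capability, hardware checksum, and the reserved-lkey
 * capability).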
4492 */ 4493 static boolean_t 4494 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4495 { 4496 ibd_state_t *state = arg; 4497 4498 if (state->id_type == IBD_PORT_DRIVER) 4499 return (B_FALSE); 4500 4501 switch (cap) { 4502 case MAC_CAPAB_HCKSUM: { 4503 uint32_t *txflags = cap_data; 4504 4505 /* 4506 * We either do full checksum or not do it at all 4507 */ 4508 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4509 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4510 else 4511 return (B_FALSE); 4512 break; 4513 } 4514 4515 case MAC_CAPAB_LSO: { 4516 mac_capab_lso_t *cap_lso = cap_data; 4517 4518 /* 4519 * In addition to the capability and policy, since LSO 4520 * relies on hw checksum, we'll not enable LSO if we 4521 * don't have hw checksum. Of course, if the HCA doesn't 4522 * provide the reserved lkey capability, enabling LSO will 4523 * actually affect performance adversely, so we'll disable 4524 * LSO even for that case. 4525 */ 4526 if (!state->id_lso_policy || !state->id_lso_capable) 4527 return (B_FALSE); 4528 4529 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4530 return (B_FALSE); 4531 4532 if (state->id_hca_res_lkey_capab == 0) { 4533 ibd_print_warn(state, "no reserved-lkey capability, " 4534 "disabling LSO"); 4535 return (B_FALSE); 4536 } 4537 4538 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4539 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4540 break; 4541 } 4542 4543 default: 4544 return (B_FALSE); 4545 } 4546 4547 return (B_TRUE); 4548 } 4549 4550 /* 4551 * callback function for set/get of properties 4552 */ 4553 static int 4554 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4555 uint_t pr_valsize, const void *pr_val) 4556 { 4557 ibd_state_t *state = arg; 4558 int err = 0; 4559 uint32_t link_mode; 4560 4561 /* Cannot set properties on a port driver */ 4562 if (state->id_type == IBD_PORT_DRIVER) { 4563 return (ENOTSUP); 4564 } 4565 4566 switch (pr_num) { 4567 case MAC_PROP_IB_LINKMODE: 4568 if (state->id_mac_state & IBD_DRV_STARTED) { 4569 err = EBUSY; 4570 break; 4571 } 4572 if (pr_val == NULL) { 4573 err = EINVAL; 4574 break; 4575 } 4576 bcopy(pr_val, &link_mode, sizeof (link_mode)); 4577 if (link_mode != IBD_LINK_MODE_UD && 4578 link_mode != IBD_LINK_MODE_RC) { 4579 err = EINVAL; 4580 } else { 4581 if (link_mode == IBD_LINK_MODE_RC) { 4582 if (state->id_enable_rc) { 4583 return (0); 4584 } 4585 state->id_enable_rc = 1; 4586 /* inform MAC framework of new MTU */ 4587 err = mac_maxsdu_update(state->id_mh, 4588 state->rc_mtu - IPOIB_HDRSIZE); 4589 } else { 4590 if (!state->id_enable_rc) { 4591 return (0); 4592 } 4593 state->id_enable_rc = 0; 4594 err = mac_maxsdu_update(state->id_mh, 4595 state->id_mtu - IPOIB_HDRSIZE); 4596 } 4597 (void) ibd_record_capab(state); 4598 mac_capab_update(state->id_mh); 4599 } 4600 break; 4601 case MAC_PROP_PRIVATE: 4602 err = ibd_set_priv_prop(state, pr_name, 4603 pr_valsize, pr_val); 4604 break; 4605 default: 4606 err = ENOTSUP; 4607 break; 4608 } 4609 return (err); 4610 } 4611 4612 static int 4613 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4614 uint_t pr_valsize, void *pr_val) 4615 { 4616 ibd_state_t *state = arg; 4617 int err = 0; 4618 4619 switch (pr_num) { 4620 case MAC_PROP_MTU: 4621 break; 4622 default: 4623 if (state->id_type == IBD_PORT_DRIVER) { 4624 return (ENOTSUP); 4625 } 4626 break; 4627 } 4628 4629 switch (pr_num) { 4630 case MAC_PROP_IB_LINKMODE: 4631 *(uint_t *)pr_val = state->id_enable_rc; 4632 break; 4633 case MAC_PROP_PRIVATE: 4634 err = 
ibd_get_priv_prop(state, pr_name, pr_valsize, 4635 pr_val); 4636 break; 4637 default: 4638 err = ENOTSUP; 4639 break; 4640 } 4641 return (err); 4642 } 4643 4644 static void 4645 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4646 mac_prop_info_handle_t prh) 4647 { 4648 ibd_state_t *state = arg; 4649 4650 switch (pr_num) { 4651 case MAC_PROP_IB_LINKMODE: { 4652 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE); 4653 break; 4654 } 4655 case MAC_PROP_MTU: { 4656 uint32_t min, max; 4657 if (state->id_type == IBD_PORT_DRIVER) { 4658 min = 1500; 4659 max = IBD_DEF_RC_MAX_SDU; 4660 } else if (state->id_enable_rc) { 4661 min = max = IBD_DEF_RC_MAX_SDU; 4662 } else { 4663 min = max = state->id_mtu - IPOIB_HDRSIZE; 4664 } 4665 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4666 mac_prop_info_set_range_uint32(prh, min, max); 4667 break; 4668 } 4669 case MAC_PROP_PRIVATE: { 4670 char valstr[64]; 4671 int value; 4672 4673 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 4674 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4675 return; 4676 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4677 value = IBD_DEF_COALESCE_COMPLETIONS; 4678 } else if (strcmp(pr_name, 4679 "_ibd_create_broadcast_group") == 0) { 4680 value = IBD_DEF_CREATE_BCAST_GROUP; 4681 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4682 value = IBD_DEF_HASH_SIZE; 4683 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4684 value = IBD_DEF_LSO_POLICY; 4685 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4686 value = IBD_DEF_NUM_AH; 4687 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4688 value = IBD_DEF_NUM_LSO_BUFS; 4689 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4690 value = IBD_DEF_RC_ENABLE_SRQ; 4691 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4692 value = IBD_DEF_RC_NUM_RWQE; 4693 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4694 value = IBD_DEF_RC_NUM_SRQ; 4695 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4696 value = IBD_DEF_RC_NUM_SWQE; 4697 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4698 value = IBD_DEF_RC_RX_COMP_COUNT; 4699 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4700 value = IBD_DEF_RC_RX_COMP_USEC; 4701 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4702 value = IBD_DEF_RC_RX_COPY_THRESH; 4703 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4704 value = IBD_DEF_RC_RX_RWQE_THRESH; 4705 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4706 value = IBD_DEF_RC_TX_COMP_COUNT; 4707 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4708 value = IBD_DEF_RC_TX_COMP_USEC; 4709 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4710 value = IBD_DEF_RC_TX_COPY_THRESH; 4711 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4712 value = IBD_DEF_UD_NUM_RWQE; 4713 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4714 value = IBD_DEF_UD_NUM_SWQE; 4715 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4716 value = IBD_DEF_UD_RX_COMP_COUNT; 4717 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4718 value = IBD_DEF_UD_RX_COMP_USEC; 4719 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4720 value = IBD_DEF_UD_TX_COMP_COUNT; 4721 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4722 value = IBD_DEF_UD_TX_COMP_USEC; 4723 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4724 value = IBD_DEF_UD_TX_COPY_THRESH; 4725 } else { 4726 return; 4727 } 4728 4729 (void) snprintf(valstr, sizeof 
(valstr), "%d", value); 4730 mac_prop_info_set_default_str(prh, valstr); 4731 break; 4732 } 4733 } /* switch (pr_num) */ 4734 } 4735 4736 /* ARGSUSED2 */ 4737 static int 4738 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name, 4739 uint_t pr_valsize, const void *pr_val) 4740 { 4741 int err = 0; 4742 long result; 4743 4744 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4745 if (pr_val == NULL) { 4746 return (EINVAL); 4747 } 4748 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4749 if (result < 0 || result > 1) { 4750 err = EINVAL; 4751 } else { 4752 state->id_allow_coalesce_comp_tuning = (result == 1) ? 4753 B_TRUE: B_FALSE; 4754 } 4755 return (err); 4756 } 4757 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 4758 if (state->id_mac_state & IBD_DRV_STARTED) { 4759 return (EBUSY); 4760 } 4761 if (pr_val == NULL) { 4762 return (EINVAL); 4763 } 4764 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4765 if (result < 0 || result > 1) { 4766 err = EINVAL; 4767 } else { 4768 state->id_create_broadcast_group = (result == 1) ? 4769 B_TRUE: B_FALSE; 4770 } 4771 return (err); 4772 } 4773 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4774 if (state->id_mac_state & IBD_DRV_STARTED) { 4775 return (EBUSY); 4776 } 4777 if (pr_val == NULL) { 4778 return (EINVAL); 4779 } 4780 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4781 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) { 4782 err = EINVAL; 4783 } else { 4784 state->id_hash_size = (uint32_t)result; 4785 } 4786 return (err); 4787 } 4788 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4789 if (state->id_mac_state & IBD_DRV_STARTED) { 4790 return (EBUSY); 4791 } 4792 if (pr_val == NULL) { 4793 return (EINVAL); 4794 } 4795 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4796 if (result < 0 || result > 1) { 4797 err = EINVAL; 4798 } else { 4799 state->id_lso_policy = (result == 1) ? 4800 B_TRUE: B_FALSE; 4801 } 4802 mac_capab_update(state->id_mh); 4803 return (err); 4804 } 4805 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4806 if (state->id_mac_state & IBD_DRV_STARTED) { 4807 return (EBUSY); 4808 } 4809 if (pr_val == NULL) { 4810 return (EINVAL); 4811 } 4812 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4813 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) { 4814 err = EINVAL; 4815 } else { 4816 state->id_num_ah = (uint32_t)result; 4817 } 4818 return (err); 4819 } 4820 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4821 if (state->id_mac_state & IBD_DRV_STARTED) { 4822 return (EBUSY); 4823 } 4824 if (!state->id_lso_policy || !state->id_lso_capable) { 4825 return (EINVAL); 4826 } 4827 if (pr_val == NULL) { 4828 return (EINVAL); 4829 } 4830 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4831 if (result < IBD_MIN_NUM_LSO_BUFS || 4832 result > IBD_MAX_NUM_LSO_BUFS) { 4833 err = EINVAL; 4834 } else { 4835 state->id_num_lso_bufs = (uint32_t)result; 4836 } 4837 return (err); 4838 } 4839 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4840 if (state->id_mac_state & IBD_DRV_STARTED) { 4841 return (EBUSY); 4842 } 4843 if (pr_val == NULL) { 4844 return (EINVAL); 4845 } 4846 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4847 if (result < 0 || result > 1) { 4848 err = EINVAL; 4849 } else { 4850 state->rc_enable_srq = (result == 1) ? 
4851 B_TRUE: B_FALSE; 4852 } 4853 if (!state->rc_enable_srq) { 4854 state->id_rc_num_srq = 0; 4855 } 4856 return (err); 4857 } 4858 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4859 if (state->id_mac_state & IBD_DRV_STARTED) { 4860 return (EBUSY); 4861 } 4862 if (pr_val == NULL) { 4863 return (EINVAL); 4864 } 4865 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4866 if (result < IBD_MIN_RC_NUM_RWQE || 4867 result > IBD_MAX_RC_NUM_RWQE) { 4868 err = EINVAL; 4869 } else { 4870 state->id_rc_num_rwqe = (uint32_t)result; 4871 if (state->id_allow_coalesce_comp_tuning && 4872 state->id_rc_rx_comp_count > state->id_rc_num_rwqe) 4873 state->id_rc_rx_comp_count = 4874 state->id_rc_num_rwqe; 4875 if (state->id_rc_num_srq > state->id_rc_num_rwqe) 4876 state->id_rc_num_srq = 4877 state->id_rc_num_rwqe - 1; 4878 /* 4879 * If rx_rwqe_threshold is greater than the number of 4880 * rwqes, pull it back to 25% of number of rwqes. 4881 */ 4882 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe) 4883 state->id_rc_rx_rwqe_thresh = 4884 (state->id_rc_num_rwqe >> 2); 4885 4886 } 4887 return (err); 4888 } 4889 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4890 if (state->id_mac_state & IBD_DRV_STARTED) { 4891 return (EBUSY); 4892 } 4893 if (pr_val == NULL) { 4894 return (EINVAL); 4895 } 4896 if (!state->rc_enable_srq) 4897 return (EINVAL); 4898 4899 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4900 if (result < IBD_MIN_RC_NUM_SRQ || 4901 result >= state->id_rc_num_rwqe) { 4902 err = EINVAL; 4903 } else 4904 state->id_rc_num_srq = (uint32_t)result; 4905 return (err); 4906 } 4907 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4908 if (state->id_mac_state & IBD_DRV_STARTED) { 4909 return (EBUSY); 4910 } 4911 if (pr_val == NULL) { 4912 return (EINVAL); 4913 } 4914 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4915 if (result < IBD_MIN_RC_NUM_SWQE || 4916 result > IBD_MAX_RC_NUM_SWQE) { 4917 err = EINVAL; 4918 } else { 4919 state->id_rc_num_swqe = (uint32_t)result; 4920 if (state->id_allow_coalesce_comp_tuning && 4921 state->id_rc_tx_comp_count > state->id_rc_num_swqe) 4922 state->id_rc_tx_comp_count = 4923 state->id_rc_num_swqe; 4924 } 4925 return (err); 4926 } 4927 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4928 if (!state->id_allow_coalesce_comp_tuning) { 4929 return (ENOTSUP); 4930 } 4931 if (pr_val == NULL) { 4932 return (EINVAL); 4933 } 4934 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4935 if (result < 1 || result > state->id_rc_num_rwqe) { 4936 err = EINVAL; 4937 } else { 4938 state->id_rc_rx_comp_count = (uint32_t)result; 4939 } 4940 return (err); 4941 } 4942 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4943 if (!state->id_allow_coalesce_comp_tuning) { 4944 return (ENOTSUP); 4945 } 4946 if (pr_val == NULL) { 4947 return (EINVAL); 4948 } 4949 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4950 if (result < 1) { 4951 err = EINVAL; 4952 } else { 4953 state->id_rc_rx_comp_usec = (uint32_t)result; 4954 } 4955 return (err); 4956 } 4957 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4958 if (state->id_mac_state & IBD_DRV_STARTED) { 4959 return (EBUSY); 4960 } 4961 if (pr_val == NULL) { 4962 return (EINVAL); 4963 } 4964 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4965 if (result < IBD_MIN_RC_RX_COPY_THRESH || 4966 result > state->rc_mtu) { 4967 err = EINVAL; 4968 } else { 4969 state->id_rc_rx_copy_thresh = (uint32_t)result; 4970 } 4971 return (err); 4972 } 4973 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4974 if 
(state->id_mac_state & IBD_DRV_STARTED) { 4975 return (EBUSY); 4976 } 4977 if (pr_val == NULL) { 4978 return (EINVAL); 4979 } 4980 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4981 if (result < IBD_MIN_RC_RX_RWQE_THRESH || 4982 result >= state->id_rc_num_rwqe) { 4983 err = EINVAL; 4984 } else { 4985 state->id_rc_rx_rwqe_thresh = (uint32_t)result; 4986 } 4987 return (err); 4988 } 4989 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4990 if (!state->id_allow_coalesce_comp_tuning) { 4991 return (ENOTSUP); 4992 } 4993 if (pr_val == NULL) { 4994 return (EINVAL); 4995 } 4996 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4997 if (result < 1 || result > state->id_rc_num_swqe) { 4998 err = EINVAL; 4999 } else { 5000 state->id_rc_tx_comp_count = (uint32_t)result; 5001 } 5002 return (err); 5003 } 5004 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5005 if (!state->id_allow_coalesce_comp_tuning) { 5006 return (ENOTSUP); 5007 } 5008 if (pr_val == NULL) { 5009 return (EINVAL); 5010 } 5011 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5012 if (result < 1) 5013 err = EINVAL; 5014 else { 5015 state->id_rc_tx_comp_usec = (uint32_t)result; 5016 } 5017 return (err); 5018 } 5019 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5020 if (state->id_mac_state & IBD_DRV_STARTED) { 5021 return (EBUSY); 5022 } 5023 if (pr_val == NULL) { 5024 return (EINVAL); 5025 } 5026 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5027 if (result < IBD_MIN_RC_TX_COPY_THRESH || 5028 result > state->rc_mtu) { 5029 err = EINVAL; 5030 } else { 5031 state->id_rc_tx_copy_thresh = (uint32_t)result; 5032 } 5033 return (err); 5034 } 5035 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5036 if (state->id_mac_state & IBD_DRV_STARTED) { 5037 return (EBUSY); 5038 } 5039 if (pr_val == NULL) { 5040 return (EINVAL); 5041 } 5042 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5043 if (result < IBD_MIN_UD_NUM_RWQE || 5044 result > IBD_MAX_UD_NUM_RWQE) { 5045 err = EINVAL; 5046 } else { 5047 if (result > state->id_hca_max_chan_sz) { 5048 state->id_ud_num_rwqe = 5049 state->id_hca_max_chan_sz; 5050 } else { 5051 state->id_ud_num_rwqe = (uint32_t)result; 5052 } 5053 if (state->id_allow_coalesce_comp_tuning && 5054 state->id_ud_rx_comp_count > state->id_ud_num_rwqe) 5055 state->id_ud_rx_comp_count = 5056 state->id_ud_num_rwqe; 5057 } 5058 return (err); 5059 } 5060 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5061 if (state->id_mac_state & IBD_DRV_STARTED) { 5062 return (EBUSY); 5063 } 5064 if (pr_val == NULL) { 5065 return (EINVAL); 5066 } 5067 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5068 if (result < IBD_MIN_UD_NUM_SWQE || 5069 result > IBD_MAX_UD_NUM_SWQE) { 5070 err = EINVAL; 5071 } else { 5072 if (result > state->id_hca_max_chan_sz) { 5073 state->id_ud_num_swqe = 5074 state->id_hca_max_chan_sz; 5075 } else { 5076 state->id_ud_num_swqe = (uint32_t)result; 5077 } 5078 if (state->id_allow_coalesce_comp_tuning && 5079 state->id_ud_tx_comp_count > state->id_ud_num_swqe) 5080 state->id_ud_tx_comp_count = 5081 state->id_ud_num_swqe; 5082 } 5083 return (err); 5084 } 5085 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5086 if (!state->id_allow_coalesce_comp_tuning) { 5087 return (ENOTSUP); 5088 } 5089 if (pr_val == NULL) { 5090 return (EINVAL); 5091 } 5092 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5093 if (result < 1 || result > state->id_ud_num_rwqe) { 5094 err = EINVAL; 5095 } else { 5096 state->id_ud_rx_comp_count = (uint32_t)result; 5097 } 5098 return (err); 5099 } 
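	/*
	 * The private-property handlers in this function all follow the same
	 * pattern: fail with EBUSY when the property cannot be changed on a
	 * started instance (or ENOTSUP when coalescing tuning is disabled),
	 * parse the string value with ddi_strtol(), range-check the result,
	 * and only then update the corresponding soft state field. A
	 * hypothetical administrative invocation would look like
	 *	dladm set-linkprop -p _ibd_ud_rx_comp_usec=10 <partition-link>
	 * (the link name here is illustrative only).
	 */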
5100 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5101 if (!state->id_allow_coalesce_comp_tuning) { 5102 return (ENOTSUP); 5103 } 5104 if (pr_val == NULL) { 5105 return (EINVAL); 5106 } 5107 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5108 if (result < 1) { 5109 err = EINVAL; 5110 } else { 5111 state->id_ud_rx_comp_usec = (uint32_t)result; 5112 } 5113 return (err); 5114 } 5115 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5116 if (!state->id_allow_coalesce_comp_tuning) { 5117 return (ENOTSUP); 5118 } 5119 if (pr_val == NULL) { 5120 return (EINVAL); 5121 } 5122 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5123 if (result < 1 || result > state->id_ud_num_swqe) { 5124 err = EINVAL; 5125 } else { 5126 state->id_ud_tx_comp_count = (uint32_t)result; 5127 } 5128 return (err); 5129 } 5130 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5131 if (!state->id_allow_coalesce_comp_tuning) { 5132 return (ENOTSUP); 5133 } 5134 if (pr_val == NULL) { 5135 return (EINVAL); 5136 } 5137 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5138 if (result < 1) { 5139 err = EINVAL; 5140 } else { 5141 state->id_ud_tx_comp_usec = (uint32_t)result; 5142 } 5143 return (err); 5144 } 5145 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5146 if (state->id_mac_state & IBD_DRV_STARTED) { 5147 return (EBUSY); 5148 } 5149 if (pr_val == NULL) { 5150 return (EINVAL); 5151 } 5152 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5153 if (result < IBD_MIN_UD_TX_COPY_THRESH || 5154 result > IBD_MAX_UD_TX_COPY_THRESH) { 5155 err = EINVAL; 5156 } else { 5157 state->id_ud_tx_copy_thresh = (uint32_t)result; 5158 } 5159 return (err); 5160 } 5161 return (ENOTSUP); 5162 } 5163 5164 static int 5165 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, 5166 void *pr_val) 5167 { 5168 int err = ENOTSUP; 5169 int value; 5170 5171 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 5172 value = state->id_bgroup_present; 5173 err = 0; 5174 goto done; 5175 } 5176 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 5177 value = state->id_allow_coalesce_comp_tuning; 5178 err = 0; 5179 goto done; 5180 } 5181 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 5182 value = state->id_create_broadcast_group; 5183 err = 0; 5184 goto done; 5185 } 5186 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 5187 value = state->id_hash_size; 5188 err = 0; 5189 goto done; 5190 } 5191 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 5192 value = state->id_lso_policy; 5193 err = 0; 5194 goto done; 5195 } 5196 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 5197 value = state->id_num_ah; 5198 err = 0; 5199 goto done; 5200 } 5201 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 5202 value = state->id_num_lso_bufs; 5203 err = 0; 5204 goto done; 5205 } 5206 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 5207 value = state->rc_enable_srq; 5208 err = 0; 5209 goto done; 5210 } 5211 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 5212 value = state->id_rc_num_rwqe; 5213 err = 0; 5214 goto done; 5215 } 5216 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 5217 value = state->id_rc_num_srq; 5218 err = 0; 5219 goto done; 5220 } 5221 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 5222 value = state->id_rc_num_swqe; 5223 err = 0; 5224 goto done; 5225 } 5226 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 5227 value = state->id_rc_rx_comp_count; 5228 err = 0; 5229 goto done; 5230 } 5231 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 5232 value = state->id_rc_rx_comp_usec; 5233 err = 0; 5234 
goto done; 5235 } 5236 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 5237 value = state->id_rc_rx_copy_thresh; 5238 err = 0; 5239 goto done; 5240 } 5241 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 5242 value = state->id_rc_rx_rwqe_thresh; 5243 err = 0; 5244 goto done; 5245 } 5246 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 5247 value = state->id_rc_tx_comp_count; 5248 err = 0; 5249 goto done; 5250 } 5251 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5252 value = state->id_rc_tx_comp_usec; 5253 err = 0; 5254 goto done; 5255 } 5256 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5257 value = state->id_rc_tx_copy_thresh; 5258 err = 0; 5259 goto done; 5260 } 5261 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5262 value = state->id_ud_num_rwqe; 5263 err = 0; 5264 goto done; 5265 } 5266 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5267 value = state->id_ud_num_swqe; 5268 err = 0; 5269 goto done; 5270 } 5271 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5272 value = state->id_ud_rx_comp_count; 5273 err = 0; 5274 goto done; 5275 } 5276 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5277 value = state->id_ud_rx_comp_usec; 5278 err = 0; 5279 goto done; 5280 } 5281 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5282 value = state->id_ud_tx_comp_count; 5283 err = 0; 5284 goto done; 5285 } 5286 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5287 value = state->id_ud_tx_comp_usec; 5288 err = 0; 5289 goto done; 5290 } 5291 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5292 value = state->id_ud_tx_copy_thresh; 5293 err = 0; 5294 goto done; 5295 } 5296 done: 5297 if (err == 0) { 5298 (void) snprintf(pr_val, pr_valsize, "%d", value); 5299 } 5300 return (err); 5301 } 5302 5303 static int 5304 ibd_get_port_details(ibd_state_t *state) 5305 { 5306 ibt_hca_portinfo_t *port_infop; 5307 ibt_status_t ret; 5308 uint_t psize, port_infosz; 5309 5310 mutex_enter(&state->id_link_mutex); 5311 5312 /* 5313 * Query for port information 5314 */ 5315 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 5316 &port_infop, &psize, &port_infosz); 5317 if ((ret != IBT_SUCCESS) || (psize != 1)) { 5318 mutex_exit(&state->id_link_mutex); 5319 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 5320 "failed, ret=%d", ret); 5321 return (ENETDOWN); 5322 } 5323 5324 /* 5325 * If the link is active, verify the pkey 5326 */ 5327 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) { 5328 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 5329 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 5330 state->id_link_state = LINK_STATE_DOWN; 5331 } else { 5332 state->id_link_state = LINK_STATE_UP; 5333 } 5334 state->id_mtu = (128 << port_infop->p_mtu); 5335 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5336 state->id_sgid = *port_infop->p_sgid_tbl; 5337 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5338 /* 5339 * Now that the port is active, record the port speed 5340 */ 5341 state->id_link_speed = ibd_get_portspeed(state); 5342 } else { 5343 /* Make sure that these are handled in PORT_UP/CHANGE */ 5344 state->id_mtu = 0; 5345 state->id_link_state = LINK_STATE_DOWN; 5346 state->id_link_speed = 0; 5347 } 5348 mutex_exit(&state->id_link_mutex); 5349 ibt_free_portinfo(port_infop, port_infosz); 5350 5351 return (0); 5352 } 5353 5354 static int 5355 ibd_alloc_cqs(ibd_state_t *state) 5356 { 5357 ibt_hca_attr_t hca_attrs; 5358 ibt_cq_attr_t cq_attr; 5359 ibt_status_t ret; 5360 uint32_t real_size; 5361 uint_t num_rwqe_change = 0; 5362 uint_t 
num_swqe_change = 0; 5363 5364 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 5365 ASSERT(ret == IBT_SUCCESS); 5366 5367 /* 5368 * Allocate Rx/combined CQ: 5369 * Theoretically, there is no point in having more than #rwqe 5370 * plus #swqe cqe's, except that the CQ will be signaled for 5371 * overflow when the last wqe completes, if none of the previous 5372 * cqe's have been polled. Thus, we allocate just a few less wqe's 5373 * to make sure such overflow does not occur. 5374 */ 5375 cq_attr.cq_sched = NULL; 5376 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 5377 5378 /* 5379 * Allocate Receive CQ. 5380 */ 5381 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) { 5382 cq_attr.cq_size = state->id_ud_num_rwqe + 1; 5383 } else { 5384 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5385 num_rwqe_change = state->id_ud_num_rwqe; 5386 state->id_ud_num_rwqe = cq_attr.cq_size - 1; 5387 } 5388 5389 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5390 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 5391 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 5392 "failed, ret=%d\n", ret); 5393 return (DDI_FAILURE); 5394 } 5395 5396 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count, 5397 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) { 5398 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 5399 "moderation failed, ret=%d\n", ret); 5400 } 5401 5402 /* make the #rx wc's the same as max rx chain size */ 5403 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 5404 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 5405 state->id_rxwcs_size, KM_SLEEP); 5406 5407 /* 5408 * Allocate Send CQ. 5409 */ 5410 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) { 5411 cq_attr.cq_size = state->id_ud_num_swqe + 1; 5412 } else { 5413 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5414 num_swqe_change = state->id_ud_num_swqe; 5415 state->id_ud_num_swqe = cq_attr.cq_size - 1; 5416 } 5417 5418 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5419 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 5420 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 5421 "failed, ret=%d\n", ret); 5422 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 5423 state->id_rxwcs_size); 5424 (void) ibt_free_cq(state->id_rcq_hdl); 5425 return (DDI_FAILURE); 5426 } 5427 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count, 5428 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) { 5429 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 5430 "moderation failed, ret=%d\n", ret); 5431 } 5432 5433 state->id_txwcs_size = IBD_TX_POLL_THRESH; 5434 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 5435 state->id_txwcs_size, KM_SLEEP); 5436 5437 /* 5438 * Print message in case we could not allocate as many wqe's 5439 * as was requested. 
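 * (num_rwqe_change and num_swqe_change hold the originally requested
 * counts; they are non-zero only when hca_max_cq_sz forced the counts
 * down above.)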
5440 */ 5441 if (num_rwqe_change) { 5442 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 5443 "%d", state->id_ud_num_rwqe, num_rwqe_change); 5444 } 5445 if (num_swqe_change) { 5446 ibd_print_warn(state, "Setting #swqe = %d instead of default " 5447 "%d", state->id_ud_num_swqe, num_swqe_change); 5448 } 5449 5450 return (DDI_SUCCESS); 5451 } 5452 5453 static int 5454 ibd_setup_ud_channel(ibd_state_t *state) 5455 { 5456 ibt_ud_chan_alloc_args_t ud_alloc_attr; 5457 ibt_ud_chan_query_attr_t ud_chan_attr; 5458 ibt_status_t ret; 5459 5460 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 5461 if (state->id_hca_res_lkey_capab) 5462 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 5463 if (state->id_lso_policy && state->id_lso_capable) 5464 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 5465 5466 ud_alloc_attr.ud_hca_port_num = state->id_port; 5467 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 5468 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 5469 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe; 5470 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe; 5471 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 5472 ud_alloc_attr.ud_scq = state->id_scq_hdl; 5473 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 5474 ud_alloc_attr.ud_pd = state->id_pd_hdl; 5475 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 5476 ud_alloc_attr.ud_clone_chan = NULL; 5477 5478 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 5479 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 5480 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 5481 "failed, ret=%d\n", ret); 5482 return (DDI_FAILURE); 5483 } 5484 5485 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 5486 &ud_chan_attr)) != IBT_SUCCESS) { 5487 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 5488 "failed, ret=%d\n", ret); 5489 (void) ibt_free_channel(state->id_chnl_hdl); 5490 return (DDI_FAILURE); 5491 } 5492 5493 state->id_qpnum = ud_chan_attr.ud_qpn; 5494 5495 return (DDI_SUCCESS); 5496 } 5497 5498 static int 5499 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 5500 { 5501 uint32_t progress = state->id_mac_state; 5502 uint_t attempts; 5503 ibt_status_t ret; 5504 ib_gid_t mgid; 5505 ibd_mce_t *mce; 5506 uint8_t jstate; 5507 5508 if (atomic_dec_32_nv(&state->id_running) != 0) 5509 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 5510 5511 /* 5512 * Before we try to stop/undo whatever we did in ibd_start(), 5513 * we need to mark the link state appropriately to prevent the 5514 * ip layer from using this instance for any new transfers. Note 5515 * that if the original state of the link was "up" when we're 5516 * here, we'll set the final link state to "unknown", to behave 5517 * in the same fashion as other ethernet drivers. 
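 * The MAC address copy in id_macaddr is also zeroed and the new link
 * state is pushed up via mac_link_update() before any teardown begins.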
5518 */ 5519 mutex_enter(&state->id_link_mutex); 5520 if (cur_link_state == LINK_STATE_DOWN) { 5521 state->id_link_state = cur_link_state; 5522 } else { 5523 state->id_link_state = LINK_STATE_UNKNOWN; 5524 } 5525 mutex_exit(&state->id_link_mutex); 5526 bzero(&state->id_macaddr, sizeof (ipoib_mac_t)); 5527 mac_link_update(state->id_mh, state->id_link_state); 5528 5529 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 5530 if (progress & IBD_DRV_STARTED) { 5531 state->id_mac_state &= (~IBD_DRV_STARTED); 5532 } 5533 5534 if (progress & IBD_DRV_IN_LATE_HCA_INIT) { 5535 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT); 5536 } 5537 5538 /* Stop listen under Reliable Connected Mode */ 5539 if (progress & IBD_DRV_RC_LISTEN) { 5540 ASSERT(state->id_enable_rc); 5541 if (state->rc_listen_hdl != NULL) { 5542 ibd_rc_stop_listen(state); 5543 } 5544 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 5545 } 5546 5547 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 5548 (void) ibd_rc_close_all_chan(state); 5549 } 5550 5551 /* 5552 * First, stop receive interrupts; this stops the driver from 5553 * handing up buffers to higher layers. Wait for receive buffers 5554 * to be returned and give up after 1 second. 5555 */ 5556 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 5557 attempts = 10; 5558 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 5559 0) > 0) { 5560 delay(drv_usectohz(100000)); 5561 if (--attempts == 0) { 5562 /* 5563 * There are pending bufs with the network 5564 * layer and we have no choice but to wait 5565 * for them to be done with. Reap all the 5566 * Tx/Rx completions that were posted since 5567 * we turned off the notification and 5568 * return failure. 5569 */ 5570 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 5571 DPRINT(2, "ibd_undo_start: " 5572 "reclaiming failed"); 5573 break; 5574 } 5575 } 5576 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 5577 } 5578 5579 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 5580 ibd_rc_fini_tx_largebuf_list(state); 5581 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 5582 } 5583 5584 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 5585 ASSERT(state->id_enable_rc); 5586 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 5587 ibd_rc_fini_srq_list(state); 5588 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 5589 } else { 5590 cmn_err(CE_CONT, "ibd_undo_start: srq bufs " 5591 "outstanding\n"); 5592 } 5593 } 5594 5595 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 5596 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 5597 5598 mutex_enter(&state->id_trap_lock); 5599 state->id_trap_stop = B_TRUE; 5600 while (state->id_trap_inprog > 0) 5601 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 5602 mutex_exit(&state->id_trap_lock); 5603 5604 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 5605 } 5606 5607 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 5608 /* 5609 * Flushing the channel ensures that all pending WQE's 5610 * are marked with flush_error and handed to the CQ. It 5611 * does not guarantee the invocation of the CQ handler. 5612 * This call is guaranteed to return successfully for 5613 * UD QPNs. 5614 */ 5615 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 5616 IBT_SUCCESS) { 5617 DPRINT(10, "ibd_undo_start: flush_channel " 5618 "failed, ret=%d", ret); 5619 } 5620 5621 /* 5622 * Give some time for the TX CQ handler to process the 5623 * completions. 
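 *
 * The wait below is a bounded retry loop: check the condition under
 * the relevant locks and, if it has not been met, drop the locks,
 * sleep briefly, re-acquire and re-check, giving up after a fixed
 * number of attempts. A simplified sketch of the idiom (illustrative
 * only; "done()" is a placeholder for the condition being waited on):
 *
 *	attempts = 10;
 *	mutex_enter(&lock);
 *	while (!done()) {
 *		if (--attempts == 0)
 *			break;			// give up, warn later
 *		mutex_exit(&lock);		// never sleep holding locks
 *		delay(drv_usectohz(100000));	// 100 ms
 *		mutex_enter(&lock);
 *	}
 *	mutex_exit(&lock);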
5624 */ 5625 mutex_enter(&state->id_tx_list.dl_mutex); 5626 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5627 attempts = 10; 5628 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 5629 != state->id_ud_num_swqe) { 5630 if (--attempts == 0) 5631 break; 5632 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5633 mutex_exit(&state->id_tx_list.dl_mutex); 5634 delay(drv_usectohz(100000)); 5635 mutex_enter(&state->id_tx_list.dl_mutex); 5636 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5637 } 5638 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5639 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 5640 state->id_ud_num_swqe) { 5641 cmn_err(CE_WARN, "tx resources not freed\n"); 5642 } 5643 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5644 mutex_exit(&state->id_tx_list.dl_mutex); 5645 5646 attempts = 10; 5647 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5648 if (--attempts == 0) 5649 break; 5650 delay(drv_usectohz(100000)); 5651 } 5652 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 5653 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5654 cmn_err(CE_WARN, "rx resources not freed\n"); 5655 } 5656 5657 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 5658 } 5659 5660 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 5661 /* 5662 * Drop all residual full/non membership. This includes full 5663 * membership to the broadcast group, and any nonmembership 5664 * acquired during transmits. We do this after the Tx completion 5665 * handlers are done, since those might result in some late 5666 * leaves; this also eliminates a potential race with that 5667 * path wrt the mc full list insert/delete. Trap handling 5668 * has also been suppressed at this point. Thus, no locks 5669 * are required while traversing the mc full list. 
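 *
 * Because ibd_leave_group() may unlink and free the mce it is handed,
 * the walk below copies out the fields it needs and advances to the
 * next element before making the call. A minimal sketch of that
 * idiom (illustrative only; "destroy()" is a placeholder):
 *
 *	elem = list_head(&lst);
 *	while (elem != NULL) {
 *		next = list_next(&lst, elem);	// grab next first
 *		destroy(elem);			// may free elem
 *		elem = next;
 *	}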
5670 */ 5671 DPRINT(2, "ibd_undo_start: clear full cache entries"); 5672 mce = list_head(&state->id_mc_full); 5673 while (mce != NULL) { 5674 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5675 jstate = mce->mc_jstate; 5676 mce = list_next(&state->id_mc_full, mce); 5677 ibd_leave_group(state, mgid, jstate); 5678 } 5679 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 5680 } 5681 5682 if (progress & IBD_DRV_RXLIST_ALLOCD) { 5683 ibd_fini_rxlist(state); 5684 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 5685 } 5686 5687 if (progress & IBD_DRV_TXLIST_ALLOCD) { 5688 ibd_fini_txlist(state); 5689 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 5690 } 5691 5692 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 5693 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 5694 IBT_SUCCESS) { 5695 DPRINT(10, "ibd_undo_start: free_channel " 5696 "failed, ret=%d", ret); 5697 } 5698 5699 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 5700 } 5701 5702 if (progress & IBD_DRV_CQS_ALLOCD) { 5703 kmem_free(state->id_txwcs, 5704 sizeof (ibt_wc_t) * state->id_txwcs_size); 5705 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 5706 IBT_SUCCESS) { 5707 DPRINT(10, "ibd_undo_start: free_cq(scq) " 5708 "failed, ret=%d", ret); 5709 } 5710 5711 kmem_free(state->id_rxwcs, 5712 sizeof (ibt_wc_t) * state->id_rxwcs_size); 5713 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 5714 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 5715 "ret=%d", ret); 5716 } 5717 5718 state->id_txwcs = NULL; 5719 state->id_rxwcs = NULL; 5720 state->id_scq_hdl = NULL; 5721 state->id_rcq_hdl = NULL; 5722 5723 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 5724 } 5725 5726 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 5727 mutex_enter(&state->id_ac_mutex); 5728 mod_hash_destroy_hash(state->id_ah_active_hash); 5729 mutex_exit(&state->id_ac_mutex); 5730 ibd_acache_fini(state); 5731 5732 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 5733 } 5734 5735 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 5736 /* 5737 * If we'd created the ipoib broadcast group and had 5738 * successfully joined it, leave it now 5739 */ 5740 if (state->id_bgroup_created) { 5741 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 5742 jstate = IB_MC_JSTATE_FULL; 5743 (void) ibt_leave_mcg(state->id_sgid, mgid, 5744 state->id_sgid, jstate); 5745 } 5746 ibt_free_mcg_info(state->id_mcinfo, 1); 5747 5748 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 5749 } 5750 5751 return (DDI_SUCCESS); 5752 } 5753 5754 /* 5755 * These pair of routines are used to set/clear the condition that 5756 * the caller is likely to do something to change the id_mac_state. 5757 * If there's already someone doing either a start or a stop (possibly 5758 * due to the async handler detecting a pkey relocation event, a plumb 5759 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 5760 * that's done. 5761 */ 5762 static void 5763 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 5764 { 5765 mutex_enter(&state->id_macst_lock); 5766 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 5767 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 5768 5769 state->id_mac_state |= flag; 5770 mutex_exit(&state->id_macst_lock); 5771 } 5772 5773 static void 5774 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 5775 { 5776 mutex_enter(&state->id_macst_lock); 5777 state->id_mac_state &= (~flag); 5778 cv_signal(&state->id_macst_cv); 5779 mutex_exit(&state->id_macst_lock); 5780 } 5781 5782 /* 5783 * GLDv3 entry point to start hardware. 
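 *
 * Both this entry point and ibd_m_stop() below bracket their work
 * with the ibd_set_mac_progress()/ibd_clr_mac_progress() pair above,
 * so that a start or stop already in progress (see the comment above
 * those routines) is allowed to finish first. The calling shape,
 * roughly:
 *
 *	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
 *	... start or stop work ...
 *	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);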
5784 */ 5785 /*ARGSUSED*/ 5786 static int 5787 ibd_m_start(void *arg) 5788 { 5789 ibd_state_t *state = arg; 5790 int ret; 5791 5792 if (state->id_type == IBD_PORT_DRIVER) 5793 return (EINVAL); 5794 5795 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5796 if (state->id_mac_state & IBD_DRV_IN_DELETION) { 5797 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5798 return (EIO); 5799 } 5800 5801 ret = ibd_start(state); 5802 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5803 return (ret); 5804 } 5805 5806 static int 5807 ibd_start(ibd_state_t *state) 5808 { 5809 int err; 5810 ibt_status_t ret; 5811 int late_hca_init = 0; 5812 5813 if (state->id_mac_state & IBD_DRV_STARTED) 5814 return (DDI_SUCCESS); 5815 5816 /* 5817 * We do not increment the running flag when calling ibd_start() as 5818 * a result of some event which moves the state away from late HCA 5819 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability. 5820 */ 5821 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 5822 (atomic_inc_32_nv(&state->id_running) != 1)) { 5823 DPRINT(10, "ibd_start: id_running is non-zero"); 5824 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 5825 atomic_dec_32(&state->id_running); 5826 return (EINVAL); 5827 } 5828 5829 /* 5830 * Get port details; if we fail here, something bad happened. 5831 * Fail plumb. 5832 */ 5833 if ((err = ibd_get_port_details(state)) != 0) { 5834 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 5835 goto start_fail; 5836 } 5837 /* 5838 * If state->id_link_state is DOWN, it indicates that either the port 5839 * is down, or the pkey is not available. In both cases, resort to late 5840 * initialization. Register for subnet notices, and return success. 5841 */ 5842 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 5843 if (state->id_link_state == LINK_STATE_DOWN) { 5844 late_hca_init = 1; 5845 goto late_hca_init_return; 5846 } 5847 5848 /* 5849 * Find the IPoIB broadcast group 5850 */ 5851 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 5852 /* Resort to late initialization */ 5853 late_hca_init = 1; 5854 goto reg_snet_notices; 5855 } 5856 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 5857 5858 /* 5859 * Initialize per-interface caches and lists; if we fail here, 5860 * it is most likely due to a lack of resources 5861 */ 5862 if (ibd_acache_init(state) != DDI_SUCCESS) { 5863 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 5864 err = ENOMEM; 5865 goto start_fail; 5866 } 5867 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 5868 5869 /* 5870 * Allocate send and receive completion queues 5871 */ 5872 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 5873 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 5874 err = ENOMEM; 5875 goto start_fail; 5876 } 5877 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 5878 5879 /* 5880 * Setup a UD channel 5881 */ 5882 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 5883 err = ENOMEM; 5884 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 5885 goto start_fail; 5886 } 5887 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 5888 5889 /* 5890 * Allocate and initialize the tx buffer list 5891 */ 5892 if (ibd_init_txlist(state) != DDI_SUCCESS) { 5893 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 5894 err = ENOMEM; 5895 goto start_fail; 5896 } 5897 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 5898 5899 /* 5900 * Create the send cq handler here 5901 */ 5902 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 5903 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 5904 
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5905 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 5906 "failed, ret=%d", ret); 5907 err = EINVAL; 5908 goto start_fail; 5909 } 5910 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 5911 5912 /* 5913 * Allocate and initialize the rx buffer list 5914 */ 5915 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 5916 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 5917 err = ENOMEM; 5918 goto start_fail; 5919 } 5920 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 5921 5922 /* 5923 * Join IPoIB broadcast group 5924 */ 5925 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 5926 DPRINT(10, "ibd_start: ibd_join_group() failed"); 5927 err = ENOTACTIVE; 5928 goto start_fail; 5929 } 5930 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 5931 5932 /* 5933 * When we did mac_register() in ibd_attach(), we didn't register 5934 * the real macaddr and we didn't have the true port mtu. Now that 5935 * we're almost ready, set the local mac address and broadcast 5936 * addresses and update gldv3 about the real values of these 5937 * parameters. 5938 */ 5939 if (state->id_enable_rc) { 5940 ibd_h2n_mac(&state->id_macaddr, 5941 IBD_MAC_ADDR_RC + state->id_qpnum, 5942 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5943 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, 5944 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5945 } else { 5946 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 5947 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5948 } 5949 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 5950 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 5951 5952 if (!state->id_enable_rc) { 5953 (void) mac_maxsdu_update(state->id_mh, state->id_mtu 5954 - IPOIB_HDRSIZE); 5955 } 5956 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 5957 5958 /* 5959 * Setup the receive cq handler 5960 */ 5961 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 5962 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 5963 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5964 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 5965 "failed, ret=%d", ret); 5966 err = EINVAL; 5967 goto start_fail; 5968 } 5969 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 5970 5971 reg_snet_notices: 5972 /* 5973 * In case of normal initialization sequence, 5974 * Setup the subnet notices handler after we've initialized the acache/ 5975 * mcache and started the async thread, both of which are required for 5976 * the trap handler to function properly. 5977 * 5978 * Now that the async thread has been started (and we've already done 5979 * a mac_register() during attach so mac_tx_update() can be called 5980 * if necessary without any problem), we can enable the trap handler 5981 * to queue requests to the async thread. 5982 * 5983 * In case of late hca initialization, the subnet notices handler will 5984 * only handle MCG created/deleted event. The action performed as part 5985 * of handling these events is to start the interface. So, the 5986 * acache/mcache initialization is not a necessity in such cases for 5987 * registering the subnet notices handler. Also, if we are in 5988 * ibd_start() as a result of, say, some event handling after entering 5989 * late hca initialization phase no need to register again. 
5990 */ 5991 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) { 5992 ibt_register_subnet_notices(state->id_ibt_hdl, 5993 ibd_snet_notices_handler, state); 5994 mutex_enter(&state->id_trap_lock); 5995 state->id_trap_stop = B_FALSE; 5996 mutex_exit(&state->id_trap_lock); 5997 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 5998 } 5999 6000 late_hca_init_return: 6001 if (late_hca_init == 1) { 6002 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT; 6003 /* 6004 * In case of late initialization, mark the link state as down, 6005 * immaterial of the actual link state as reported in the 6006 * port_info. 6007 */ 6008 state->id_link_state = LINK_STATE_DOWN; 6009 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 6010 mac_link_update(state->id_mh, state->id_link_state); 6011 return (DDI_SUCCESS); 6012 } 6013 6014 if (state->id_enable_rc) { 6015 if (state->rc_enable_srq) { 6016 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 6017 if (ibd_rc_repost_srq_free_list(state) != 6018 IBT_SUCCESS) { 6019 err = ENOMEM; 6020 goto start_fail; 6021 } 6022 } else { 6023 /* Allocate SRQ resource */ 6024 if (ibd_rc_init_srq_list(state) != 6025 IBT_SUCCESS) { 6026 err = ENOMEM; 6027 goto start_fail; 6028 } 6029 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 6030 } 6031 } 6032 6033 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 6034 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 6035 "failed"); 6036 err = ENOMEM; 6037 goto start_fail; 6038 } 6039 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 6040 6041 /* RC: begin to listen only after everything is available */ 6042 if (ibd_rc_listen(state) != IBT_SUCCESS) { 6043 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 6044 err = EINVAL; 6045 goto start_fail; 6046 } 6047 state->id_mac_state |= IBD_DRV_RC_LISTEN; 6048 } 6049 6050 /* 6051 * Indicate link status to GLDv3 and higher layers. By default, 6052 * we assume we are in up state (which must have been true at 6053 * least at the time the broadcast mcg's were probed); if there 6054 * were any up/down transitions till the time we come here, the 6055 * async handler will have updated last known state, which we 6056 * use to tell GLDv3. The async handler will not send any 6057 * notifications to GLDv3 till we reach here in the initialization 6058 * sequence. 6059 */ 6060 mac_link_update(state->id_mh, state->id_link_state); 6061 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT; 6062 state->id_mac_state |= IBD_DRV_STARTED; 6063 6064 return (DDI_SUCCESS); 6065 6066 start_fail: 6067 /* 6068 * If we ran into a problem during ibd_start() and ran into 6069 * some other problem during undoing our partial work, we can't 6070 * do anything about it. Ignore any errors we might get from 6071 * ibd_undo_start() and just return the original error we got. 6072 */ 6073 (void) ibd_undo_start(state, LINK_STATE_DOWN); 6074 return (err); 6075 } 6076 6077 /* 6078 * GLDv3 entry point to stop hardware from receiving packets. 6079 */ 6080 /*ARGSUSED*/ 6081 static void 6082 ibd_m_stop(void *arg) 6083 { 6084 ibd_state_t *state = (ibd_state_t *)arg; 6085 6086 if (state->id_type == IBD_PORT_DRIVER) 6087 return; 6088 6089 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 6090 6091 (void) ibd_undo_start(state, state->id_link_state); 6092 6093 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 6094 } 6095 6096 /* 6097 * GLDv3 entry point to modify device's mac address. We do not 6098 * allow address modifications. 
6099 */ 6100 static int 6101 ibd_m_unicst(void *arg, const uint8_t *macaddr) 6102 { 6103 ibd_state_t *state = arg; 6104 6105 if (state->id_type == IBD_PORT_DRIVER) 6106 return (EINVAL); 6107 6108 /* 6109 * Don't bother even comparing the macaddr if we haven't 6110 * completed ibd_m_start(). 6111 */ 6112 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6113 return (0); 6114 6115 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 6116 return (0); 6117 else 6118 return (EINVAL); 6119 } 6120 6121 /* 6122 * The blocking part of the IBA join/leave operations are done out 6123 * of here on the async thread. 6124 */ 6125 static void 6126 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 6127 { 6128 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 6129 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 6130 6131 if (op == IBD_ASYNC_JOIN) { 6132 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 6133 ibd_print_warn(state, "Join multicast group failed :" 6134 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 6135 } 6136 } else { 6137 /* 6138 * Here, we must search for the proper mcg_info and 6139 * use that to leave the group. 6140 */ 6141 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 6142 } 6143 } 6144 6145 /* 6146 * GLDv3 entry point for multicast enable/disable requests. 6147 * This function queues the operation to the async thread and 6148 * return success for a valid multicast address. 6149 */ 6150 static int 6151 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 6152 { 6153 ibd_state_t *state = (ibd_state_t *)arg; 6154 ipoib_mac_t maddr, *mcast; 6155 ib_gid_t mgid; 6156 ibd_req_t *req; 6157 6158 if (state->id_type == IBD_PORT_DRIVER) 6159 return (EINVAL); 6160 6161 /* 6162 * If we haven't completed ibd_m_start(), async thread wouldn't 6163 * have been started and id_bcaddr wouldn't be set, so there's 6164 * no point in continuing. 6165 */ 6166 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6167 return (0); 6168 6169 /* 6170 * The incoming multicast address might not be aligned properly 6171 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 6172 * it to look like one though, to get the offsets of the mc gid, 6173 * since we know we are not going to dereference any values with 6174 * the ipoib_mac_t pointer. 6175 */ 6176 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 6177 mcast = &maddr; 6178 6179 /* 6180 * Check validity of MCG address. We could additionally check 6181 * that a enable/disable is not being issued on the "broadcast" 6182 * mcg, but since this operation is only invokable by privileged 6183 * programs anyway, we allow the flexibility to those dlpi apps. 6184 * Note that we do not validate the "scope" of the IBA mcg. 6185 */ 6186 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 6187 return (EINVAL); 6188 6189 /* 6190 * fill in multicast pkey and scope 6191 */ 6192 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 6193 6194 /* 6195 * If someone is trying to JOIN/LEAVE the broadcast group, we do 6196 * nothing (i.e. we stay JOINed to the broadcast group done in 6197 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 6198 * requires to be joined to broadcast groups at all times. 6199 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 6200 * depends on this. 
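 *
 * In practice this means a join or leave request for id_bcaddr is
 * acknowledged with success but otherwise ignored; the full
 * membership taken in ibd_m_start() is only dropped when
 * ibd_undo_start() tears the interface down.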
6201 */ 6202 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6203 return (0); 6204 6205 ibd_n2h_gid(mcast, &mgid); 6206 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6207 if (req == NULL) 6208 return (ENOMEM); 6209 6210 req->rq_gid = mgid; 6211 6212 if (add) { 6213 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 6214 mgid.gid_prefix, mgid.gid_guid); 6215 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 6216 } else { 6217 DPRINT(1, "ibd_m_multicst : unset_multicast : " 6218 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 6219 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 6220 } 6221 return (0); 6222 } 6223 6224 /* 6225 * The blocking part of the IBA promiscuous operations are done 6226 * out of here on the async thread. The dlpireq parameter indicates 6227 * whether this invocation is due to a dlpi request or due to 6228 * a port up/down event. 6229 */ 6230 static void 6231 ibd_async_unsetprom(ibd_state_t *state) 6232 { 6233 ibd_mce_t *mce = list_head(&state->id_mc_non); 6234 ib_gid_t mgid; 6235 6236 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 6237 6238 while (mce != NULL) { 6239 mgid = mce->mc_info.mc_adds_vect.av_dgid; 6240 mce = list_next(&state->id_mc_non, mce); 6241 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 6242 } 6243 state->id_prom_op = IBD_OP_NOTSTARTED; 6244 } 6245 6246 /* 6247 * The blocking part of the IBA promiscuous operations are done 6248 * out of here on the async thread. The dlpireq parameter indicates 6249 * whether this invocation is due to a dlpi request or due to 6250 * a port up/down event. 6251 */ 6252 static void 6253 ibd_async_setprom(ibd_state_t *state) 6254 { 6255 ibt_mcg_attr_t mcg_attr; 6256 ibt_mcg_info_t *mcg_info; 6257 ib_gid_t mgid; 6258 uint_t numg; 6259 int i; 6260 char ret = IBD_OP_COMPLETED; 6261 6262 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 6263 6264 /* 6265 * Obtain all active MC groups on the IB fabric with 6266 * specified criteria (scope + Pkey + Qkey + mtu). 6267 */ 6268 bzero(&mcg_attr, sizeof (mcg_attr)); 6269 mcg_attr.mc_pkey = state->id_pkey; 6270 mcg_attr.mc_scope = state->id_scope; 6271 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 6272 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 6273 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 6274 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 6275 IBT_SUCCESS) { 6276 ibd_print_warn(state, "Could not get list of IBA multicast " 6277 "groups"); 6278 ret = IBD_OP_ERRORED; 6279 goto done; 6280 } 6281 6282 /* 6283 * Iterate over the returned mcg's and join as NonMember 6284 * to the IP mcg's. 6285 */ 6286 for (i = 0; i < numg; i++) { 6287 /* 6288 * Do a NonMember JOIN on the MC group. 6289 */ 6290 mgid = mcg_info[i].mc_adds_vect.av_dgid; 6291 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 6292 ibd_print_warn(state, "IBA promiscuous mode missed " 6293 "multicast gid %016llx:%016llx", 6294 (u_longlong_t)mgid.gid_prefix, 6295 (u_longlong_t)mgid.gid_guid); 6296 } 6297 6298 ibt_free_mcg_info(mcg_info, numg); 6299 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 6300 done: 6301 state->id_prom_op = ret; 6302 } 6303 6304 /* 6305 * GLDv3 entry point for multicast promiscuous enable/disable requests. 6306 * GLDv3 assumes phys state receives more packets than multi state, 6307 * which is not true for IPoIB. Thus, treat the multi and phys 6308 * promiscuous states the same way to work with GLDv3's assumption. 
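 *
 * "Promiscuous" on IPoIB therefore means joining every active IBA
 * multicast group that matches this partition as a NonMember, which
 * is what ibd_async_setprom() above does. The core of that loop,
 * condensed (illustrative only; mtu selection and error handling
 * elided):
 *
 *	bzero(&mcg_attr, sizeof (mcg_attr));
 *	mcg_attr.mc_pkey = state->id_pkey;
 *	mcg_attr.mc_scope = state->id_scope;
 *	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
 *	(void) ibt_query_mcg(state->id_sgid, &mcg_attr, 0,
 *	    &mcg_info, &numg);
 *	for (i = 0; i < numg; i++)
 *		(void) ibd_join_group(state,
 *		    mcg_info[i].mc_adds_vect.av_dgid, IB_MC_JSTATE_NON);
 *	ibt_free_mcg_info(mcg_info, numg);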
6309 */ 6310 static int 6311 ibd_m_promisc(void *arg, boolean_t on) 6312 { 6313 ibd_state_t *state = (ibd_state_t *)arg; 6314 ibd_req_t *req; 6315 6316 if (state->id_type == IBD_PORT_DRIVER) 6317 return (EINVAL); 6318 6319 /* 6320 * Async thread wouldn't have been started if we haven't 6321 * passed ibd_m_start() 6322 */ 6323 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6324 return (0); 6325 6326 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6327 if (req == NULL) 6328 return (ENOMEM); 6329 if (on) { 6330 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 6331 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 6332 } else { 6333 DPRINT(1, "ibd_m_promisc : unset_promisc"); 6334 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 6335 } 6336 6337 return (0); 6338 } 6339 6340 /* 6341 * GLDv3 entry point for gathering statistics. 6342 */ 6343 static int 6344 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 6345 { 6346 ibd_state_t *state = (ibd_state_t *)arg; 6347 6348 switch (stat) { 6349 case MAC_STAT_IFSPEED: 6350 *val = state->id_link_speed; 6351 break; 6352 case MAC_STAT_MULTIRCV: 6353 *val = state->id_multi_rcv; 6354 break; 6355 case MAC_STAT_BRDCSTRCV: 6356 *val = state->id_brd_rcv; 6357 break; 6358 case MAC_STAT_MULTIXMT: 6359 *val = state->id_multi_xmt; 6360 break; 6361 case MAC_STAT_BRDCSTXMT: 6362 *val = state->id_brd_xmt; 6363 break; 6364 case MAC_STAT_RBYTES: 6365 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 6366 + state->rc_rcv_copy_byte; 6367 break; 6368 case MAC_STAT_IPACKETS: 6369 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 6370 + state->rc_rcv_copy_pkt; 6371 break; 6372 case MAC_STAT_OBYTES: 6373 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 6374 break; 6375 case MAC_STAT_OPACKETS: 6376 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 6377 state->rc_xmt_fragmented_pkt + 6378 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 6379 break; 6380 case MAC_STAT_OERRORS: 6381 *val = state->id_ah_error; /* failed AH translation */ 6382 break; 6383 case MAC_STAT_IERRORS: 6384 *val = 0; 6385 break; 6386 case MAC_STAT_NOXMTBUF: 6387 *val = state->id_tx_short + state->rc_swqe_short + 6388 state->rc_xmt_buf_short; 6389 break; 6390 case MAC_STAT_NORCVBUF: 6391 default: 6392 return (ENOTSUP); 6393 } 6394 6395 return (0); 6396 } 6397 6398 static void 6399 ibd_async_txsched(ibd_state_t *state) 6400 { 6401 ibd_resume_transmission(state); 6402 } 6403 6404 static void 6405 ibd_resume_transmission(ibd_state_t *state) 6406 { 6407 int flag; 6408 int met_thresh = 0; 6409 int thresh = 0; 6410 int ret = -1; 6411 6412 mutex_enter(&state->id_sched_lock); 6413 if (state->id_sched_needed & IBD_RSRC_SWQE) { 6414 mutex_enter(&state->id_tx_list.dl_mutex); 6415 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6416 met_thresh = state->id_tx_list.dl_cnt + 6417 state->id_tx_rel_list.dl_cnt; 6418 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6419 mutex_exit(&state->id_tx_list.dl_mutex); 6420 thresh = IBD_FREE_SWQES_THRESH; 6421 flag = IBD_RSRC_SWQE; 6422 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 6423 ASSERT(state->id_lso != NULL); 6424 mutex_enter(&state->id_lso_lock); 6425 met_thresh = state->id_lso->bkt_nfree; 6426 thresh = IBD_FREE_LSOS_THRESH; 6427 mutex_exit(&state->id_lso_lock); 6428 flag = IBD_RSRC_LSOBUF; 6429 if (met_thresh > thresh) 6430 state->id_sched_lso_cnt++; 6431 } 6432 if (met_thresh > thresh) { 6433 state->id_sched_needed &= ~flag; 6434 state->id_sched_cnt++; 6435 ret = 0; 6436 } 6437 mutex_exit(&state->id_sched_lock); 6438 6439 if (ret == 0) 
6440 mac_tx_update(state->id_mh); 6441 } 6442 6443 /* 6444 * Release the send wqe back into free list. 6445 */ 6446 static void 6447 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 6448 { 6449 /* 6450 * Add back on Tx list for reuse. 6451 */ 6452 ASSERT(tail->swqe_next == NULL); 6453 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6454 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 6455 tail->swqe_next = state->id_tx_rel_list.dl_head; 6456 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 6457 state->id_tx_rel_list.dl_cnt += n; 6458 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6459 } 6460 6461 /* 6462 * Acquire a send wqe from free list. 6463 * Returns error number and send wqe pointer. 6464 */ 6465 static ibd_swqe_t * 6466 ibd_acquire_swqe(ibd_state_t *state) 6467 { 6468 ibd_swqe_t *wqe; 6469 6470 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6471 if (state->id_tx_rel_list.dl_head != NULL) { 6472 /* transfer id_tx_rel_list to id_tx_list */ 6473 state->id_tx_list.dl_head = 6474 state->id_tx_rel_list.dl_head; 6475 state->id_tx_list.dl_cnt = 6476 state->id_tx_rel_list.dl_cnt; 6477 state->id_tx_list.dl_pending_sends = B_FALSE; 6478 6479 /* clear id_tx_rel_list */ 6480 state->id_tx_rel_list.dl_head = NULL; 6481 state->id_tx_rel_list.dl_cnt = 0; 6482 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6483 6484 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 6485 state->id_tx_list.dl_cnt -= 1; 6486 state->id_tx_list.dl_head = wqe->swqe_next; 6487 } else { /* no free swqe */ 6488 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6489 state->id_tx_list.dl_pending_sends = B_TRUE; 6490 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 6491 state->id_tx_short++; 6492 wqe = NULL; 6493 } 6494 return (wqe); 6495 } 6496 6497 static int 6498 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 6499 ibt_ud_dest_hdl_t ud_dest) 6500 { 6501 mblk_t *nmp; 6502 int iph_len, tcph_len; 6503 ibt_wr_lso_t *lso; 6504 uintptr_t ip_start, tcp_start; 6505 uint8_t *dst; 6506 uint_t pending, mblen; 6507 6508 /* 6509 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 6510 * we need to adjust it here for lso. 6511 */ 6512 lso = &(node->w_swr.wr.ud_lso); 6513 lso->lso_ud_dest = ud_dest; 6514 lso->lso_mss = mss; 6515 6516 /* 6517 * Calculate the LSO header size and set it in the UD LSO structure. 6518 * Note that the only assumption we make is that each of the IPoIB, 6519 * IP and TCP headers will be contained in a single mblk fragment; 6520 * together, the headers may span multiple mblk fragments. 6521 */ 6522 nmp = mp; 6523 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 6524 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 6525 ip_start = (uintptr_t)nmp->b_cont->b_rptr 6526 + (ip_start - (uintptr_t)(nmp->b_wptr)); 6527 nmp = nmp->b_cont; 6528 6529 } 6530 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 6531 6532 tcp_start = ip_start + iph_len; 6533 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 6534 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 6535 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 6536 nmp = nmp->b_cont; 6537 } 6538 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 6539 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 6540 6541 /* 6542 * If the lso header fits entirely within a single mblk fragment, 6543 * we'll avoid an additional copy of the lso header here and just 6544 * pass the b_rptr of the mblk directly. 6545 * 6546 * If this isn't true, we'd have to allocate for it explicitly. 
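 *
 * In other words, the header handed to the HCA for LSO is the IPoIB
 * header plus the IP and TCP headers, and it is copied only when it
 * spans mblk fragments. Roughly (illustrative only):
 *
 *	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
 *	if (lso->lso_hdr_sz <= MBLKL(mp))
 *		lso->lso_hdr = mp->b_rptr;	// borrow header in place
 *	else
 *		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
 *						// copied in below, freed
 *						// in ibd_free_lsohdr()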
6547 */ 6548 if (lso->lso_hdr_sz <= MBLKL(mp)) { 6549 lso->lso_hdr = mp->b_rptr; 6550 } else { 6551 /* On work completion, remember to free this allocated hdr */ 6552 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 6553 if (lso->lso_hdr == NULL) { 6554 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 6555 "sz = %d", lso->lso_hdr_sz); 6556 lso->lso_hdr_sz = 0; 6557 lso->lso_mss = 0; 6558 return (-1); 6559 } 6560 } 6561 6562 /* 6563 * Copy in the lso header only if we need to 6564 */ 6565 if (lso->lso_hdr != mp->b_rptr) { 6566 dst = lso->lso_hdr; 6567 pending = lso->lso_hdr_sz; 6568 6569 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 6570 mblen = MBLKL(nmp); 6571 if (pending > mblen) { 6572 bcopy(nmp->b_rptr, dst, mblen); 6573 dst += mblen; 6574 pending -= mblen; 6575 } else { 6576 bcopy(nmp->b_rptr, dst, pending); 6577 break; 6578 } 6579 } 6580 } 6581 6582 return (0); 6583 } 6584 6585 static void 6586 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 6587 { 6588 ibt_wr_lso_t *lso; 6589 6590 if ((!node) || (!mp)) 6591 return; 6592 6593 /* 6594 * Free any header space that we might've allocated if we 6595 * did an LSO 6596 */ 6597 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 6598 lso = &(node->w_swr.wr.ud_lso); 6599 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 6600 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 6601 lso->lso_hdr = NULL; 6602 lso->lso_hdr_sz = 0; 6603 } 6604 } 6605 } 6606 6607 static void 6608 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 6609 { 6610 uint_t i; 6611 uint_t num_posted; 6612 uint_t n_wrs; 6613 ibt_status_t ibt_status; 6614 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 6615 ibd_swqe_t *tx_head, *elem; 6616 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 6617 6618 /* post the one request, then check for more */ 6619 ibt_status = ibt_post_send(state->id_chnl_hdl, 6620 &node->w_swr, 1, NULL); 6621 if (ibt_status != IBT_SUCCESS) { 6622 ibd_print_warn(state, "ibd_post_send: " 6623 "posting one wr failed: ret=%d", ibt_status); 6624 ibd_tx_cleanup(state, node); 6625 } 6626 6627 tx_head = NULL; 6628 for (;;) { 6629 if (tx_head == NULL) { 6630 mutex_enter(&state->id_txpost_lock); 6631 tx_head = state->id_tx_head; 6632 if (tx_head == NULL) { 6633 state->id_tx_busy = 0; 6634 mutex_exit(&state->id_txpost_lock); 6635 return; 6636 } 6637 state->id_tx_head = NULL; 6638 mutex_exit(&state->id_txpost_lock); 6639 } 6640 6641 /* 6642 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 6643 * at a time if possible, and keep posting them. 6644 */ 6645 for (n_wrs = 0, elem = tx_head; 6646 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 6647 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 6648 nodes[n_wrs] = elem; 6649 wrs[n_wrs] = elem->w_swr; 6650 } 6651 tx_head = elem; 6652 6653 ASSERT(n_wrs != 0); 6654 6655 /* 6656 * If posting fails for some reason, we'll never receive 6657 * completion intimation, so we'll need to cleanup. But 6658 * we need to make sure we don't clean up nodes whose 6659 * wrs have been successfully posted. We assume that the 6660 * hca driver returns on the first failure to post and 6661 * therefore the first 'num_posted' entries don't need 6662 * cleanup here. 
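 *
 * For example, if n_wrs is 8 and the HCA accepts 5 work requests
 * before failing, ibt_post_send() sets num_posted to 5 and only
 * nodes[5] through nodes[7] are cleaned up below; the first 5 will be
 * reclaimed through the normal completion path.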
6663 */ 6664 num_posted = 0; 6665 ibt_status = ibt_post_send(state->id_chnl_hdl, 6666 wrs, n_wrs, &num_posted); 6667 if (ibt_status != IBT_SUCCESS) { 6668 ibd_print_warn(state, "ibd_post_send: " 6669 "posting multiple wrs failed: " 6670 "requested=%d, done=%d, ret=%d", 6671 n_wrs, num_posted, ibt_status); 6672 6673 for (i = num_posted; i < n_wrs; i++) 6674 ibd_tx_cleanup(state, nodes[i]); 6675 } 6676 } 6677 } 6678 6679 static int 6680 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 6681 uint_t lsohdr_sz) 6682 { 6683 ibt_wr_ds_t *sgl; 6684 ibt_status_t ibt_status; 6685 mblk_t *nmp; 6686 mblk_t *data_mp; 6687 uchar_t *bufp; 6688 size_t blksize; 6689 size_t skip; 6690 size_t avail; 6691 uint_t pktsize; 6692 uint_t frag_len; 6693 uint_t pending_hdr; 6694 int nmblks; 6695 int i; 6696 6697 /* 6698 * Let's skip ahead to the data if this is LSO 6699 */ 6700 data_mp = mp; 6701 pending_hdr = 0; 6702 if (lsohdr_sz) { 6703 pending_hdr = lsohdr_sz; 6704 for (nmp = mp; nmp; nmp = nmp->b_cont) { 6705 frag_len = nmp->b_wptr - nmp->b_rptr; 6706 if (frag_len > pending_hdr) 6707 break; 6708 pending_hdr -= frag_len; 6709 } 6710 data_mp = nmp; /* start of data past lso header */ 6711 ASSERT(data_mp != NULL); 6712 } 6713 6714 /* 6715 * Calculate the size of message data and number of msg blocks 6716 */ 6717 pktsize = 0; 6718 for (nmblks = 0, nmp = data_mp; nmp != NULL; 6719 nmp = nmp->b_cont, nmblks++) { 6720 pktsize += MBLKL(nmp); 6721 } 6722 pktsize -= pending_hdr; 6723 6724 /* 6725 * We only do ibt_map_mem_iov() if the pktsize is above the 6726 * "copy-threshold", and if the number of mp fragments is less than 6727 * the maximum acceptable. 6728 */ 6729 if ((state->id_hca_res_lkey_capab) && 6730 (pktsize > state->id_ud_tx_copy_thresh) && 6731 (nmblks < state->id_max_sqseg_hiwm)) { 6732 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6733 ibt_iov_attr_t iov_attr; 6734 6735 iov_attr.iov_as = NULL; 6736 iov_attr.iov = iov_arr; 6737 iov_attr.iov_buf = NULL; 6738 iov_attr.iov_list_len = nmblks; 6739 iov_attr.iov_wr_nds = state->id_max_sqseg; 6740 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 6741 iov_attr.iov_flags = IBT_IOV_SLEEP; 6742 6743 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 6744 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 6745 iov_arr[i].iov_len = MBLKL(nmp); 6746 if (i == 0) { 6747 iov_arr[i].iov_addr += pending_hdr; 6748 iov_arr[i].iov_len -= pending_hdr; 6749 } 6750 } 6751 6752 node->w_buftype = IBD_WQE_MAPPED; 6753 node->w_swr.wr_sgl = node->w_sgl; 6754 6755 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 6756 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 6757 if (ibt_status != IBT_SUCCESS) { 6758 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 6759 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 6760 goto ibd_copy_path; 6761 } 6762 6763 return (0); 6764 } 6765 6766 ibd_copy_path: 6767 if (pktsize <= state->id_tx_buf_sz) { 6768 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6769 node->w_swr.wr_nds = 1; 6770 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6771 node->w_buftype = IBD_WQE_TXBUF; 6772 6773 /* 6774 * Even though this is the copy path for transfers less than 6775 * id_tx_buf_sz, it could still be an LSO packet. If so, it 6776 * is possible the first data mblk fragment (data_mp) still 6777 * contains part of the LSO header that we need to skip. 
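 *
 * For reference, the three buffering strategies this routine chooses
 * among are, in decision order (condensed sketch, illustrative only;
 * "buftype" stands in for node->w_buftype):
 *
 *	if (state->id_hca_res_lkey_capab &&
 *	    pktsize > state->id_ud_tx_copy_thresh &&
 *	    nmblks < state->id_max_sqseg_hiwm) {
 *		buftype = IBD_WQE_MAPPED;	// ibt_map_mem_iov() chain
 *	} else if (pktsize <= state->id_tx_buf_sz) {
 *		buftype = IBD_WQE_TXBUF;	// copy into tx copybuf
 *	} else {
 *		buftype = IBD_WQE_LSOBUF;	// copy into LSO buffers
 *	}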
6778 */ 6779 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6780 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 6781 blksize = MBLKL(nmp) - pending_hdr; 6782 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 6783 bufp += blksize; 6784 pending_hdr = 0; 6785 } 6786 6787 return (0); 6788 } 6789 6790 /* 6791 * Copy path for transfers greater than id_tx_buf_sz 6792 */ 6793 node->w_swr.wr_sgl = node->w_sgl; 6794 if (ibd_acquire_lsobufs(state, pktsize, 6795 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 6796 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 6797 return (-1); 6798 } 6799 node->w_buftype = IBD_WQE_LSOBUF; 6800 6801 /* 6802 * Copy the larger-than-id_tx_buf_sz packet into a set of 6803 * fixed-sized, pre-mapped LSO buffers. Note that we might 6804 * need to skip part of the LSO header in the first fragment 6805 * as before. 6806 */ 6807 nmp = data_mp; 6808 skip = pending_hdr; 6809 for (i = 0; i < node->w_swr.wr_nds; i++) { 6810 sgl = node->w_swr.wr_sgl + i; 6811 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 6812 avail = IBD_LSO_BUFSZ; 6813 while (nmp && avail) { 6814 blksize = MBLKL(nmp) - skip; 6815 if (blksize > avail) { 6816 bcopy(nmp->b_rptr + skip, bufp, avail); 6817 skip += avail; 6818 avail = 0; 6819 } else { 6820 bcopy(nmp->b_rptr + skip, bufp, blksize); 6821 skip = 0; 6822 avail -= blksize; 6823 bufp += blksize; 6824 nmp = nmp->b_cont; 6825 } 6826 } 6827 } 6828 6829 return (0); 6830 } 6831 6832 /* 6833 * Schedule a completion queue polling to reap the resource we're 6834 * short on. If we implement the change to reap tx completions 6835 * in a separate thread, we'll need to wake up that thread here. 6836 */ 6837 static int 6838 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 6839 { 6840 ibd_req_t *req; 6841 6842 mutex_enter(&state->id_sched_lock); 6843 state->id_sched_needed |= resource_type; 6844 mutex_exit(&state->id_sched_lock); 6845 6846 /* 6847 * If we are asked to queue a work entry, we need to do it 6848 */ 6849 if (q_flag) { 6850 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6851 if (req == NULL) 6852 return (-1); 6853 6854 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 6855 } 6856 6857 return (0); 6858 } 6859 6860 /* 6861 * The passed in packet has this format: 6862 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 6863 */ 6864 static boolean_t 6865 ibd_send(ibd_state_t *state, mblk_t *mp) 6866 { 6867 ibd_ace_t *ace; 6868 ibd_swqe_t *node; 6869 ipoib_mac_t *dest; 6870 ib_header_info_t *ipibp; 6871 ip6_t *ip6h; 6872 uint_t pktsize; 6873 uint32_t mss; 6874 uint32_t hckflags; 6875 uint32_t lsoflags = 0; 6876 uint_t lsohdr_sz = 0; 6877 int ret, len; 6878 boolean_t dofree = B_FALSE; 6879 boolean_t rc; 6880 /* if (rc_chan == NULL) send by UD; else send by RC; */ 6881 ibd_rc_chan_t *rc_chan; 6882 int nmblks; 6883 mblk_t *nmp; 6884 6885 /* 6886 * If we aren't done with the device initialization and start, 6887 * we shouldn't be here. 6888 */ 6889 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6890 return (B_FALSE); 6891 6892 /* 6893 * Obtain an address handle for the destination. 
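 *
 * The lookup below also decides between the RC and UD transmit paths:
 * a packet goes out on a Reliable Connected channel only when RC is
 * enabled, the destination is unicast, and the address cache entry
 * already has a channel in the established state; everything else
 * falls back to the UD QP. Roughly (illustrative only; statistics
 * counters elided):
 *
 *	ace = ibd_acache_lookup(state, dest, &ret, 1);
 *	if (state->id_enable_rc && ace != NULL &&
 *	    ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN) &&
 *	    ace->ac_chan != NULL &&
 *	    ace->ac_chan->chan_state == IBD_RC_STATE_ACT_ESTAB)
 *		rc_chan = ace->ac_chan;		// send in RC mode
 *	else
 *		rc_chan = NULL;			// send in UD mode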
6894 */ 6895 ipibp = (ib_header_info_t *)mp->b_rptr; 6896 dest = (ipoib_mac_t *)&ipibp->ib_dst; 6897 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6898 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 6899 6900 rc_chan = NULL; 6901 ace = ibd_acache_lookup(state, dest, &ret, 1); 6902 if (state->id_enable_rc && (ace != NULL) && 6903 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 6904 if (ace->ac_chan == NULL) { 6905 state->rc_null_conn++; 6906 } else { 6907 if (ace->ac_chan->chan_state == 6908 IBD_RC_STATE_ACT_ESTAB) { 6909 rc_chan = ace->ac_chan; 6910 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 6911 node = WQE_TO_SWQE( 6912 rc_chan->tx_wqe_list.dl_head); 6913 if (node != NULL) { 6914 rc_chan->tx_wqe_list.dl_cnt -= 1; 6915 rc_chan->tx_wqe_list.dl_head = 6916 node->swqe_next; 6917 } else { 6918 node = ibd_rc_acquire_swqes(rc_chan); 6919 } 6920 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 6921 6922 if (node == NULL) { 6923 state->rc_swqe_short++; 6924 mutex_enter(&state->id_sched_lock); 6925 state->id_sched_needed |= 6926 IBD_RSRC_RC_SWQE; 6927 mutex_exit(&state->id_sched_lock); 6928 ibd_dec_ref_ace(state, ace); 6929 return (B_FALSE); 6930 } 6931 } else { 6932 state->rc_no_estab_conn++; 6933 } 6934 } 6935 } 6936 6937 if (rc_chan == NULL) { 6938 mutex_enter(&state->id_tx_list.dl_mutex); 6939 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 6940 if (node != NULL) { 6941 state->id_tx_list.dl_cnt -= 1; 6942 state->id_tx_list.dl_head = node->swqe_next; 6943 } else { 6944 node = ibd_acquire_swqe(state); 6945 } 6946 mutex_exit(&state->id_tx_list.dl_mutex); 6947 if (node == NULL) { 6948 /* 6949 * If we don't have an swqe available, schedule a 6950 * transmit completion queue cleanup and hold off on 6951 * sending more packets until we have some free swqes 6952 */ 6953 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 6954 if (ace != NULL) { 6955 ibd_dec_ref_ace(state, ace); 6956 } 6957 return (B_FALSE); 6958 } 6959 6960 /* 6961 * If a poll cannot be scheduled, we have no choice but 6962 * to drop this packet 6963 */ 6964 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 6965 if (ace != NULL) { 6966 ibd_dec_ref_ace(state, ace); 6967 } 6968 return (B_TRUE); 6969 } 6970 } 6971 6972 /* 6973 * Initialize the commonly used fields in swqe to NULL to protect 6974 * against ibd_tx_cleanup accidentally misinterpreting these on a 6975 * failure. 6976 */ 6977 node->swqe_im_mblk = NULL; 6978 node->w_swr.wr_nds = 0; 6979 node->w_swr.wr_sgl = NULL; 6980 node->w_swr.wr_opcode = IBT_WRC_SEND; 6981 6982 /* 6983 * Calculate the size of message data and number of msg blocks 6984 */ 6985 pktsize = 0; 6986 for (nmblks = 0, nmp = mp; nmp != NULL; 6987 nmp = nmp->b_cont, nmblks++) { 6988 pktsize += MBLKL(nmp); 6989 } 6990 6991 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6992 atomic_inc_64(&state->id_brd_xmt); 6993 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6994 atomic_inc_64(&state->id_multi_xmt); 6995 6996 if (ace != NULL) { 6997 node->w_ahandle = ace; 6998 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 6999 } else { 7000 DPRINT(5, 7001 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 7002 ((ret == EFAULT) ? 
"failed" : "queued"), 7003 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 7004 htonl(dest->ipoib_gidpref[1]), 7005 htonl(dest->ipoib_gidsuff[0]), 7006 htonl(dest->ipoib_gidsuff[1])); 7007 state->rc_ace_not_found++; 7008 node->w_ahandle = NULL; 7009 7010 /* 7011 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 7012 * can not find a path for the specific dest address. We 7013 * should get rid of this kind of packet. We also should get 7014 * rid of the packet if we cannot schedule a poll via the 7015 * async thread. For the normal case, ibd will return the 7016 * packet to upper layer and wait for AH creating. 7017 * 7018 * Note that we always queue a work slot entry for the async 7019 * thread when we fail AH lookup (even in intr mode); this is 7020 * due to the convoluted way the code currently looks for AH. 7021 */ 7022 if (ret == EFAULT) { 7023 dofree = B_TRUE; 7024 rc = B_TRUE; 7025 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 7026 dofree = B_TRUE; 7027 rc = B_TRUE; 7028 } else { 7029 dofree = B_FALSE; 7030 rc = B_FALSE; 7031 } 7032 goto ibd_send_fail; 7033 } 7034 7035 /* 7036 * For ND6 packets, padding is at the front of the source lladdr. 7037 * Insert the padding at front. 7038 */ 7039 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 7040 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 7041 if (!pullupmsg(mp, IPV6_HDR_LEN + 7042 sizeof (ib_header_info_t))) { 7043 DPRINT(10, "ibd_send: pullupmsg failure "); 7044 dofree = B_TRUE; 7045 rc = B_TRUE; 7046 goto ibd_send_fail; 7047 } 7048 ipibp = (ib_header_info_t *)mp->b_rptr; 7049 } 7050 ip6h = (ip6_t *)((uchar_t *)ipibp + 7051 sizeof (ib_header_info_t)); 7052 len = ntohs(ip6h->ip6_plen); 7053 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 7054 mblk_t *pad; 7055 7056 pad = allocb(4, 0); 7057 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 7058 linkb(mp, pad); 7059 if (MBLKL(mp) < sizeof (ib_header_info_t) + 7060 IPV6_HDR_LEN + len + 4) { 7061 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 7062 IPV6_HDR_LEN + len + 4)) { 7063 DPRINT(10, "ibd_send: pullupmsg " 7064 "failure "); 7065 dofree = B_TRUE; 7066 rc = B_TRUE; 7067 goto ibd_send_fail; 7068 } 7069 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 7070 sizeof (ib_header_info_t)); 7071 } 7072 7073 /* LINTED: E_CONSTANT_CONDITION */ 7074 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 7075 } 7076 } 7077 7078 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t)); 7079 mp->b_rptr += sizeof (ib_addrs_t); 7080 pktsize -= sizeof (ib_addrs_t); 7081 7082 if (rc_chan) { /* send in RC mode */ 7083 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 7084 ibt_iov_attr_t iov_attr; 7085 uint_t i; 7086 size_t blksize; 7087 uchar_t *bufp; 7088 ibd_rc_tx_largebuf_t *lbufp; 7089 7090 atomic_add_64(&state->rc_xmt_bytes, pktsize); 7091 7092 /* 7093 * Upper layer does Tx checksum, we don't need do any 7094 * checksum here. 7095 */ 7096 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV); 7097 7098 /* 7099 * We only do ibt_map_mem_iov() if the pktsize is above 7100 * the "copy-threshold", and if the number of mp 7101 * fragments is less than the maximum acceptable. 7102 */ 7103 if (pktsize <= state->id_rc_tx_copy_thresh) { 7104 atomic_inc_64(&state->rc_xmt_small_pkt); 7105 /* 7106 * Only process unicast packet in Reliable Connected 7107 * mode. 
7108 */ 7109 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 7110 node->w_swr.wr_nds = 1; 7111 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 7112 node->w_buftype = IBD_WQE_TXBUF; 7113 7114 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 7115 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7116 blksize = MBLKL(nmp); 7117 bcopy(nmp->b_rptr, bufp, blksize); 7118 bufp += blksize; 7119 } 7120 freemsg(mp); 7121 ASSERT(node->swqe_im_mblk == NULL); 7122 } else { 7123 if ((state->rc_enable_iov_map) && 7124 (nmblks < state->rc_max_sqseg_hiwm)) { 7125 7126 /* do ibt_map_mem_iov() */ 7127 iov_attr.iov_as = NULL; 7128 iov_attr.iov = iov_arr; 7129 iov_attr.iov_buf = NULL; 7130 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 7131 iov_attr.iov_lso_hdr_sz = 0; 7132 iov_attr.iov_flags = IBT_IOV_SLEEP; 7133 7134 i = 0; 7135 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7136 iov_arr[i].iov_len = MBLKL(nmp); 7137 if (iov_arr[i].iov_len != 0) { 7138 iov_arr[i].iov_addr = (caddr_t) 7139 (void *)nmp->b_rptr; 7140 i++; 7141 } 7142 } 7143 iov_attr.iov_list_len = i; 7144 node->w_swr.wr_sgl = node->w_sgl; 7145 7146 ret = ibt_map_mem_iov(state->id_hca_hdl, 7147 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 7148 &node->w_mi_hdl); 7149 if (ret != IBT_SUCCESS) { 7150 atomic_inc_64( 7151 &state->rc_xmt_map_fail_pkt); 7152 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 7153 ") failed, nmblks=%d, real_nmblks" 7154 "=%d, ret=0x%x", nmblks, i, ret); 7155 goto ibd_rc_large_copy; 7156 } 7157 7158 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 7159 node->w_buftype = IBD_WQE_MAPPED; 7160 node->swqe_im_mblk = mp; 7161 } else { 7162 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 7163 ibd_rc_large_copy: 7164 mutex_enter(&state->rc_tx_large_bufs_lock); 7165 if (state->rc_tx_largebuf_nfree == 0) { 7166 state->rc_xmt_buf_short++; 7167 mutex_exit 7168 (&state->rc_tx_large_bufs_lock); 7169 mutex_enter(&state->id_sched_lock); 7170 state->id_sched_needed |= 7171 IBD_RSRC_RC_TX_LARGEBUF; 7172 mutex_exit(&state->id_sched_lock); 7173 dofree = B_FALSE; 7174 rc = B_FALSE; 7175 /* 7176 * If we don't have Tx large bufs, 7177 * return failure. 
node->w_buftype 7178 * should not be IBD_WQE_RC_COPYBUF, 7179 * otherwise it will cause problems 7180 * in ibd_rc_tx_cleanup() 7181 */ 7182 node->w_buftype = IBD_WQE_TXBUF; 7183 goto ibd_send_fail; 7184 } 7185 7186 lbufp = state->rc_tx_largebuf_free_head; 7187 ASSERT(lbufp->lb_buf != NULL); 7188 state->rc_tx_largebuf_free_head = 7189 lbufp->lb_next; 7190 lbufp->lb_next = NULL; 7191 /* Update nfree count */ 7192 state->rc_tx_largebuf_nfree --; 7193 mutex_exit(&state->rc_tx_large_bufs_lock); 7194 bufp = lbufp->lb_buf; 7195 node->w_sgl[0].ds_va = 7196 (ib_vaddr_t)(uintptr_t)bufp; 7197 node->w_sgl[0].ds_key = 7198 state->rc_tx_mr_desc.md_lkey; 7199 node->w_sgl[0].ds_len = pktsize; 7200 node->w_swr.wr_sgl = node->w_sgl; 7201 node->w_swr.wr_nds = 1; 7202 node->w_buftype = IBD_WQE_RC_COPYBUF; 7203 node->w_rc_tx_largebuf = lbufp; 7204 7205 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7206 blksize = MBLKL(nmp); 7207 if (blksize != 0) { 7208 bcopy(nmp->b_rptr, bufp, 7209 blksize); 7210 bufp += blksize; 7211 } 7212 } 7213 freemsg(mp); 7214 ASSERT(node->swqe_im_mblk == NULL); 7215 } 7216 } 7217 7218 node->swqe_next = NULL; 7219 mutex_enter(&rc_chan->tx_post_lock); 7220 if (rc_chan->tx_busy) { 7221 if (rc_chan->tx_head) { 7222 rc_chan->tx_tail->swqe_next = 7223 SWQE_TO_WQE(node); 7224 } else { 7225 rc_chan->tx_head = node; 7226 } 7227 rc_chan->tx_tail = node; 7228 mutex_exit(&rc_chan->tx_post_lock); 7229 } else { 7230 rc_chan->tx_busy = 1; 7231 mutex_exit(&rc_chan->tx_post_lock); 7232 ibd_rc_post_send(rc_chan, node); 7233 } 7234 7235 return (B_TRUE); 7236 } /* send by RC */ 7237 7238 if ((state->id_enable_rc) && (pktsize > state->id_mtu)) { 7239 /* 7240 * Packet too long. The packet size from GLD should be <= 7241 * state->id_mtu + sizeof (ib_addrs_t) 7242 */ 7243 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) { 7244 ibd_req_t *req; 7245 7246 mutex_enter(&ace->tx_too_big_mutex); 7247 if (ace->tx_too_big_ongoing) { 7248 mutex_exit(&ace->tx_too_big_mutex); 7249 state->rc_xmt_reenter_too_long_pkt++; 7250 dofree = B_TRUE; 7251 } else { 7252 ace->tx_too_big_ongoing = B_TRUE; 7253 mutex_exit(&ace->tx_too_big_mutex); 7254 state->rc_xmt_icmp_too_long_pkt++; 7255 7256 req = kmem_cache_alloc(state->id_req_kmc, 7257 KM_NOSLEEP); 7258 if (req == NULL) { 7259 ibd_print_warn(state, "ibd_send: alloc " 7260 "ibd_req_t fail"); 7261 /* Drop it. */ 7262 dofree = B_TRUE; 7263 } else { 7264 req->rq_ptr = mp; 7265 req->rq_ptr2 = ace; 7266 ibd_queue_work_slot(state, req, 7267 IBD_ASYNC_RC_TOO_BIG); 7268 dofree = B_FALSE; 7269 } 7270 } 7271 } else { 7272 ibd_print_warn(state, "Reliable Connected mode is on. " 7273 "Multicast packet length %d > %d is too long to " 7274 "send, drop it", 7275 pktsize, state->id_mtu); 7276 state->rc_xmt_drop_too_long_pkt++; 7277 /* Drop it. */ 7278 dofree = B_TRUE; 7279 } 7280 rc = B_TRUE; 7281 goto ibd_send_fail; 7282 } 7283 7284 atomic_add_64(&state->id_xmt_bytes, pktsize); 7285 atomic_inc_64(&state->id_xmt_pkt); 7286 7287 /* 7288 * Do LSO and checksum related work here. For LSO send, adjust the 7289 * ud destination, the opcode and the LSO header information to the 7290 * work request.
7291 */ 7292 mac_lso_get(mp, &mss, &lsoflags); 7293 if ((lsoflags & HW_LSO) != HW_LSO) { 7294 node->w_swr.wr_opcode = IBT_WRC_SEND; 7295 lsohdr_sz = 0; 7296 } else { 7297 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 7298 /* 7299 * The routine can only fail if there's no memory; we 7300 * can only drop the packet if this happens 7301 */ 7302 ibd_print_warn(state, 7303 "ibd_send: no memory, lso posting failed"); 7304 dofree = B_TRUE; 7305 rc = B_TRUE; 7306 goto ibd_send_fail; 7307 } 7308 7309 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 7310 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 7311 } 7312 7313 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); 7314 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 7315 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 7316 else 7317 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 7318 7319 /* 7320 * Prepare the sgl for posting; the routine can only fail if there's 7321 * no lso buf available for posting. If this is the case, we should 7322 * probably resched for lso bufs to become available and then try again. 7323 */ 7324 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 7325 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 7326 dofree = B_TRUE; 7327 rc = B_TRUE; 7328 } else { 7329 dofree = B_FALSE; 7330 rc = B_FALSE; 7331 } 7332 goto ibd_send_fail; 7333 } 7334 node->swqe_im_mblk = mp; 7335 7336 /* 7337 * Queue the wqe to hardware; since we can now simply queue a 7338 * post instead of doing it serially, we cannot assume anything 7339 * about the 'node' after ibd_post_send() returns. 7340 */ 7341 node->swqe_next = NULL; 7342 7343 mutex_enter(&state->id_txpost_lock); 7344 if (state->id_tx_busy) { 7345 if (state->id_tx_head) { 7346 state->id_tx_tail->swqe_next = 7347 SWQE_TO_WQE(node); 7348 } else { 7349 state->id_tx_head = node; 7350 } 7351 state->id_tx_tail = node; 7352 mutex_exit(&state->id_txpost_lock); 7353 } else { 7354 state->id_tx_busy = 1; 7355 mutex_exit(&state->id_txpost_lock); 7356 ibd_post_send(state, node); 7357 } 7358 7359 return (B_TRUE); 7360 7361 ibd_send_fail: 7362 if (node && mp) 7363 ibd_free_lsohdr(node, mp); 7364 7365 if (dofree) 7366 freemsg(mp); 7367 7368 if (node != NULL) { 7369 if (rc_chan) { 7370 ibd_rc_tx_cleanup(node); 7371 } else { 7372 ibd_tx_cleanup(state, node); 7373 } 7374 } 7375 7376 return (rc); 7377 } 7378 7379 /* 7380 * GLDv3 entry point for transmitting datagram. 7381 */ 7382 static mblk_t * 7383 ibd_m_tx(void *arg, mblk_t *mp) 7384 { 7385 ibd_state_t *state = (ibd_state_t *)arg; 7386 mblk_t *next; 7387 7388 if (state->id_type == IBD_PORT_DRIVER) { 7389 freemsgchain(mp); 7390 return (NULL); 7391 } 7392 7393 if ((state->id_link_state != LINK_STATE_UP) || 7394 !(state->id_mac_state & IBD_DRV_STARTED)) { 7395 freemsgchain(mp); 7396 mp = NULL; 7397 } 7398 7399 while (mp != NULL) { 7400 next = mp->b_next; 7401 mp->b_next = NULL; 7402 if (ibd_send(state, mp) == B_FALSE) { 7403 /* Send fail */ 7404 mp->b_next = next; 7405 break; 7406 } 7407 mp = next; 7408 } 7409 7410 return (mp); 7411 } 7412 7413 /* 7414 * this handles Tx and Rx completions. With separate CQs, this handles 7415 * only Rx completions. 
7416 */ 7417 static uint_t 7418 ibd_intr(caddr_t arg) 7419 { 7420 ibd_state_t *state = (ibd_state_t *)arg; 7421 7422 ibd_poll_rcq(state, state->id_rcq_hdl); 7423 7424 return (DDI_INTR_CLAIMED); 7425 } 7426 7427 /* 7428 * Poll and fully drain the send cq 7429 */ 7430 static void 7431 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7432 { 7433 ibt_wc_t *wcs = state->id_txwcs; 7434 uint_t numwcs = state->id_txwcs_size; 7435 ibd_wqe_t *wqe; 7436 ibd_swqe_t *head, *tail; 7437 ibt_wc_t *wc; 7438 uint_t num_polled; 7439 int i; 7440 7441 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7442 head = tail = NULL; 7443 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7444 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 7445 if (wc->wc_status != IBT_WC_SUCCESS) { 7446 /* 7447 * Channel being torn down. 7448 */ 7449 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7450 DPRINT(5, "ibd_drain_scq: flush error"); 7451 DPRINT(10, "ibd_drain_scq: Bad " 7452 "status %d", wc->wc_status); 7453 } else { 7454 DPRINT(10, "ibd_drain_scq: " 7455 "unexpected wc_status %d", 7456 wc->wc_status); 7457 } 7458 /* 7459 * Fallthrough to invoke the Tx handler to 7460 * release held resources, e.g., AH refcount. 7461 */ 7462 } 7463 /* 7464 * Add this swqe to the list to be cleaned up. 7465 */ 7466 if (head) 7467 tail->swqe_next = wqe; 7468 else 7469 head = WQE_TO_SWQE(wqe); 7470 tail = WQE_TO_SWQE(wqe); 7471 } 7472 tail->swqe_next = NULL; 7473 ibd_tx_cleanup_list(state, head, tail); 7474 7475 /* 7476 * Resume any blocked transmissions if possible 7477 */ 7478 ibd_resume_transmission(state); 7479 } 7480 } 7481 7482 /* 7483 * Poll and fully drain the receive cq 7484 */ 7485 static void 7486 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7487 { 7488 ibt_wc_t *wcs = state->id_rxwcs; 7489 uint_t numwcs = state->id_rxwcs_size; 7490 ibd_rwqe_t *rwqe; 7491 ibt_wc_t *wc; 7492 uint_t num_polled; 7493 int i; 7494 mblk_t *head, *tail, *mp; 7495 7496 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7497 head = tail = NULL; 7498 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7499 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 7500 if (wc->wc_status != IBT_WC_SUCCESS) { 7501 /* 7502 * Channel being torn down. 7503 */ 7504 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7505 DPRINT(5, "ibd_drain_rcq: " 7506 "expected flushed rwqe"); 7507 } else { 7508 DPRINT(5, "ibd_drain_rcq: " 7509 "unexpected wc_status %d", 7510 wc->wc_status); 7511 } 7512 atomic_inc_32( 7513 &state->id_rx_list.dl_bufs_outstanding); 7514 freemsg(rwqe->rwqe_im_mblk); 7515 continue; 7516 } 7517 mp = ibd_process_rx(state, rwqe, wc); 7518 if (mp == NULL) 7519 continue; 7520 7521 /* 7522 * Add this mp to the list to send to the nw layer. 7523 */ 7524 if (head) 7525 tail->b_next = mp; 7526 else 7527 head = mp; 7528 tail = mp; 7529 } 7530 if (head) 7531 mac_rx(state->id_mh, state->id_rh, head); 7532 7533 /* 7534 * Account for #rwqes polled. 7535 * Post more here, if less than one fourth full. 7536 */ 7537 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 7538 (state->id_ud_num_rwqe / 4)) 7539 ibd_post_recv_intr(state); 7540 } 7541 } 7542 7543 /* 7544 * Common code for interrupt handling as well as for polling 7545 * for all completed wqe's while detaching. 
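 *
 * Only one thread drains the send CQ at a time. A rough sketch of the
 * serialization implemented below, where IBD_CQ_POLLING marks the
 * active poller and IBD_REDO_CQ_POLLING records a request that
 * arrived while a drain was already in progress:
 *
 *	lock; if (busy & POLLING) { busy |= REDO; unlock; return; }
 *	busy |= POLLING; unlock;
 *	drain the CQ;
 *	do {
 *		re-arm CQ notification; drain the CQ again;
 *		lock;
 *		if (busy & REDO) clear REDO and go around once more;
 *		else clear POLLING and stop;
 *		unlock;
 *	} while (not stopped);
 *
 * The extra drain after re-arming closes the window between the last
 * poll and ibt_enable_cq_notify(), so a completion arriving in that
 * window is not lost.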
7546 */ 7547 static void 7548 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7549 { 7550 int flag, redo_flag; 7551 int redo = 1; 7552 7553 flag = IBD_CQ_POLLING; 7554 redo_flag = IBD_REDO_CQ_POLLING; 7555 7556 mutex_enter(&state->id_scq_poll_lock); 7557 if (state->id_scq_poll_busy & flag) { 7558 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 7559 state->id_scq_poll_busy |= redo_flag; 7560 mutex_exit(&state->id_scq_poll_lock); 7561 return; 7562 } 7563 state->id_scq_poll_busy |= flag; 7564 mutex_exit(&state->id_scq_poll_lock); 7565 7566 /* 7567 * In some cases (eg detaching), this code can be invoked on 7568 * any cpu after disabling cq notification (thus no concurrency 7569 * exists). Apart from that, the following applies normally: 7570 * Transmit completion handling could be from any cpu if 7571 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 7572 * is interrupt driven. 7573 */ 7574 7575 /* 7576 * Poll and drain the CQ 7577 */ 7578 ibd_drain_scq(state, cq_hdl); 7579 7580 /* 7581 * Enable CQ notifications and redrain the cq to catch any 7582 * completions we might have missed after the ibd_drain_scq() 7583 * above and before the ibt_enable_cq_notify() that follows. 7584 * Finally, service any new requests to poll the cq that 7585 * could've come in after the ibt_enable_cq_notify(). 7586 */ 7587 do { 7588 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 7589 IBT_SUCCESS) { 7590 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7591 } 7592 7593 ibd_drain_scq(state, cq_hdl); 7594 7595 mutex_enter(&state->id_scq_poll_lock); 7596 if (state->id_scq_poll_busy & redo_flag) 7597 state->id_scq_poll_busy &= ~redo_flag; 7598 else { 7599 state->id_scq_poll_busy &= ~flag; 7600 redo = 0; 7601 } 7602 mutex_exit(&state->id_scq_poll_lock); 7603 7604 } while (redo); 7605 } 7606 7607 /* 7608 * Common code for interrupt handling as well as for polling 7609 * for all completed wqe's while detaching. 7610 */ 7611 static void 7612 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 7613 { 7614 int flag, redo_flag; 7615 int redo = 1; 7616 7617 flag = IBD_CQ_POLLING; 7618 redo_flag = IBD_REDO_CQ_POLLING; 7619 7620 mutex_enter(&state->id_rcq_poll_lock); 7621 if (state->id_rcq_poll_busy & flag) { 7622 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 7623 state->id_rcq_poll_busy |= redo_flag; 7624 mutex_exit(&state->id_rcq_poll_lock); 7625 return; 7626 } 7627 state->id_rcq_poll_busy |= flag; 7628 mutex_exit(&state->id_rcq_poll_lock); 7629 7630 /* 7631 * Poll and drain the CQ 7632 */ 7633 ibd_drain_rcq(state, rcq); 7634 7635 /* 7636 * Enable CQ notifications and redrain the cq to catch any 7637 * completions we might have missed after the ibd_drain_cq() 7638 * above and before the ibt_enable_cq_notify() that follows. 7639 * Finally, service any new requests to poll the cq that 7640 * could've come in after the ibt_enable_cq_notify(). 7641 */ 7642 do { 7643 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 7644 IBT_SUCCESS) { 7645 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7646 } 7647 7648 ibd_drain_rcq(state, rcq); 7649 7650 mutex_enter(&state->id_rcq_poll_lock); 7651 if (state->id_rcq_poll_busy & redo_flag) 7652 state->id_rcq_poll_busy &= ~redo_flag; 7653 else { 7654 state->id_rcq_poll_busy &= ~flag; 7655 redo = 0; 7656 } 7657 mutex_exit(&state->id_rcq_poll_lock); 7658 7659 } while (redo); 7660 } 7661 7662 /* 7663 * Unmap the memory area associated with a given swqe. 
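 *
 * This releases the memory-IOV mapping recorded in w_mi_hdl via
 * ibt_unmap_mem_iov() (the IBD_WQE_MAPPED case handled by the Tx
 * cleanup paths) and clears wr_nds so the swqe's SGL is rebuilt on
 * its next use.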
7664 */ 7665 void 7666 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 7667 { 7668 ibt_status_t stat; 7669 7670 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 7671 7672 if (swqe->w_mi_hdl) { 7673 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 7674 swqe->w_mi_hdl)) != IBT_SUCCESS) { 7675 DPRINT(10, 7676 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 7677 } 7678 swqe->w_mi_hdl = NULL; 7679 } 7680 swqe->w_swr.wr_nds = 0; 7681 } 7682 7683 void 7684 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 7685 { 7686 /* 7687 * The recycling logic can be eliminated from here 7688 * and put into the async thread if we create another 7689 * list to hold ACE's for unjoined mcg's. 7690 */ 7691 if (DEC_REF_DO_CYCLE(ace)) { 7692 ibd_mce_t *mce; 7693 7694 /* 7695 * Check with the lock taken: we decremented 7696 * reference count without the lock, and some 7697 * transmitter might already have bumped the 7698 * reference count (possible in case of multicast 7699 * disable when we leave the AH on the active 7700 * list). If not still 0, get out, leaving the 7701 * recycle bit intact. 7702 * 7703 * Atomically transition the AH from active 7704 * to free list, and queue a work request to 7705 * leave the group and destroy the mce. No 7706 * transmitter can be looking at the AH or 7707 * the MCE in between, since we have the 7708 * ac_mutex lock. In the SendOnly reap case, 7709 * it is not necessary to hold the ac_mutex 7710 * and recheck the ref count (since the AH was 7711 * taken off the active list), we just do it 7712 * to have uniform processing with the Full 7713 * reap case. 7714 */ 7715 mutex_enter(&state->id_ac_mutex); 7716 mce = ace->ac_mce; 7717 if (GET_REF_CYCLE(ace) == 0) { 7718 CLEAR_REFCYCLE(ace); 7719 /* 7720 * Identify the case of fullmember reap as 7721 * opposed to mcg trap reap. Also, port up 7722 * might set ac_mce to NULL to indicate Tx 7723 * cleanup should do no more than put the 7724 * AH in the free list (see ibd_async_link). 7725 */ 7726 if (mce != NULL) { 7727 ace->ac_mce = NULL; 7728 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 7729 /* 7730 * mc_req was initialized at mce 7731 * creation time. 7732 */ 7733 ibd_queue_work_slot(state, 7734 &mce->mc_req, IBD_ASYNC_REAP); 7735 } 7736 IBD_ACACHE_INSERT_FREE(state, ace); 7737 } 7738 mutex_exit(&state->id_ac_mutex); 7739 } 7740 } 7741 7742 /* 7743 * Common code that deals with clean ups after a successful or 7744 * erroneous transmission attempt. 7745 */ 7746 static void 7747 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 7748 { 7749 ibd_ace_t *ace = swqe->w_ahandle; 7750 7751 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 7752 7753 /* 7754 * If this was a dynamic mapping in ibd_send(), we need to 7755 * unmap here. If this was an lso buffer we'd used for sending, 7756 * we need to release the lso buf to the pool, since the resource 7757 * is scarce. However, if this was simply a normal send using 7758 * the copybuf (present in each swqe), we don't need to release it. 7759 */ 7760 if (swqe->swqe_im_mblk != NULL) { 7761 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7762 ibd_unmap_mem(state, swqe); 7763 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7764 ibd_release_lsobufs(state, 7765 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7766 } 7767 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7768 freemsg(swqe->swqe_im_mblk); 7769 swqe->swqe_im_mblk = NULL; 7770 } 7771 7772 /* 7773 * Drop the reference count on the AH; it can be reused 7774 * now for a different destination if there are no more 7775 * posted sends that will use it. 
This can be eliminated 7776 * if we can always associate each Tx buffer with an AH. 7777 * The ace can be null if we are cleaning up from the 7778 * ibd_send() error path. 7779 */ 7780 if (ace != NULL) { 7781 ibd_dec_ref_ace(state, ace); 7782 } 7783 7784 /* 7785 * Release the send wqe for reuse. 7786 */ 7787 swqe->swqe_next = NULL; 7788 ibd_release_swqe(state, swqe, swqe, 1); 7789 } 7790 7791 static void 7792 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 7793 { 7794 ibd_ace_t *ace; 7795 ibd_swqe_t *swqe; 7796 int n = 0; 7797 7798 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 7799 7800 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 7801 7802 /* 7803 * If this was a dynamic mapping in ibd_send(), we need to 7804 * unmap here. If this was an lso buffer we'd used for sending, 7805 * we need to release the lso buf to the pool, since the 7806 * resource is scarce. However, if this was simply a normal 7807 * send using the copybuf (present in each swqe), we don't need 7808 * to release it. 7809 */ 7810 if (swqe->swqe_im_mblk != NULL) { 7811 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7812 ibd_unmap_mem(state, swqe); 7813 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7814 ibd_release_lsobufs(state, 7815 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7816 } 7817 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7818 freemsg(swqe->swqe_im_mblk); 7819 swqe->swqe_im_mblk = NULL; 7820 } 7821 7822 /* 7823 * Drop the reference count on the AH; it can be reused 7824 * now for a different destination if there are no more 7825 * posted sends that will use it. This can be eliminated 7826 * if we can always associate each Tx buffer with an AH. 7827 * The ace can be null if we are cleaning up from the 7828 * ibd_send() error path. 7829 */ 7830 ace = swqe->w_ahandle; 7831 if (ace != NULL) { 7832 ibd_dec_ref_ace(state, ace); 7833 } 7834 n++; 7835 } 7836 7837 /* 7838 * Release the send wqes for reuse. 7839 */ 7840 ibd_release_swqe(state, head, tail, n); 7841 } 7842 7843 /* 7844 * Processing to be done after receipt of a packet; hand off to GLD 7845 * in the format expected by GLD. The received packet has this 7846 * format: 2b sap :: 00 :: data. 7847 */ 7848 static mblk_t * 7849 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 7850 { 7851 ib_header_info_t *phdr; 7852 mblk_t *mp; 7853 ipoib_hdr_t *ipibp; 7854 ipha_t *iphap; 7855 ip6_t *ip6h; 7856 int len; 7857 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 7858 uint32_t bufs; 7859 7860 /* 7861 * Track number handed to upper layer that need to be returned. 7862 */ 7863 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 7864 7865 /* Never run out of rwqes, use allocb when running low */ 7866 if (bufs >= state->id_rx_bufs_outstanding_limit) { 7867 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 7868 atomic_inc_32(&state->id_rx_allocb); 7869 mp = allocb(pkt_len, BPRI_HI); 7870 if (mp) { 7871 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 7872 ibd_post_recv(state, rwqe); 7873 } else { /* no memory */ 7874 atomic_inc_32(&state->id_rx_allocb_failed); 7875 ibd_post_recv(state, rwqe); 7876 return (NULL); 7877 } 7878 } else { 7879 mp = rwqe->rwqe_im_mblk; 7880 } 7881 7882 7883 /* 7884 * Adjust write pointer depending on how much data came in. 7885 */ 7886 mp->b_wptr = mp->b_rptr + pkt_len; 7887 7888 /* 7889 * Make sure this is NULL or we're in trouble. 
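 *
 * A non-NULL b_next here would mean this mblk is still linked into a
 * chain; warn and break the stale link, since ibd_drain_rcq() uses
 * b_next to build its own chain for mac_rx().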
7890 */ 7891 if (mp->b_next != NULL) { 7892 ibd_print_warn(state, 7893 "ibd_process_rx: got duplicate mp from rcq?"); 7894 mp->b_next = NULL; 7895 } 7896 7897 /* 7898 * the IB link will deliver one of the IB link layer 7899 * headers called, the Global Routing Header (GRH). 7900 * ibd driver uses the information in GRH to build the 7901 * Header_info structure and pass it with the datagram up 7902 * to GLDv3. 7903 * If the GRH is not valid, indicate to GLDv3 by setting 7904 * the VerTcFlow field to 0. 7905 */ 7906 phdr = (ib_header_info_t *)mp->b_rptr; 7907 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 7908 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 7909 7910 /* if it is loop back packet, just drop it. */ 7911 if (state->id_enable_rc) { 7912 if (bcmp(&phdr->ib_grh.ipoib_sqpn, 7913 &state->rc_macaddr_loopback, 7914 IPOIB_ADDRL) == 0) { 7915 freemsg(mp); 7916 return (NULL); 7917 } 7918 } else { 7919 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 7920 IPOIB_ADDRL) == 0) { 7921 freemsg(mp); 7922 return (NULL); 7923 } 7924 } 7925 7926 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 7927 sizeof (ipoib_mac_t)); 7928 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 7929 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 7930 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 7931 } else { 7932 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 7933 } 7934 } else { 7935 /* 7936 * It can not be a IBA multicast packet. Must have been 7937 * unicast for us. Just copy the interface address to dst. 7938 */ 7939 phdr->ib_grh.ipoib_vertcflow = 0; 7940 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 7941 sizeof (ipoib_mac_t)); 7942 } 7943 7944 /* 7945 * For ND6 packets, padding is at the front of the source/target 7946 * lladdr. However the inet6 layer is not aware of it, hence remove 7947 * the padding from such packets. 7948 */ 7949 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 7950 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 7951 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 7952 len = ntohs(ip6h->ip6_plen); 7953 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 7954 /* LINTED: E_CONSTANT_CONDITION */ 7955 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 7956 } 7957 } 7958 7959 /* 7960 * Update statistics 7961 */ 7962 atomic_add_64(&state->id_rcv_bytes, pkt_len); 7963 atomic_inc_64(&state->id_rcv_pkt); 7964 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 7965 atomic_inc_64(&state->id_brd_rcv); 7966 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 7967 atomic_inc_64(&state->id_multi_rcv); 7968 7969 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 7970 /* 7971 * Set receive checksum status in mp 7972 * Hardware checksumming can be considered valid only if: 7973 * 1. CQE.IP_OK bit is set 7974 * 2. CQE.CKSUM = 0xffff 7975 * 3. IPv6 routing header is not present in the packet 7976 * 4. If there are no IP_OPTIONS in the IP HEADER 7977 */ 7978 7979 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 7980 (wc->wc_cksum == 0xFFFF) && 7981 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 7982 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 7983 } 7984 7985 return (mp); 7986 } 7987 7988 /* 7989 * Callback code invoked from STREAMs when the receive data buffer is 7990 * free for recycling. 
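 *
 * The callback re-wraps the receive copybuf in a fresh mblk via
 * desballoc() and reposts the rwqe. If the driver has stopped
 * (id_running is zero) or desballoc() fails, the rwqe is freed
 * instead of being reposted.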
7991 */ 7992 static void 7993 ibd_freemsg_cb(char *arg) 7994 { 7995 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 7996 ibd_state_t *state = rwqe->w_state; 7997 7998 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 7999 8000 /* 8001 * If the driver is stopped, just free the rwqe. 8002 */ 8003 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 8004 DPRINT(6, "ibd_freemsg: wqe being freed"); 8005 rwqe->rwqe_im_mblk = NULL; 8006 ibd_free_rwqe(state, rwqe); 8007 return; 8008 } 8009 8010 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 8011 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 8012 if (rwqe->rwqe_im_mblk == NULL) { 8013 ibd_free_rwqe(state, rwqe); 8014 DPRINT(6, "ibd_freemsg: desballoc failed"); 8015 return; 8016 } 8017 8018 ibd_post_recv(state, rwqe); 8019 } 8020 8021 static uint_t 8022 ibd_tx_recycle(caddr_t arg) 8023 { 8024 ibd_state_t *state = (ibd_state_t *)arg; 8025 8026 /* 8027 * Poll for completed entries 8028 */ 8029 ibd_poll_scq(state, state->id_scq_hdl); 8030 8031 return (DDI_INTR_CLAIMED); 8032 } 8033 8034 #ifdef IBD_LOGGING 8035 static void 8036 ibd_log_init(void) 8037 { 8038 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 8039 ibd_lbuf_ndx = 0; 8040 8041 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 8042 } 8043 8044 static void 8045 ibd_log_fini(void) 8046 { 8047 if (ibd_lbuf) 8048 kmem_free(ibd_lbuf, IBD_LOG_SZ); 8049 ibd_lbuf_ndx = 0; 8050 ibd_lbuf = NULL; 8051 8052 mutex_destroy(&ibd_lbuf_lock); 8053 } 8054 8055 static void 8056 ibd_log(const char *fmt, ...) 8057 { 8058 va_list ap; 8059 uint32_t off; 8060 uint32_t msglen; 8061 char tmpbuf[IBD_DMAX_LINE]; 8062 8063 if (ibd_lbuf == NULL) 8064 return; 8065 8066 va_start(ap, fmt); 8067 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 8068 va_end(ap); 8069 8070 if (msglen >= IBD_DMAX_LINE) 8071 msglen = IBD_DMAX_LINE - 1; 8072 8073 mutex_enter(&ibd_lbuf_lock); 8074 8075 off = ibd_lbuf_ndx; /* current msg should go here */ 8076 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 8077 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 8078 8079 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 8080 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 8081 8082 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 8083 ibd_lbuf_ndx = 0; 8084 8085 mutex_exit(&ibd_lbuf_lock); 8086 8087 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 8088 } 8089 #endif 8090 8091 /* ARGSUSED */ 8092 static int 8093 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8094 int *rvalp) 8095 { 8096 ibd_create_ioctl_t *cmd = karg; 8097 ibd_state_t *state, *port_state, *p; 8098 int i, err, rval = 0; 8099 mac_register_t *macp; 8100 ibt_hca_portinfo_t *pinfop = NULL; 8101 ibt_status_t ibt_status; 8102 uint_t psize, pinfosz; 8103 boolean_t force_create = B_FALSE; 8104 8105 cmd->ibdioc.ioc_status = 0; 8106 8107 if (cmd->ibdioc.ioc_port_inst < 0) { 8108 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8109 return (EINVAL); 8110 } 8111 port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst); 8112 if (port_state == NULL) { 8113 DPRINT(10, "ibd_create_partition: failed to get state %d", 8114 cmd->ibdioc.ioc_port_inst); 8115 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8116 return (EINVAL); 8117 } 8118 8119 /* Limited PKeys not supported */ 8120 if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) { 8121 rval = EINVAL; 8122 goto part_create_return; 8123 } 8124 8125 if (cmd->ioc_force_create == 0) { 8126 /* 8127 * Check if the port pkey table contains the pkey for which 8128 * 
this partition is being created. 8129 */ 8130 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8131 port_state->id_port, &pinfop, &psize, &pinfosz); 8132 8133 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8134 rval = EINVAL; 8135 goto part_create_return; 8136 } 8137 8138 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) { 8139 rval = ENETDOWN; 8140 cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN; 8141 goto part_create_return; 8142 } 8143 8144 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) { 8145 if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) { 8146 break; 8147 } 8148 } 8149 if (i == pinfop->p_pkey_tbl_sz) { 8150 rval = EINVAL; 8151 cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT; 8152 goto part_create_return; 8153 } 8154 } else { 8155 force_create = B_TRUE; 8156 } 8157 8158 mutex_enter(&ibd_objlist_lock); 8159 for (p = ibd_objlist_head; p; p = p->id_next) { 8160 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) && 8161 (p->id_pkey == cmd->ioc_pkey)) { 8162 mutex_exit(&ibd_objlist_lock); 8163 rval = EEXIST; 8164 cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS; 8165 goto part_create_return; 8166 } 8167 } 8168 mutex_exit(&ibd_objlist_lock); 8169 8170 state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP); 8171 8172 state->id_type = IBD_PARTITION_OBJ; 8173 8174 state->id_plinkid = cmd->ioc_partid; 8175 state->id_dlinkid = cmd->ibdioc.ioc_linkid; 8176 state->id_port_inst = cmd->ibdioc.ioc_port_inst; 8177 8178 state->id_dip = port_state->id_dip; 8179 state->id_port = port_state->id_port; 8180 state->id_pkey = cmd->ioc_pkey; 8181 state->id_hca_guid = port_state->id_hca_guid; 8182 state->id_port_guid = port_state->id_port_guid; 8183 state->id_force_create = force_create; 8184 8185 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 8186 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 8187 8188 if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) { 8189 rval = EIO; 8190 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE; 8191 goto fail; 8192 } 8193 8194 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 8195 rval = EAGAIN; 8196 goto fail; 8197 } 8198 8199 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 8200 macp->m_dip = port_state->id_dip; 8201 macp->m_instance = (uint_t)-1; 8202 macp->m_driver = state; 8203 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 8204 macp->m_callbacks = &ibd_m_callbacks; 8205 macp->m_min_sdu = 0; 8206 if (state->id_enable_rc) { 8207 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 8208 } else { 8209 macp->m_max_sdu = IBD_DEF_MAX_SDU; 8210 } 8211 macp->m_priv_props = ibd_priv_props; 8212 8213 err = mac_register(macp, &state->id_mh); 8214 mac_free(macp); 8215 8216 if (err != 0) { 8217 DPRINT(10, "ibd_create_partition: mac_register() failed %d", 8218 err); 8219 rval = err; 8220 goto fail; 8221 } 8222 8223 err = dls_devnet_create(state->id_mh, 8224 cmd->ioc_partid, crgetzoneid(credp)); 8225 if (err != 0) { 8226 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed " 8227 "%d", err); 8228 rval = err; 8229 (void) mac_unregister(state->id_mh); 8230 goto fail; 8231 } 8232 8233 /* 8234 * Add the new partition state structure to the list 8235 */ 8236 mutex_enter(&ibd_objlist_lock); 8237 if (ibd_objlist_head) 8238 state->id_next = ibd_objlist_head; 8239 8240 ibd_objlist_head = state; 8241 mutex_exit(&ibd_objlist_lock); 8242 8243 part_create_return: 8244 if (pinfop) { 8245 ibt_free_portinfo(pinfop, pinfosz); 8246 } 8247 return (rval); 8248 8249 fail: 8250 if (pinfop) { 8251 ibt_free_portinfo(pinfop, pinfosz); 8252 } 8253 (void) ibd_part_unattach(state); 8254 kmem_free(state, sizeof 
(ibd_state_t)); 8255 return (rval); 8256 } 8257 8258 /* ARGSUSED */ 8259 static int 8260 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8261 int *rvalp) 8262 { 8263 int err; 8264 datalink_id_t tmpid; 8265 ibd_state_t *node, *prev; 8266 ibd_delete_ioctl_t *cmd = karg; 8267 8268 prev = NULL; 8269 8270 mutex_enter(&ibd_objlist_lock); 8271 node = ibd_objlist_head; 8272 8273 /* Find the ibd state structure corresponding the partion */ 8274 while (node != NULL) { 8275 if (node->id_plinkid == cmd->ioc_partid) 8276 break; 8277 prev = node; 8278 node = node->id_next; 8279 } 8280 8281 if (node == NULL) { 8282 mutex_exit(&ibd_objlist_lock); 8283 return (ENOENT); 8284 } 8285 8286 if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) { 8287 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed " 8288 "%d", err); 8289 mutex_exit(&ibd_objlist_lock); 8290 return (err); 8291 } 8292 8293 if ((err = mac_disable(node->id_mh)) != 0) { 8294 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid, 8295 crgetzoneid(credp)); 8296 mutex_exit(&ibd_objlist_lock); 8297 return (err); 8298 } 8299 8300 /* 8301 * Call ibd_part_unattach() only after making sure that the instance has 8302 * not been started yet and is also not in late hca init mode. 8303 */ 8304 ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8305 if ((node->id_mac_state & IBD_DRV_STARTED) || 8306 (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) || 8307 (ibd_part_unattach(node) != DDI_SUCCESS)) { 8308 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8309 mutex_exit(&ibd_objlist_lock); 8310 return (EBUSY); 8311 } 8312 node->id_mac_state |= IBD_DRV_IN_DELETION; 8313 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8314 8315 /* Remove the partition state structure from the linked list */ 8316 if (prev == NULL) 8317 ibd_objlist_head = node->id_next; 8318 else 8319 prev->id_next = node->id_next; 8320 mutex_exit(&ibd_objlist_lock); 8321 8322 if ((err = mac_unregister(node->id_mh)) != 0) { 8323 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d", 8324 err); 8325 } 8326 8327 cv_destroy(&node->id_macst_cv); 8328 mutex_destroy(&node->id_macst_lock); 8329 8330 kmem_free(node, sizeof (ibd_state_t)); 8331 8332 return (0); 8333 } 8334 8335 /* ARGSUSED */ 8336 static int 8337 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred, 8338 int *rvalp) 8339 { 8340 ibd_ioctl_t cmd; 8341 ibpart_ioctl_t partioc; 8342 ibport_ioctl_t portioc; 8343 #ifdef _MULTI_DATAMODEL 8344 ibport_ioctl32_t portioc32; 8345 #endif 8346 ibd_state_t *state, *port_state; 8347 int size; 8348 ibt_hca_portinfo_t *pinfop = NULL; 8349 ibt_status_t ibt_status; 8350 uint_t psize, pinfosz; 8351 int rval = 0; 8352 8353 size = sizeof (ibd_ioctl_t); 8354 if (ddi_copyin((void *)arg, &cmd, size, mode)) { 8355 return (EFAULT); 8356 } 8357 cmd.ioc_status = 0; 8358 switch (cmd.ioc_info_cmd) { 8359 case IBD_INFO_CMD_IBPART: 8360 size = sizeof (ibpart_ioctl_t); 8361 if (ddi_copyin((void *)arg, &partioc, size, mode)) { 8362 return (EFAULT); 8363 } 8364 8365 mutex_enter(&ibd_objlist_lock); 8366 /* Find the ibd state structure corresponding the partition */ 8367 for (state = ibd_objlist_head; state; state = state->id_next) { 8368 if (state->id_plinkid == cmd.ioc_linkid) { 8369 break; 8370 } 8371 } 8372 8373 if (state == NULL) { 8374 mutex_exit(&ibd_objlist_lock); 8375 return (ENOENT); 8376 } 8377 8378 partioc.ibdioc.ioc_linkid = state->id_dlinkid; 8379 partioc.ibdioc.ioc_port_inst = state->id_port_inst; 8380 
partioc.ibdioc.ioc_portnum = state->id_port; 8381 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid; 8382 partioc.ibdioc.ioc_portguid = state->id_port_guid; 8383 partioc.ibdioc.ioc_status = 0; 8384 partioc.ioc_partid = state->id_plinkid; 8385 partioc.ioc_pkey = state->id_pkey; 8386 partioc.ioc_force_create = state->id_force_create; 8387 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) { 8388 mutex_exit(&ibd_objlist_lock); 8389 return (EFAULT); 8390 } 8391 mutex_exit(&ibd_objlist_lock); 8392 8393 break; 8394 8395 case IBD_INFO_CMD_IBPORT: 8396 if ((cmd.ioc_port_inst < 0) || ((port_state = 8397 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8398 DPRINT(10, "ibd_create_partition: failed to get" 8399 " state %d", cmd.ioc_port_inst); 8400 size = sizeof (ibd_ioctl_t); 8401 cmd.ioc_status = IBD_INVALID_PORT_INST; 8402 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8403 mode)) { 8404 return (EFAULT); 8405 } 8406 return (EINVAL); 8407 } 8408 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8409 port_state->id_port, &pinfop, &psize, &pinfosz); 8410 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8411 return (EINVAL); 8412 } 8413 #ifdef _MULTI_DATAMODEL 8414 switch (ddi_model_convert_from(mode & FMODELS)) { 8415 case DDI_MODEL_ILP32: { 8416 size = sizeof (ibport_ioctl32_t); 8417 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8418 rval = EFAULT; 8419 goto fail; 8420 } 8421 portioc32.ibdioc.ioc_status = 0; 8422 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8423 portioc32.ibdioc.ioc_hcaguid = 8424 port_state->id_hca_guid; 8425 portioc32.ibdioc.ioc_portguid = 8426 port_state->id_port_guid; 8427 if (portioc32.ioc_pkey_tbl_sz != 8428 pinfop->p_pkey_tbl_sz) { 8429 rval = EINVAL; 8430 size = sizeof (ibd_ioctl_t); 8431 portioc32.ibdioc.ioc_status = 8432 IBD_INVALID_PKEY_TBL_SIZE; 8433 if (ddi_copyout((void *)&portioc32.ibdioc, 8434 (void *)arg, size, mode)) { 8435 rval = EFAULT; 8436 goto fail; 8437 } 8438 goto fail; 8439 } 8440 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8441 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8442 (void *)(uintptr_t)portioc32.ioc_pkeys, size, 8443 mode)) { 8444 rval = EFAULT; 8445 goto fail; 8446 } 8447 size = sizeof (ibport_ioctl32_t); 8448 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8449 mode)) { 8450 rval = EFAULT; 8451 goto fail; 8452 } 8453 break; 8454 } 8455 case DDI_MODEL_NONE: 8456 size = sizeof (ibport_ioctl_t); 8457 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8458 rval = EFAULT; 8459 goto fail; 8460 } 8461 portioc.ibdioc.ioc_status = 0; 8462 portioc.ibdioc.ioc_portnum = port_state->id_port; 8463 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8464 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8465 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8466 rval = EINVAL; 8467 size = sizeof (ibd_ioctl_t); 8468 portioc.ibdioc.ioc_status = 8469 IBD_INVALID_PKEY_TBL_SIZE; 8470 if (ddi_copyout((void *)&portioc.ibdioc, 8471 (void *)arg, size, mode)) { 8472 rval = EFAULT; 8473 goto fail; 8474 } 8475 goto fail; 8476 } 8477 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8478 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8479 (void *)(portioc.ioc_pkeys), size, mode)) { 8480 rval = EFAULT; 8481 goto fail; 8482 } 8483 size = sizeof (ibport_ioctl_t); 8484 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8485 mode)) { 8486 rval = EFAULT; 8487 goto fail; 8488 } 8489 break; 8490 } 8491 #else /* ! 
_MULTI_DATAMODEL */ 8492 size = sizeof (ibport_ioctl_t); 8493 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8494 rval = EFAULT; 8495 goto fail; 8496 } 8497 portioc.ibdioc.ioc_status = 0; 8498 portioc.ibdioc.ioc_portnum = port_state->id_port; 8499 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8500 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8501 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8502 rval = EINVAL; 8503 size = sizeof (ibd_ioctl_t); 8504 portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE; 8505 if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg, 8506 size, mode)) { 8507 rval = EFAULT; 8508 goto fail; 8509 } 8510 goto fail; 8511 } 8512 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8513 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8514 (void *)(portioc.ioc_pkeys), size, mode)) { 8515 rval = EFAULT; 8516 goto fail; 8517 } 8518 size = sizeof (ibport_ioctl_t); 8519 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8520 mode)) { 8521 rval = EFAULT; 8522 goto fail; 8523 } 8524 #endif /* _MULTI_DATAMODEL */ 8525 8526 break; 8527 8528 case IBD_INFO_CMD_PKEYTBLSZ: 8529 if ((cmd.ioc_port_inst < 0) || ((port_state = 8530 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8531 DPRINT(10, "ibd_create_partition: failed to get" 8532 " state %d", cmd.ioc_port_inst); 8533 size = sizeof (ibd_ioctl_t); 8534 cmd.ioc_status = IBD_INVALID_PORT_INST; 8535 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8536 mode)) { 8537 return (EFAULT); 8538 } 8539 return (EINVAL); 8540 } 8541 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8542 port_state->id_port, &pinfop, &psize, &pinfosz); 8543 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8544 return (EINVAL); 8545 } 8546 #ifdef _MULTI_DATAMODEL 8547 switch (ddi_model_convert_from(mode & FMODELS)) { 8548 case DDI_MODEL_ILP32: { 8549 size = sizeof (ibport_ioctl32_t); 8550 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8551 rval = EFAULT; 8552 goto fail; 8553 } 8554 portioc32.ibdioc.ioc_status = 0; 8555 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8556 portioc32.ibdioc.ioc_hcaguid = 8557 port_state->id_hca_guid; 8558 portioc32.ibdioc.ioc_portguid = 8559 port_state->id_port_guid; 8560 portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8561 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8562 mode)) { 8563 rval = EFAULT; 8564 goto fail; 8565 } 8566 break; 8567 } 8568 case DDI_MODEL_NONE: 8569 size = sizeof (ibport_ioctl_t); 8570 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8571 rval = EFAULT; 8572 goto fail; 8573 } 8574 portioc.ibdioc.ioc_status = 0; 8575 portioc.ibdioc.ioc_portnum = port_state->id_port; 8576 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8577 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8578 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8579 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8580 mode)) { 8581 rval = EFAULT; 8582 goto fail; 8583 } 8584 break; 8585 } 8586 #else /* ! 
_MULTI_DATAMODEL */ 8587 size = sizeof (ibport_ioctl_t); 8588 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8589 rval = EFAULT; 8590 goto fail; 8591 } 8592 portioc.ibdioc.ioc_status = 0; 8593 portioc.ibdioc.ioc_portnum = port_state->id_port; 8594 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8595 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8596 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8597 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8598 mode)) { 8599 rval = EFAULT; 8600 goto fail; 8601 } 8602 #endif /* _MULTI_DATAMODEL */ 8603 break; 8604 8605 default: 8606 return (EINVAL); 8607 8608 } /* switch (cmd.ioc_info_cmd) */ 8609 fail: 8610 if (pinfop) { 8611 ibt_free_portinfo(pinfop, pinfosz); 8612 } 8613 return (rval); 8614 } 8615 8616 /* ARGSUSED */ 8617 static void 8618 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl, 8619 ibt_async_code_t code, ibt_async_event_t *event) 8620 { 8621 ibd_state_t *state = (ibd_state_t *)arg; 8622 link_state_t lstate; 8623 8624 switch (code) { 8625 case IBT_EVENT_PORT_UP: 8626 case IBT_ERROR_PORT_DOWN: 8627 if (ibd_get_port_state(state, &lstate) != 0) 8628 break; 8629 8630 if (state->id_link_state != lstate) { 8631 state->id_link_state = lstate; 8632 mac_link_update(state->id_mh, lstate); 8633 } 8634 break; 8635 default: 8636 break; 8637 } 8638 } 8639 8640 static int 8641 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate) 8642 { 8643 ibt_hca_portinfo_t *port_infop; 8644 uint_t psize, port_infosz; 8645 ibt_status_t ret; 8646 8647 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 8648 &port_infop, &psize, &port_infosz); 8649 if ((ret != IBT_SUCCESS) || (psize != 1)) 8650 return (-1); 8651 8652 state->id_sgid = *port_infop->p_sgid_tbl; 8653 state->id_link_speed = ibd_get_portspeed(state); 8654 8655 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) 8656 *lstate = LINK_STATE_UP; 8657 else 8658 *lstate = LINK_STATE_DOWN; 8659 8660 ibt_free_portinfo(port_infop, port_infosz); 8661 return (0); 8662 } 8663 8664 static int 8665 ibd_port_attach(dev_info_t *dip) 8666 { 8667 ibd_state_t *state; 8668 link_state_t lstate; 8669 int instance; 8670 ibt_status_t ret; 8671 8672 /* 8673 * Allocate softstate structure 8674 */ 8675 instance = ddi_get_instance(dip); 8676 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) { 8677 DPRINT(10, "ibd_attach: ddi_soft_state_zalloc() failed"); 8678 return (DDI_FAILURE); 8679 } 8680 8681 state = ddi_get_soft_state(ibd_list, instance); 8682 8683 state->id_dip = dip; 8684 state->id_type = IBD_PORT_DRIVER; 8685 8686 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 8687 "port-number", 0)) == 0) { 8688 DPRINT(10, "ibd_attach: invalid port number (%d)", 8689 state->id_port); 8690 return (DDI_FAILURE); 8691 } 8692 if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8693 "hca-guid", 0)) == 0) { 8694 DPRINT(10, "ibd_attach: hca has invalid guid (0x%llx)", 8695 state->id_hca_guid); 8696 return (DDI_FAILURE); 8697 } 8698 if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8699 "port-guid", 0)) == 0) { 8700 DPRINT(10, "ibd_attach: port has invalid guid (0x%llx)", 8701 state->id_port_guid); 8702 return (DDI_FAILURE); 8703 } 8704 8705 /* 8706 * Attach to IBTL 8707 */ 8708 if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state, 8709 &state->id_ibt_hdl)) != IBT_SUCCESS) { 8710 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 8711 goto done; 8712 } 8713 8714 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 8715 8716 if ((ret = 
ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 8717 &state->id_hca_hdl)) != IBT_SUCCESS) { 8718 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 8719 goto done; 8720 } 8721 state->id_mac_state |= IBD_DRV_HCA_OPENED; 8722 8723 /* Update link status */ 8724 8725 if (ibd_get_port_state(state, &lstate) != 0) { 8726 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 8727 goto done; 8728 } 8729 state->id_link_state = lstate; 8730 /* 8731 * Register ibd interfaces with the Nemo framework 8732 */ 8733 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 8734 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 8735 goto done; 8736 } 8737 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 8738 8739 mac_link_update(state->id_mh, lstate); 8740 8741 return (DDI_SUCCESS); 8742 done: 8743 (void) ibd_port_unattach(state, dip); 8744 return (DDI_FAILURE); 8745 } 8746 8747 static int 8748 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip) 8749 { 8750 int instance; 8751 uint32_t progress = state->id_mac_state; 8752 ibt_status_t ret; 8753 8754 if (progress & IBD_DRV_MAC_REGISTERED) { 8755 (void) mac_unregister(state->id_mh); 8756 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 8757 } 8758 8759 if (progress & IBD_DRV_HCA_OPENED) { 8760 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 8761 IBT_SUCCESS) { 8762 ibd_print_warn(state, "failed to close " 8763 "HCA device, ret=%d", ret); 8764 } 8765 state->id_hca_hdl = NULL; 8766 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 8767 } 8768 8769 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 8770 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 8771 ibd_print_warn(state, 8772 "ibt_detach() failed, ret=%d", ret); 8773 } 8774 state->id_ibt_hdl = NULL; 8775 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 8776 } 8777 instance = ddi_get_instance(dip); 8778 ddi_soft_state_free(ibd_list, instance); 8779 8780 return (DDI_SUCCESS); 8781 } 8782 8783 ibt_status_t 8784 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr) 8785 { 8786 ibd_state_t *state; 8787 8788 mutex_enter(&ibd_objlist_lock); 8789 8790 /* Find the ibd state structure corresponding the partition */ 8791 for (state = ibd_objlist_head; state; state = state->id_next) { 8792 if (state->id_plinkid == linkid) { 8793 break; 8794 } 8795 } 8796 8797 if (state == NULL) { 8798 mutex_exit(&ibd_objlist_lock); 8799 return (IBT_NO_SUCH_OBJECT); 8800 } 8801 8802 attr->pa_dlinkid = state->id_dlinkid; 8803 attr->pa_plinkid = state->id_plinkid; 8804 attr->pa_port = state->id_port; 8805 attr->pa_hca_guid = state->id_hca_guid; 8806 attr->pa_port_guid = state->id_port_guid; 8807 attr->pa_pkey = state->id_pkey; 8808 8809 mutex_exit(&ibd_objlist_lock); 8810 8811 return (IBT_SUCCESS); 8812 } 8813 8814 ibt_status_t 8815 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts) 8816 { 8817 ibd_state_t *state; 8818 int n = 0; 8819 ibt_part_attr_t *attr; 8820 8821 mutex_enter(&ibd_objlist_lock); 8822 8823 for (state = ibd_objlist_head; state; state = state->id_next) 8824 n++; 8825 8826 *nparts = n; 8827 if (n == 0) { 8828 *attr_list = NULL; 8829 mutex_exit(&ibd_objlist_lock); 8830 return (IBT_SUCCESS); 8831 } 8832 8833 *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP); 8834 attr = *attr_list; 8835 for (state = ibd_objlist_head; state; state = state->id_next) { 8836 #ifdef DEBUG 8837 ASSERT(n > 0); 8838 n--; 8839 #endif 8840 attr->pa_dlinkid = state->id_dlinkid; 8841 attr->pa_plinkid = state->id_plinkid; 8842 attr->pa_port = state->id_port; 8843 attr->pa_hca_guid = 
state->id_hca_guid; 8844 attr->pa_port_guid = state->id_port_guid; 8845 attr->pa_pkey = state->id_pkey; 8846 attr++; 8847 } 8848 8849 mutex_exit(&ibd_objlist_lock); 8850 return (IBT_SUCCESS); 8851 } 8852