/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip6.h>		/* for ip6_t */
#include <inet/tcp.h>		/* for tcph_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

#include <sys/priv_names.h>
#include <sys/dls.h>
#include <sys/dld_ioc.h>
#include <sys/policy.h>
#include <sys/ibpart.h>
#include <sys/file.h>

/*
 * The write-up below includes details on the following:
 * 1. The dladm administrative model.
 * 2. Late HCA initialization feature.
 * 3. Brussels support and its implications for the current architecture.
 *
 * 1. The dladm administrative model.
 * ------------------------------------------
 * With the dladm model, ibnex will create one ibd instance per port. These
 * instances will be created independent of the port state.
 *
 * The ibd driver is two-faceted: one side of it works as the port driver and
 * the other as the partition object driver.
 *
 * The port instance is a child of the HCA, and will have an entry in the devfs.
 * A DDI attach only happens for the port driver, and its attach is
 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
 * handled in ibd_port_unattach().
 *
 * The partition object is only a registrant to the mac layer via mac_register()
 * and does not have an entry in the device tree. There is no DDI softstate
 * managed by the DDI framework for the partition objects.
 * However, the state is managed inside the ibd driver, and every partition
 * object hangs off the "ibd_objlist_head".
 *
 * The partition object first comes into existence when a user runs the
 * 'create-part' subcommand of dladm. This is like invoking the attach entry
 * point of the partition object. The partition object goes away with the
 * 'delete-part' subcommand of dladm. This is like invoking the detach entry
 * point of the partition object.
 *
 * The create-part and delete-part subcommands result in dld ioctls that end up
 * calling ibd_create_partition() and ibd_delete_partition(), respectively.
 * These ioctls are registered with the dld layer in _init() via a call to
 * dld_ioc_register().
 *
 * The port instance by itself cannot be plumbed. Only the partition objects
 * can be plumbed, and they alone participate in I/O, not the port driver.
 *
 * There are some info ioctls supported in ibd which are used by dladm(1M) to
 * display useful information. The info entry point for ibd is
 * ibd_get_partition_info().
 *
 * 2. Late HCA initialization feature.
 * ------------------------------------
 * As mentioned in section 1, the user creates the partition objects via
 * dladm(1M). It is possible that:
 * a) The physical port itself is down and the SM cannot be reached.
 * b) The PKEY specified by the user has not been created in the SM yet.
 * c) An IPoIB broadcast group for the specified PKEY is not present.
 *
 * In all of the above cases, complete initialization of the partition object
 * is not possible. However, the new model allows the creation of partition
 * objects even in such cases but will defer the initialization for later.
 * When such a partition object is plumbed, the link state will be displayed as
 * "down".
 * The driver, at this point, is listening for events that herald the
 * availability of resources -
 * i)   LINK_UP when the link becomes available
 * ii)  PORT_CHANGE when the PKEY has been created
 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
 *      created
 * via ibd_async_handler() for events i) and ii), and via
 * ibd_snet_notices_handler() for iii).
 * The driver handles these events (as and when they arrive), completes the
 * initialization of the partition object, and transitions it to a usable
 * state.
 *
 * 3. Brussels support and its implications for the current architecture.
 * ---------------------------------------------------------------------
 * The Brussels support introduces two new interfaces to the ibd driver -
 * ibd_m_getprop() and ibd_m_setprop().
 * These interfaces allow setting and retrieval of certain properties.
 * Some of them are public properties while most others are private properties
 * meant to be used by developers. Tuning the latter kind can cause
 * performance issues and should not be done without understanding the
 * implications. All properties are specific to an instance of either the
 * partition object or the port driver.
 *
 * The public properties are: mtu and linkmode.
 * mtu is a read-only property.
 * linkmode can take two values - UD and CM.
 *
 * Changing the linkmode requires some bookkeeping in the driver. The
 * capabilities need to be re-reported to the mac layer. This is done by
 * calling mac_capab_update(). The maxsdu is updated by calling
 * mac_maxsdu_update().
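 *
 * A minimal sketch of that bookkeeping, assuming a hypothetical local
 * variable enable_rc that holds the newly requested linkmode and using
 * id_mh for the handle obtained from mac_register(); the real work is
 * done in the property-set path (ibd_m_setprop()):
 *
 *	state->id_enable_rc = enable_rc;
 *	state->id_mtu = enable_rc ? IBD_DEF_RC_MAX_MTU : IBD_DEF_MAX_MTU;
 *	mac_capab_update(state->id_mh);
 *	(void) mac_maxsdu_update(state->id_mh,
 *	    state->id_mtu - IPOIB_HDRSIZE);
 *
 * mac_capab_update() tells the mac layer that the advertised capabilities
 * have changed, and mac_maxsdu_update() propagates the new maximum SDU.
 *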
 * The private properties retain their values across the change of linkmode.
 * NOTE:
 * - The port driver does not support any property apart from mtu.
 * - All other properties are only meant for the partition object.
 * - The properties cannot be set when an instance is plumbed. The
 *   instance has to be unplumbed to effect any setting.
 */

/*
 * Driver wide tunables
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *	The softintr mechanism allows ibd to avoid event queue overflows if
 *	the receive/completion handlers turn out to be expensive. These are
 *	enabled by default.
 *
 * ibd_log_sz
 *	This specifies the size of the ibd log buffer in bytes. The buffer is
 *	allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;

#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#ifdef IBD_LOGGING
#define	IBD_LOG_SZ			ibd_log_sz
#endif

/* Post IBD_RX_POST_CNT receive work requests at a time. */
#define	IBD_RX_POST_CNT			8

/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
#define	IBD_LOG_RX_POST			4

/* Minimum number of receive work requests the driver needs to always have */
#define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)

/*
 * LSO parameters
 */
#define	IBD_LSO_MAXLEN			65536
#define	IBD_LSO_BUFSZ			8192

/*
 * Async operation states
 */
#define	IBD_OP_NOTSTARTED		0
#define	IBD_OP_ONGOING			1
#define	IBD_OP_COMPLETED		2
#define	IBD_OP_ERRORED			3
#define	IBD_OP_ROUTERED			4

/*
 * State of IBD driver initialization during attach/m_start
 */
#define	IBD_DRV_STATE_INITIALIZED	0x000001
#define	IBD_DRV_RXINTR_ADDED		0x000002
#define	IBD_DRV_TXINTR_ADDED		0x000004
#define	IBD_DRV_IBTL_ATTACH_DONE	0x000008
#define	IBD_DRV_HCA_OPENED		0x000010
#define	IBD_DRV_PD_ALLOCD		0x000020
#define	IBD_DRV_MAC_REGISTERED		0x000040
#define	IBD_DRV_PORT_DETAILS_OBTAINED	0x000080
#define	IBD_DRV_BCAST_GROUP_FOUND	0x000100
#define	IBD_DRV_ACACHE_INITIALIZED	0x000200
#define	IBD_DRV_CQS_ALLOCD		0x000400
#define	IBD_DRV_UD_CHANNEL_SETUP	0x000800
#define	IBD_DRV_TXLIST_ALLOCD		0x001000
#define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x002000
#define	IBD_DRV_RXLIST_ALLOCD		0x004000
#define	IBD_DRV_BCAST_GROUP_JOINED	0x008000
#define	IBD_DRV_ASYNC_THR_CREATED	0x010000
#define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x020000
#define	IBD_DRV_SM_NOTICES_REGISTERED	0x040000
#define	IBD_DRV_STARTED			0x080000
#define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
#define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
#define	IBD_DRV_RC_LISTEN		0x400000
#ifdef DEBUG
#define	IBD_DRV_RC_PRIVATE_STATE	0x800000
#endif
#define	IBD_DRV_IN_DELETION		0x1000000
#define	IBD_DRV_IN_LATE_HCA_INIT	0x2000000
#define	IBD_DRV_REQ_LIST_INITED		0x4000000

/*
 * Start/stop in-progress flags; note that restart must always remain
 * the OR of start and stop flag values.
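 * For example, with the values defined just below,
 * IBD_DRV_RESTART_IN_PROGRESS (0x30000000) is exactly
 * IBD_DRV_START_IN_PROGRESS | IBD_DRV_STOP_IN_PROGRESS
 * (0x10000000 | 0x20000000); a restart is treated as both a start and
 * a stop being in progress.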
252 */ 253 #define IBD_DRV_START_IN_PROGRESS 0x10000000 254 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 255 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 256 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS 257 258 /* 259 * Miscellaneous constants 260 */ 261 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 262 #define IBD_DEF_MAX_SDU 2044 263 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE) 264 #define IBD_DEF_RC_MAX_SDU 65520 265 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE) 266 #define IBD_DEFAULT_QKEY 0xB1B 267 #ifdef IBD_LOGGING 268 #define IBD_DMAX_LINE 100 269 #endif 270 271 /* 272 * Enumerations for link states 273 */ 274 typedef enum { 275 IBD_LINK_DOWN, 276 IBD_LINK_UP, 277 IBD_LINK_UP_ABSENT 278 } ibd_link_op_t; 279 280 /* 281 * Driver State Pointer 282 */ 283 void *ibd_list; 284 285 /* 286 * Driver Global Data 287 */ 288 ibd_global_state_t ibd_gstate; 289 290 /* 291 * Partition object list 292 */ 293 ibd_state_t *ibd_objlist_head = NULL; 294 kmutex_t ibd_objlist_lock; 295 296 /* 297 * Logging 298 */ 299 #ifdef IBD_LOGGING 300 kmutex_t ibd_lbuf_lock; 301 uint8_t *ibd_lbuf; 302 uint32_t ibd_lbuf_ndx; 303 #endif 304 305 /* 306 * Required system entry points 307 */ 308 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 309 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 310 311 /* 312 * Required driver entry points for GLDv3 313 */ 314 static int ibd_m_stat(void *, uint_t, uint64_t *); 315 static int ibd_m_start(void *); 316 static void ibd_m_stop(void *); 317 static int ibd_m_promisc(void *, boolean_t); 318 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 319 static int ibd_m_unicst(void *, const uint8_t *); 320 static mblk_t *ibd_m_tx(void *, mblk_t *); 321 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 322 323 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 324 const void *); 325 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 326 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t, 327 mac_prop_info_handle_t); 328 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t, 329 const void *); 330 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *); 331 332 /* 333 * Private driver entry points for GLDv3 334 */ 335 336 /* 337 * Initialization 338 */ 339 static int ibd_state_init(ibd_state_t *, dev_info_t *); 340 static int ibd_init_txlist(ibd_state_t *); 341 static int ibd_init_rxlist(ibd_state_t *); 342 static int ibd_acache_init(ibd_state_t *); 343 #ifdef IBD_LOGGING 344 static void ibd_log_init(void); 345 #endif 346 347 /* 348 * Termination/cleanup 349 */ 350 static void ibd_state_fini(ibd_state_t *); 351 static void ibd_fini_txlist(ibd_state_t *); 352 static void ibd_fini_rxlist(ibd_state_t *); 353 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 354 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 355 static void ibd_acache_fini(ibd_state_t *); 356 #ifdef IBD_LOGGING 357 static void ibd_log_fini(void); 358 #endif 359 360 /* 361 * Allocation/acquire/map routines 362 */ 363 static int ibd_alloc_tx_copybufs(ibd_state_t *); 364 static int ibd_alloc_rx_copybufs(ibd_state_t *); 365 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 366 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 367 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 368 uint32_t *); 369 370 /* 371 * Free/release/unmap routines 372 */ 373 static void ibd_free_rwqe(ibd_state_t *, 
ibd_rwqe_t *); 374 static void ibd_free_tx_copybufs(ibd_state_t *); 375 static void ibd_free_rx_copybufs(ibd_state_t *); 376 static void ibd_free_rx_rsrcs(ibd_state_t *); 377 static void ibd_free_tx_lsobufs(ibd_state_t *); 378 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 379 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 380 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 381 382 /* 383 * Handlers/callback routines 384 */ 385 static uint_t ibd_intr(caddr_t); 386 static uint_t ibd_tx_recycle(caddr_t); 387 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 388 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 389 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 390 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 391 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 392 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 393 static void ibd_freemsg_cb(char *); 394 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 395 ibt_async_event_t *); 396 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 397 ibt_async_event_t *); 398 static void ibd_snet_notices_handler(void *, ib_gid_t, 399 ibt_subnet_event_code_t, ibt_subnet_event_t *); 400 401 /* 402 * Send/receive routines 403 */ 404 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 405 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 406 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 407 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 408 409 /* 410 * Threads 411 */ 412 static void ibd_async_work(ibd_state_t *); 413 414 /* 415 * Async tasks 416 */ 417 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 418 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 419 static void ibd_async_setprom(ibd_state_t *); 420 static void ibd_async_unsetprom(ibd_state_t *); 421 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 422 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 423 static void ibd_async_txsched(ibd_state_t *); 424 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 425 426 /* 427 * Async task helpers 428 */ 429 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 430 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 431 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 432 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 433 ipoib_mac_t *, ipoib_mac_t *); 434 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 435 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 436 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 437 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 438 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 439 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 440 static uint64_t ibd_get_portspeed(ibd_state_t *); 441 static boolean_t ibd_async_safe(ibd_state_t *); 442 static void ibd_async_done(ibd_state_t *); 443 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 444 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 445 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 446 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 447 448 /* 449 * Helpers for attach/start routines 450 */ 451 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 452 static int ibd_record_capab(ibd_state_t *); 453 static int 
ibd_get_port_details(ibd_state_t *); 454 static int ibd_alloc_cqs(ibd_state_t *); 455 static int ibd_setup_ud_channel(ibd_state_t *); 456 static int ibd_start(ibd_state_t *); 457 static int ibd_undo_start(ibd_state_t *, link_state_t); 458 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 459 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 460 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip); 461 static void ibd_part_unattach(ibd_state_t *state); 462 static int ibd_port_attach(dev_info_t *); 463 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip); 464 static int ibd_get_port_state(ibd_state_t *, link_state_t *); 465 static int ibd_part_busy(ibd_state_t *); 466 467 /* 468 * Miscellaneous helpers 469 */ 470 static int ibd_sched_poll(ibd_state_t *, int, int); 471 static void ibd_resume_transmission(ibd_state_t *); 472 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 473 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 474 static void *list_get_head(list_t *); 475 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 476 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 477 478 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *); 479 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *); 480 481 #ifdef IBD_LOGGING 482 static void ibd_log(const char *, ...); 483 #endif 484 485 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 486 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 487 488 /* Module Driver Info */ 489 static struct modldrv ibd_modldrv = { 490 &mod_driverops, /* This one is a driver */ 491 "InfiniBand GLDv3 Driver", /* short description */ 492 &ibd_dev_ops /* driver specific ops */ 493 }; 494 495 /* Module Linkage */ 496 static struct modlinkage ibd_modlinkage = { 497 MODREV_1, (void *)&ibd_modldrv, NULL 498 }; 499 500 /* 501 * Module (static) info passed to IBTL during ibt_attach 502 */ 503 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 504 IBTI_V_CURR, 505 IBT_NETWORK, 506 ibd_async_handler, 507 NULL, 508 "IBPART" 509 }; 510 511 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = { 512 IBTI_V_CURR, 513 IBT_NETWORK, 514 ibdpd_async_handler, 515 NULL, 516 "IPIB" 517 }; 518 519 /* 520 * GLDv3 entry points 521 */ 522 #define IBD_M_CALLBACK_FLAGS \ 523 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) 524 525 static mac_callbacks_t ibd_m_callbacks = { 526 IBD_M_CALLBACK_FLAGS, 527 ibd_m_stat, 528 ibd_m_start, 529 ibd_m_stop, 530 ibd_m_promisc, 531 ibd_m_multicst, 532 ibd_m_unicst, 533 ibd_m_tx, 534 NULL, 535 NULL, 536 ibd_m_getcapab, 537 NULL, 538 NULL, 539 ibd_m_setprop, 540 ibd_m_getprop, 541 ibd_m_propinfo 542 }; 543 544 /* Private properties */ 545 char *ibd_priv_props[] = { 546 "_ibd_broadcast_group", 547 "_ibd_coalesce_completions", 548 "_ibd_create_broadcast_group", 549 "_ibd_hash_size", 550 "_ibd_lso_enable", 551 "_ibd_num_ah", 552 "_ibd_num_lso_bufs", 553 "_ibd_rc_enable_srq", 554 "_ibd_rc_num_rwqe", 555 "_ibd_rc_num_srq", 556 "_ibd_rc_num_swqe", 557 "_ibd_rc_rx_comp_count", 558 "_ibd_rc_rx_comp_usec", 559 "_ibd_rc_rx_copy_thresh", 560 "_ibd_rc_rx_rwqe_thresh", 561 "_ibd_rc_tx_comp_count", 562 "_ibd_rc_tx_comp_usec", 563 "_ibd_rc_tx_copy_thresh", 564 "_ibd_ud_num_rwqe", 565 "_ibd_ud_num_swqe", 566 "_ibd_ud_rx_comp_count", 567 "_ibd_ud_rx_comp_usec", 568 "_ibd_ud_tx_comp_count", 569 "_ibd_ud_tx_comp_usec", 570 "_ibd_ud_tx_copy_thresh", 571 NULL 572 }; 573 574 static int ibd_create_partition(void *, intptr_t, 
int, cred_t *, int *); 575 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *); 576 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *); 577 578 static dld_ioc_info_t ibd_dld_ioctl_list[] = { 579 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t), 580 ibd_create_partition, secpolicy_dl_config}, 581 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t), 582 ibd_delete_partition, secpolicy_dl_config}, 583 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t), 584 ibd_get_partition_info, NULL} 585 }; 586 587 /* 588 * Fill/clear <scope> and <p_key> in multicast/broadcast address 589 */ 590 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 591 { \ 592 *(uint32_t *)((char *)(maddr) + 4) |= \ 593 htonl((uint32_t)(scope) << 16); \ 594 *(uint32_t *)((char *)(maddr) + 8) |= \ 595 htonl((uint32_t)(pkey) << 16); \ 596 } 597 598 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 599 { \ 600 *(uint32_t *)((char *)(maddr) + 4) &= \ 601 htonl(~((uint32_t)0xF << 16)); \ 602 *(uint32_t *)((char *)(maddr) + 8) &= \ 603 htonl(~((uint32_t)0xFFFF << 16)); \ 604 } 605 606 /* 607 * Rudimentary debugging support 608 */ 609 #ifdef DEBUG 610 int ibd_debuglevel = 100; 611 void 612 debug_print(int l, char *fmt, ...) 613 { 614 va_list ap; 615 616 if (l < ibd_debuglevel) 617 return; 618 va_start(ap, fmt); 619 vcmn_err(CE_CONT, fmt, ap); 620 va_end(ap); 621 } 622 #endif 623 624 /* 625 * Common routine to print warning messages; adds in hca guid, port number 626 * and pkey to be able to identify the IBA interface. 627 */ 628 void 629 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 630 { 631 ib_guid_t hca_guid; 632 char ibd_print_buf[256]; 633 int len; 634 va_list ap; 635 636 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 637 0, "hca-guid", 0); 638 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 639 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 640 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 641 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 642 va_start(ap, fmt); 643 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 644 fmt, ap); 645 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 646 va_end(ap); 647 } 648 649 /* 650 * Warlock directives 651 */ 652 653 /* 654 * id_lso_lock 655 * 656 * state->id_lso->bkt_nfree may be accessed without a lock to 657 * determine the threshold at which we have to ask the nw layer 658 * to resume transmission (see ibd_resume_transmission()). 
659 */ 660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 661 ibd_state_t::id_lso)) 662 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 663 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 664 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 665 666 /* 667 * id_scq_poll_lock 668 */ 669 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 670 ibd_state_t::id_scq_poll_busy)) 671 672 /* 673 * id_txpost_lock 674 */ 675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 676 ibd_state_t::id_tx_head)) 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 678 ibd_state_t::id_tx_busy)) 679 680 /* 681 * id_acache_req_lock 682 */ 683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 684 ibd_state_t::id_acache_req_cv)) 685 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 686 ibd_state_t::id_req_list)) 687 _NOTE(SCHEME_PROTECTS_DATA("atomic", 688 ibd_acache_s::ac_ref)) 689 690 /* 691 * id_ac_mutex 692 * 693 * This mutex is actually supposed to protect id_ah_op as well, 694 * but this path of the code isn't clean (see update of id_ah_op 695 * in ibd_async_acache(), immediately after the call to 696 * ibd_async_mcache()). For now, we'll skip this check by 697 * declaring that id_ah_op is protected by some internal scheme 698 * that warlock isn't aware of. 699 */ 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 701 ibd_state_t::id_ah_active)) 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 703 ibd_state_t::id_ah_free)) 704 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 705 ibd_state_t::id_ah_addr)) 706 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 707 ibd_state_t::id_ah_op)) 708 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 709 ibd_state_t::id_ah_error)) 710 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 711 ibd_state_t::id_ac_hot_ace)) 712 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 713 714 /* 715 * id_mc_mutex 716 */ 717 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 718 ibd_state_t::id_mc_full)) 719 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 720 ibd_state_t::id_mc_non)) 721 722 /* 723 * id_trap_lock 724 */ 725 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 726 ibd_state_t::id_trap_cv)) 727 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 728 ibd_state_t::id_trap_stop)) 729 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 730 ibd_state_t::id_trap_inprog)) 731 732 /* 733 * id_prom_op 734 */ 735 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 736 ibd_state_t::id_prom_op)) 737 738 /* 739 * id_sched_lock 740 */ 741 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 742 ibd_state_t::id_sched_needed)) 743 744 /* 745 * id_link_mutex 746 */ 747 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 748 ibd_state_t::id_link_state)) 749 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 750 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 751 ibd_state_t::id_link_speed)) 752 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 753 754 /* 755 * id_tx_list.dl_mutex 756 */ 757 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 758 ibd_state_t::id_tx_list.dl_head)) 759 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 760 ibd_state_t::id_tx_list.dl_pending_sends)) 761 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 762 ibd_state_t::id_tx_list.dl_cnt)) 763 764 /* 765 * id_rx_list.dl_mutex 766 */ 767 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 768 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 769 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 770 ibd_state_t::id_rx_list.dl_cnt)) 771 772 773 /* 774 * Items protected by atomic updates 775 */ 776 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 777 ibd_state_s::id_brd_rcv 778 ibd_state_s::id_brd_xmt 779 ibd_state_s::id_multi_rcv 780 ibd_state_s::id_multi_xmt 781 ibd_state_s::id_num_intrs 782 ibd_state_s::id_rcv_bytes 783 ibd_state_s::id_rcv_pkt 784 ibd_state_s::id_rx_post_queue_index 785 ibd_state_s::id_tx_short 786 ibd_state_s::id_xmt_bytes 787 ibd_state_s::id_xmt_pkt 788 ibd_state_s::rc_rcv_trans_byte 789 ibd_state_s::rc_rcv_trans_pkt 790 ibd_state_s::rc_rcv_copy_byte 791 ibd_state_s::rc_rcv_copy_pkt 792 ibd_state_s::rc_xmt_bytes 793 ibd_state_s::rc_xmt_small_pkt 794 ibd_state_s::rc_xmt_fragmented_pkt 795 ibd_state_s::rc_xmt_map_fail_pkt 796 ibd_state_s::rc_xmt_map_succ_pkt)) 797 798 /* 799 * Non-mutex protection schemes for data elements. Almost all of 800 * these are non-shared items. 801 */ 802 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 803 callb_cpr 804 ib_gid_s 805 ib_header_info 806 ibd_acache_rq 807 ibd_acache_s::ac_mce 808 ibd_acache_s::ac_chan 809 ibd_mcache::mc_fullreap 810 ibd_mcache::mc_jstate 811 ibd_mcache::mc_req 812 ibd_rwqe_s 813 ibd_swqe_s 814 ibd_wqe_s 815 ibt_wr_ds_s::ds_va 816 ibt_wr_lso_s 817 ipoib_mac::ipoib_qpn 818 mac_capab_lso_s 819 msgb::b_next 820 msgb::b_cont 821 msgb::b_rptr 822 msgb::b_wptr 823 ibd_state_s::id_bgroup_created 824 ibd_state_s::id_mac_state 825 ibd_state_s::id_mtu 826 ibd_state_s::id_ud_num_rwqe 827 ibd_state_s::id_ud_num_swqe 828 ibd_state_s::id_qpnum 829 ibd_state_s::id_rcq_hdl 830 ibd_state_s::id_rx_buf_sz 831 ibd_state_s::id_rx_bufs 832 ibd_state_s::id_rx_mr_hdl 833 ibd_state_s::id_rx_wqes 834 ibd_state_s::id_rxwcs 835 ibd_state_s::id_rxwcs_size 836 ibd_state_s::id_rx_nqueues 837 ibd_state_s::id_rx_queues 838 ibd_state_s::id_scope 839 ibd_state_s::id_scq_hdl 840 ibd_state_s::id_tx_buf_sz 841 ibd_state_s::id_tx_bufs 842 ibd_state_s::id_tx_mr_hdl 843 ibd_state_s::id_tx_rel_list.dl_cnt 844 ibd_state_s::id_tx_wqes 845 ibd_state_s::id_txwcs 846 ibd_state_s::id_txwcs_size 847 ibd_state_s::rc_listen_hdl 848 ibd_state_s::rc_listen_hdl_OFED_interop 849 ibd_state_s::rc_srq_size 850 ibd_state_s::rc_srq_rwqes 851 ibd_state_s::rc_srq_rx_bufs 852 ibd_state_s::rc_srq_rx_mr_hdl 853 ibd_state_s::rc_tx_largebuf_desc_base 854 ibd_state_s::rc_tx_mr_bufs 855 ibd_state_s::rc_tx_mr_hdl 856 ipha_s 857 icmph_s 858 ibt_path_info_s::pi_sid 859 ibd_rc_chan_s::ace 860 ibd_rc_chan_s::chan_hdl 861 ibd_rc_chan_s::state 862 ibd_rc_chan_s::chan_state 863 ibd_rc_chan_s::is_tx_chan 864 ibd_rc_chan_s::rcq_hdl 865 ibd_rc_chan_s::rcq_size 866 ibd_rc_chan_s::scq_hdl 867 ibd_rc_chan_s::scq_size 868 ibd_rc_chan_s::requester_gid 869 ibd_rc_chan_s::requester_pkey 870 ibd_rc_chan_s::rx_bufs 871 ibd_rc_chan_s::rx_mr_hdl 872 ibd_rc_chan_s::rx_rwqes 873 ibd_rc_chan_s::tx_wqes 874 ibd_rc_chan_s::tx_mr_bufs 875 ibd_rc_chan_s::tx_mr_hdl 876 ibd_rc_chan_s::tx_rel_list.dl_cnt 877 ibd_rc_chan_s::tx_trans_error_cnt 878 ibd_rc_tx_largebuf_s::lb_buf 879 ibd_rc_msg_hello_s 880 ibt_cm_return_args_s)) 881 882 /* 883 * ibd_rc_chan_s::next is protected by two mutexes: 884 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex 885 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. 
886 */ 887 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes", 888 ibd_rc_chan_s::next)) 889 890 /* 891 * ibd_state_s.rc_tx_large_bufs_lock 892 */ 893 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 894 ibd_state_s::rc_tx_largebuf_free_head)) 895 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 896 ibd_state_s::rc_tx_largebuf_nfree)) 897 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 898 ibd_rc_tx_largebuf_s::lb_next)) 899 900 /* 901 * ibd_acache_s.tx_too_big_mutex 902 */ 903 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex, 904 ibd_acache_s::tx_too_big_ongoing)) 905 906 /* 907 * tx_wqe_list.dl_mutex 908 */ 909 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 910 ibd_rc_chan_s::tx_wqe_list.dl_head)) 911 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 912 ibd_rc_chan_s::tx_wqe_list.dl_pending_sends)) 913 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 914 ibd_rc_chan_s::tx_wqe_list.dl_cnt)) 915 916 /* 917 * ibd_state_s.rc_ace_recycle_lock 918 */ 919 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock, 920 ibd_state_s::rc_ace_recycle)) 921 922 /* 923 * rc_srq_rwqe_list.dl_mutex 924 */ 925 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 926 ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding)) 927 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 928 ibd_state_t::rc_srq_rwqe_list.dl_cnt)) 929 930 /* 931 * Non-mutex protection schemes for data elements. They are counters 932 * for problem diagnosis. Don't need be protected. 933 */ 934 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 935 ibd_state_s::rc_rcv_alloc_fail 936 ibd_state_s::rc_rcq_invoke 937 ibd_state_s::rc_rcq_err 938 ibd_state_s::rc_ace_not_found 939 ibd_state_s::rc_xmt_drop_too_long_pkt 940 ibd_state_s::rc_xmt_icmp_too_long_pkt 941 ibd_state_s::rc_xmt_reenter_too_long_pkt 942 ibd_state_s::rc_swqe_short 943 ibd_state_s::rc_swqe_mac_update 944 ibd_state_s::rc_xmt_buf_short 945 ibd_state_s::rc_xmt_buf_mac_update 946 ibd_state_s::rc_scq_no_swqe 947 ibd_state_s::rc_scq_no_largebuf 948 ibd_state_s::rc_scq_invoke 949 ibd_state_s::rc_conn_succ 950 ibd_state_s::rc_conn_fail 951 ibd_state_s::rc_null_conn 952 ibd_state_s::rc_no_estab_conn 953 ibd_state_s::rc_act_close 954 ibd_state_s::rc_pas_close 955 ibd_state_s::rc_delay_ace_recycle 956 ibd_state_s::rc_act_close_simultaneous 957 ibd_state_s::rc_reset_cnt)) 958 959 #ifdef DEBUG 960 /* 961 * Non-mutex protection schemes for data elements. They are counters 962 * for problem diagnosis. Don't need be protected. 
963 */ 964 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 965 ibd_state_s::rc_rwqe_short 966 ibd_rc_stat_s::rc_rcv_trans_byte 967 ibd_rc_stat_s::rc_rcv_trans_pkt 968 ibd_rc_stat_s::rc_rcv_copy_byte 969 ibd_rc_stat_s::rc_rcv_copy_pkt 970 ibd_rc_stat_s::rc_rcv_alloc_fail 971 ibd_rc_stat_s::rc_rcq_invoke 972 ibd_rc_stat_s::rc_rcq_err 973 ibd_rc_stat_s::rc_scq_invoke 974 ibd_rc_stat_s::rc_rwqe_short 975 ibd_rc_stat_s::rc_xmt_bytes 976 ibd_rc_stat_s::rc_xmt_small_pkt 977 ibd_rc_stat_s::rc_xmt_fragmented_pkt 978 ibd_rc_stat_s::rc_xmt_map_fail_pkt 979 ibd_rc_stat_s::rc_xmt_map_succ_pkt 980 ibd_rc_stat_s::rc_ace_not_found 981 ibd_rc_stat_s::rc_scq_no_swqe 982 ibd_rc_stat_s::rc_scq_no_largebuf 983 ibd_rc_stat_s::rc_swqe_short 984 ibd_rc_stat_s::rc_swqe_mac_update 985 ibd_rc_stat_s::rc_xmt_buf_short 986 ibd_rc_stat_s::rc_xmt_buf_mac_update 987 ibd_rc_stat_s::rc_conn_succ 988 ibd_rc_stat_s::rc_conn_fail 989 ibd_rc_stat_s::rc_null_conn 990 ibd_rc_stat_s::rc_no_estab_conn 991 ibd_rc_stat_s::rc_act_close 992 ibd_rc_stat_s::rc_pas_close 993 ibd_rc_stat_s::rc_delay_ace_recycle 994 ibd_rc_stat_s::rc_act_close_simultaneous 995 ibd_rc_stat_s::rc_reset_cnt)) 996 #endif 997 998 int 999 _init() 1000 { 1001 int status; 1002 1003 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 1004 PAGESIZE), 0); 1005 if (status != 0) { 1006 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 1007 return (status); 1008 } 1009 1010 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL); 1011 1012 mac_init_ops(&ibd_dev_ops, "ibp"); 1013 status = mod_install(&ibd_modlinkage); 1014 if (status != 0) { 1015 DPRINT(10, "_init:failed in mod_install()"); 1016 ddi_soft_state_fini(&ibd_list); 1017 mac_fini_ops(&ibd_dev_ops); 1018 return (status); 1019 } 1020 1021 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); 1022 mutex_enter(&ibd_gstate.ig_mutex); 1023 ibd_gstate.ig_ibt_hdl = NULL; 1024 ibd_gstate.ig_ibt_hdl_ref_cnt = 0; 1025 ibd_gstate.ig_service_list = NULL; 1026 mutex_exit(&ibd_gstate.ig_mutex); 1027 1028 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list, 1029 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) { 1030 return (EIO); 1031 } 1032 1033 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr); 1034 1035 #ifdef IBD_LOGGING 1036 ibd_log_init(); 1037 #endif 1038 return (0); 1039 } 1040 1041 int 1042 _info(struct modinfo *modinfop) 1043 { 1044 return (mod_info(&ibd_modlinkage, modinfop)); 1045 } 1046 1047 int 1048 _fini() 1049 { 1050 int status; 1051 1052 status = mod_remove(&ibd_modlinkage); 1053 if (status != 0) 1054 return (status); 1055 1056 ibt_unregister_part_attr_cb(); 1057 1058 mac_fini_ops(&ibd_dev_ops); 1059 mutex_destroy(&ibd_objlist_lock); 1060 ddi_soft_state_fini(&ibd_list); 1061 mutex_destroy(&ibd_gstate.ig_mutex); 1062 #ifdef IBD_LOGGING 1063 ibd_log_fini(); 1064 #endif 1065 return (0); 1066 } 1067 1068 /* 1069 * Convert the GID part of the mac address from network byte order 1070 * to host order. 1071 */ 1072 static void 1073 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 1074 { 1075 ib_sn_prefix_t nbopref; 1076 ib_guid_t nboguid; 1077 1078 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 1079 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 1080 dgid->gid_prefix = b2h64(nbopref); 1081 dgid->gid_guid = b2h64(nboguid); 1082 } 1083 1084 /* 1085 * Create the IPoIB address in network byte order from host order inputs. 
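 *
 * For reference, the 20-byte (IPOIB_ADDRL) IPoIB hardware address built
 * here is laid out, entirely in network byte order, as:
 *	bytes  0-3	ipoib_qpn	the QPN word; its high-order bits
 *					also carry the IBD_MAC_ADDR_RC flag
 *					that ibd_async_acache() checks before
 *					attempting an RC connection
 *	bytes  4-11	ipoib_gidpref	the 64-bit GID (subnet) prefix
 *	bytes 12-19	ipoib_gidsuff	the 64-bit GID GUID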
1086 */ 1087 static void 1088 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 1089 ib_guid_t guid) 1090 { 1091 ib_sn_prefix_t nbopref; 1092 ib_guid_t nboguid; 1093 1094 mac->ipoib_qpn = htonl(qpn); 1095 nbopref = h2b64(prefix); 1096 nboguid = h2b64(guid); 1097 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 1098 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 1099 } 1100 1101 /* 1102 * Send to the appropriate all-routers group when the IBA multicast group 1103 * does not exist, based on whether the target group is v4 or v6. 1104 */ 1105 static boolean_t 1106 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 1107 ipoib_mac_t *rmac) 1108 { 1109 boolean_t retval = B_TRUE; 1110 uint32_t adjscope = state->id_scope << 16; 1111 uint32_t topword; 1112 1113 /* 1114 * Copy the first 4 bytes in without assuming any alignment of 1115 * input mac address; this will have IPoIB signature, flags and 1116 * scope bits. 1117 */ 1118 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 1119 topword = ntohl(topword); 1120 1121 /* 1122 * Generate proper address for IPv4/v6, adding in the Pkey properly. 1123 */ 1124 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 1125 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 1126 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 1127 ((uint32_t)(state->id_pkey << 16))), 1128 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 1129 else 1130 /* 1131 * Does not have proper bits in the mgid address. 1132 */ 1133 retval = B_FALSE; 1134 1135 return (retval); 1136 } 1137 1138 /* 1139 * Membership states for different mcg's are tracked by two lists: 1140 * the "non" list is used for promiscuous mode, when all mcg traffic 1141 * needs to be inspected. This type of membership is never used for 1142 * transmission, so there can not be an AH in the active list 1143 * corresponding to a member in this list. This list does not need 1144 * any protection, since all operations are performed by the async 1145 * thread. 1146 * 1147 * "Full" and "SendOnly" membership is tracked using a single list, 1148 * the "full" list. This is because this single list can then be 1149 * searched during transmit to a multicast group (if an AH for the 1150 * mcg is not found in the active list), since at least one type 1151 * of membership must be present before initiating the transmit. 1152 * This list is also emptied during driver detach, since sendonly 1153 * membership acquired during transmit is dropped at detach time 1154 * along with ipv4 broadcast full membership. Insert/deletes to 1155 * this list are done only by the async thread, but it is also 1156 * searched in program context (see multicast disable case), thus 1157 * the id_mc_mutex protects the list. The driver detach path also 1158 * deconstructs the "full" list, but it ensures that the async 1159 * thread will not be accessing the list (by blocking out mcg 1160 * trap handling and making sure no more Tx reaping will happen). 1161 * 1162 * Currently, an IBA attach is done in the SendOnly case too, 1163 * although this is not required. 
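 *
 * As a usage sketch (mirroring ibd_async_mcache() below), a transmit to
 * a multicast group first consults the "full" list and only then tries
 * a SendOnlyNonMember join:
 *
 *	mce = IBD_MCACHE_FIND_FULL(state, mgid);
 *	if (mce == NULL)
 *		mce = ibd_join_group(state, mgid,
 *		    IB_MC_JSTATE_SEND_ONLY_NON);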
1164 */ 1165 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1166 list_insert_head(&state->id_mc_full, mce) 1167 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1168 list_insert_head(&state->id_mc_non, mce) 1169 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1170 ibd_mcache_find(mgid, &state->id_mc_full) 1171 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1172 ibd_mcache_find(mgid, &state->id_mc_non) 1173 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1174 list_remove(&state->id_mc_full, mce) 1175 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1176 list_remove(&state->id_mc_non, mce) 1177 1178 static void * 1179 list_get_head(list_t *list) 1180 { 1181 list_node_t *lhead = list_head(list); 1182 1183 if (lhead != NULL) 1184 list_remove(list, lhead); 1185 return (lhead); 1186 } 1187 1188 /* 1189 * This is always guaranteed to be able to queue the work. 1190 */ 1191 void 1192 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1193 { 1194 /* Initialize request */ 1195 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1196 ptr->rq_op = op; 1197 1198 /* 1199 * Queue provided slot onto request pool. 1200 */ 1201 mutex_enter(&state->id_acache_req_lock); 1202 list_insert_tail(&state->id_req_list, ptr); 1203 1204 /* Go, fetch, async thread */ 1205 cv_signal(&state->id_acache_req_cv); 1206 mutex_exit(&state->id_acache_req_lock); 1207 } 1208 1209 /* 1210 * Main body of the per interface async thread. 1211 */ 1212 static void 1213 ibd_async_work(ibd_state_t *state) 1214 { 1215 ibd_req_t *ptr; 1216 callb_cpr_t cprinfo; 1217 1218 mutex_enter(&state->id_acache_req_lock); 1219 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1220 callb_generic_cpr, "ibd_async_work"); 1221 1222 for (;;) { 1223 ptr = list_get_head(&state->id_req_list); 1224 if (ptr != NULL) { 1225 mutex_exit(&state->id_acache_req_lock); 1226 1227 /* 1228 * If we are in late hca initialization mode, do not 1229 * process any other async request other than TRAP. TRAP 1230 * is used for indicating creation of a broadcast group; 1231 * in which case, we need to join/create the group. 1232 */ 1233 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 1234 (ptr->rq_op != IBD_ASYNC_TRAP)) { 1235 goto free_req_and_continue; 1236 } 1237 1238 /* 1239 * Once we have done the operation, there is no 1240 * guarantee the request slot is going to be valid, 1241 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1242 * TRAP). 1243 * 1244 * Perform the request. 1245 */ 1246 switch (ptr->rq_op) { 1247 case IBD_ASYNC_GETAH: 1248 ibd_async_acache(state, &ptr->rq_mac); 1249 break; 1250 case IBD_ASYNC_JOIN: 1251 case IBD_ASYNC_LEAVE: 1252 ibd_async_multicast(state, 1253 ptr->rq_gid, ptr->rq_op); 1254 break; 1255 case IBD_ASYNC_PROMON: 1256 ibd_async_setprom(state); 1257 break; 1258 case IBD_ASYNC_PROMOFF: 1259 ibd_async_unsetprom(state); 1260 break; 1261 case IBD_ASYNC_REAP: 1262 ibd_async_reap_group(state, 1263 ptr->rq_ptr, ptr->rq_gid, 1264 IB_MC_JSTATE_FULL); 1265 /* 1266 * the req buf contains in mce 1267 * structure, so we do not need 1268 * to free it here. 
1269 */ 1270 ptr = NULL; 1271 break; 1272 case IBD_ASYNC_TRAP: 1273 ibd_async_trap(state, ptr); 1274 break; 1275 case IBD_ASYNC_SCHED: 1276 ibd_async_txsched(state); 1277 break; 1278 case IBD_ASYNC_LINK: 1279 ibd_async_link(state, ptr); 1280 break; 1281 case IBD_ASYNC_EXIT: 1282 mutex_enter(&state->id_acache_req_lock); 1283 #ifndef __lock_lint 1284 CALLB_CPR_EXIT(&cprinfo); 1285 #else 1286 mutex_exit(&state->id_acache_req_lock); 1287 #endif 1288 return; 1289 case IBD_ASYNC_RC_TOO_BIG: 1290 ibd_async_rc_process_too_big(state, 1291 ptr); 1292 break; 1293 case IBD_ASYNC_RC_CLOSE_ACT_CHAN: 1294 ibd_async_rc_close_act_chan(state, ptr); 1295 break; 1296 case IBD_ASYNC_RC_RECYCLE_ACE: 1297 ibd_async_rc_recycle_ace(state, ptr); 1298 break; 1299 } 1300 free_req_and_continue: 1301 if (ptr != NULL) 1302 kmem_cache_free(state->id_req_kmc, ptr); 1303 1304 mutex_enter(&state->id_acache_req_lock); 1305 } else { 1306 #ifndef __lock_lint 1307 /* 1308 * Nothing to do: wait till new request arrives. 1309 */ 1310 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1311 cv_wait(&state->id_acache_req_cv, 1312 &state->id_acache_req_lock); 1313 CALLB_CPR_SAFE_END(&cprinfo, 1314 &state->id_acache_req_lock); 1315 #endif 1316 } 1317 } 1318 1319 /*NOTREACHED*/ 1320 _NOTE(NOT_REACHED) 1321 } 1322 1323 /* 1324 * Return when it is safe to queue requests to the async daemon; primarily 1325 * for subnet trap and async event handling. Disallow requests before the 1326 * daemon is created, and when interface deinitilization starts. 1327 */ 1328 static boolean_t 1329 ibd_async_safe(ibd_state_t *state) 1330 { 1331 mutex_enter(&state->id_trap_lock); 1332 if (state->id_trap_stop) { 1333 mutex_exit(&state->id_trap_lock); 1334 return (B_FALSE); 1335 } 1336 state->id_trap_inprog++; 1337 mutex_exit(&state->id_trap_lock); 1338 return (B_TRUE); 1339 } 1340 1341 /* 1342 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 1343 * trap or event handling to complete to kill the async thread and deconstruct 1344 * the mcg/ace list. 1345 */ 1346 static void 1347 ibd_async_done(ibd_state_t *state) 1348 { 1349 mutex_enter(&state->id_trap_lock); 1350 if (--state->id_trap_inprog == 0) 1351 cv_signal(&state->id_trap_cv); 1352 mutex_exit(&state->id_trap_lock); 1353 } 1354 1355 /* 1356 * Hash functions: 1357 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1358 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1359 * These operate on mac addresses input into ibd_send, but there is no 1360 * guarantee on the alignment of the ipoib_mac_t structure. 1361 */ 1362 /*ARGSUSED*/ 1363 static uint_t 1364 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1365 { 1366 ulong_t ptraddr = (ulong_t)key; 1367 uint_t hval; 1368 1369 /* 1370 * If the input address is 4 byte aligned, we can just dereference 1371 * it. This is most common, since IP will send in a 4 byte aligned 1372 * IP header, which implies the 24 byte IPoIB psuedo header will be 1373 * 4 byte aligned too. 1374 */ 1375 if ((ptraddr & 3) == 0) 1376 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1377 1378 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1379 return (hval); 1380 } 1381 1382 static int 1383 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1384 { 1385 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1386 return (0); 1387 else 1388 return (1); 1389 } 1390 1391 /* 1392 * Initialize all the per interface caches and lists; AH cache, 1393 * MCG list etc. 
1394 */ 1395 static int 1396 ibd_acache_init(ibd_state_t *state) 1397 { 1398 ibd_ace_t *ce; 1399 int i; 1400 1401 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1402 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1403 mutex_enter(&state->id_ac_mutex); 1404 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1405 offsetof(ibd_ace_t, ac_list)); 1406 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1407 offsetof(ibd_ace_t, ac_list)); 1408 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1409 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor, 1410 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1411 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1412 offsetof(ibd_mce_t, mc_list)); 1413 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1414 offsetof(ibd_mce_t, mc_list)); 1415 state->id_ac_hot_ace = NULL; 1416 1417 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1418 state->id_num_ah, KM_SLEEP); 1419 for (i = 0; i < state->id_num_ah; i++, ce++) { 1420 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1421 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1422 mutex_exit(&state->id_ac_mutex); 1423 ibd_acache_fini(state); 1424 return (DDI_FAILURE); 1425 } else { 1426 CLEAR_REFCYCLE(ce); 1427 ce->ac_mce = NULL; 1428 mutex_init(&ce->tx_too_big_mutex, NULL, 1429 MUTEX_DRIVER, NULL); 1430 IBD_ACACHE_INSERT_FREE(state, ce); 1431 } 1432 } 1433 mutex_exit(&state->id_ac_mutex); 1434 return (DDI_SUCCESS); 1435 } 1436 1437 static void 1438 ibd_acache_fini(ibd_state_t *state) 1439 { 1440 ibd_ace_t *ptr; 1441 1442 mutex_enter(&state->id_ac_mutex); 1443 1444 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1445 ASSERT(GET_REF(ptr) == 0); 1446 mutex_destroy(&ptr->tx_too_big_mutex); 1447 (void) ibt_free_ud_dest(ptr->ac_dest); 1448 } 1449 1450 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1451 ASSERT(GET_REF(ptr) == 0); 1452 mutex_destroy(&ptr->tx_too_big_mutex); 1453 (void) ibt_free_ud_dest(ptr->ac_dest); 1454 } 1455 1456 list_destroy(&state->id_ah_free); 1457 list_destroy(&state->id_ah_active); 1458 list_destroy(&state->id_mc_full); 1459 list_destroy(&state->id_mc_non); 1460 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah); 1461 mutex_exit(&state->id_ac_mutex); 1462 mutex_destroy(&state->id_ac_mutex); 1463 mutex_destroy(&state->id_mc_mutex); 1464 } 1465 1466 /* 1467 * Search AH active hash list for a cached path to input destination. 1468 * If we are "just looking", hold == F. When we are in the Tx path, 1469 * we set hold == T to grab a reference on the AH so that it can not 1470 * be recycled to a new destination while the Tx request is posted. 1471 */ 1472 ibd_ace_t * 1473 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1474 { 1475 ibd_ace_t *ptr; 1476 1477 ASSERT(mutex_owned(&state->id_ac_mutex)); 1478 1479 /* 1480 * Do hash search. 1481 */ 1482 if (mod_hash_find(state->id_ah_active_hash, 1483 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1484 if (hold) 1485 INC_REF(ptr, num); 1486 return (ptr); 1487 } 1488 return (NULL); 1489 } 1490 1491 /* 1492 * This is called by the tx side; if an initialized AH is found in 1493 * the active list, it is locked down and can be used; if no entry 1494 * is found, an async request is queued to do path resolution. 
1495 */ 1496 static ibd_ace_t * 1497 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1498 { 1499 ibd_ace_t *ptr; 1500 ibd_req_t *req; 1501 1502 /* 1503 * Only attempt to print when we can; in the mdt pattr case, the 1504 * address is not aligned properly. 1505 */ 1506 if (((ulong_t)mac & 3) == 0) { 1507 DPRINT(4, 1508 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1509 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1510 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1511 htonl(mac->ipoib_gidsuff[1])); 1512 } 1513 1514 mutex_enter(&state->id_ac_mutex); 1515 1516 if (((ptr = state->id_ac_hot_ace) != NULL) && 1517 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1518 INC_REF(ptr, numwqe); 1519 mutex_exit(&state->id_ac_mutex); 1520 return (ptr); 1521 } 1522 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1523 state->id_ac_hot_ace = ptr; 1524 mutex_exit(&state->id_ac_mutex); 1525 return (ptr); 1526 } 1527 1528 /* 1529 * Implementation of a single outstanding async request; if 1530 * the operation is not started yet, queue a request and move 1531 * to ongoing state. Remember in id_ah_addr for which address 1532 * we are queueing the request, in case we need to flag an error; 1533 * Any further requests, for the same or different address, until 1534 * the operation completes, is sent back to GLDv3 to be retried. 1535 * The async thread will update id_ah_op with an error indication 1536 * or will set it to indicate the next look up can start; either 1537 * way, it will mac_tx_update() so that all blocked requests come 1538 * back here. 1539 */ 1540 *err = EAGAIN; 1541 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1542 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1543 if (req != NULL) { 1544 /* 1545 * We did not even find the entry; queue a request 1546 * for it. 1547 */ 1548 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1549 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1550 state->id_ah_op = IBD_OP_ONGOING; 1551 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1552 } 1553 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1554 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1555 /* 1556 * Check the status of the pathrecord lookup request 1557 * we had queued before. 1558 */ 1559 if (state->id_ah_op == IBD_OP_ERRORED) { 1560 *err = EFAULT; 1561 state->id_ah_error++; 1562 } else { 1563 /* 1564 * IBD_OP_ROUTERED case: We need to send to the 1565 * all-router MCG. If we can find the AH for 1566 * the mcg, the Tx will be attempted. If we 1567 * do not find the AH, we return NORESOURCES 1568 * to retry. 1569 */ 1570 ipoib_mac_t routermac; 1571 1572 (void) ibd_get_allroutergroup(state, mac, &routermac); 1573 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1574 numwqe); 1575 } 1576 state->id_ah_op = IBD_OP_NOTSTARTED; 1577 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1578 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1579 /* 1580 * This case can happen when we get a higher band 1581 * packet. The easiest way is to reset the state machine 1582 * to accommodate the higher priority packet. 1583 */ 1584 state->id_ah_op = IBD_OP_NOTSTARTED; 1585 } 1586 mutex_exit(&state->id_ac_mutex); 1587 1588 return (ptr); 1589 } 1590 1591 /* 1592 * Grab a not-currently-in-use AH/PathRecord from the active 1593 * list to recycle to a new destination. Only the async thread 1594 * executes this code. 
1595 */ 1596 static ibd_ace_t * 1597 ibd_acache_get_unref(ibd_state_t *state) 1598 { 1599 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1600 boolean_t try_rc_chan_recycle = B_FALSE; 1601 1602 ASSERT(mutex_owned(&state->id_ac_mutex)); 1603 1604 /* 1605 * Do plain linear search. 1606 */ 1607 while (ptr != NULL) { 1608 /* 1609 * Note that it is possible that the "cycle" bit 1610 * is set on the AH w/o any reference count. The 1611 * mcg must have been deleted, and the tx cleanup 1612 * just decremented the reference count to 0, but 1613 * hasn't gotten around to grabbing the id_ac_mutex 1614 * to move the AH into the free list. 1615 */ 1616 if (GET_REF(ptr) == 0) { 1617 if (ptr->ac_chan != NULL) { 1618 ASSERT(state->id_enable_rc == B_TRUE); 1619 if (!try_rc_chan_recycle) { 1620 try_rc_chan_recycle = B_TRUE; 1621 ibd_rc_signal_ace_recycle(state, ptr); 1622 } 1623 } else { 1624 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1625 break; 1626 } 1627 } 1628 ptr = list_prev(&state->id_ah_active, ptr); 1629 } 1630 return (ptr); 1631 } 1632 1633 /* 1634 * Invoked to clean up AH from active list in case of multicast 1635 * disable and to handle sendonly memberships during mcg traps. 1636 * And for port up processing for multicast and unicast AHs. 1637 * Normally, the AH is taken off the active list, and put into 1638 * the free list to be recycled for a new destination. In case 1639 * Tx requests on the AH have not completed yet, the AH is marked 1640 * for reaping (which will put the AH on the free list) once the Tx's 1641 * complete; in this case, depending on the "force" input, we take 1642 * out the AH from the active list right now, or leave it also for 1643 * the reap operation. Returns TRUE if the AH is taken off the active 1644 * list (and either put into the free list right now, or arranged for 1645 * later), FALSE otherwise. 1646 */ 1647 boolean_t 1648 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1649 { 1650 ibd_ace_t *acactive; 1651 boolean_t ret = B_TRUE; 1652 1653 ASSERT(mutex_owned(&state->id_ac_mutex)); 1654 1655 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1656 1657 /* 1658 * Note that the AH might already have the cycle bit set 1659 * on it; this might happen if sequences of multicast 1660 * enables and disables are coming so fast, that posted 1661 * Tx's to the mcg have not completed yet, and the cycle 1662 * bit is set successively by each multicast disable. 1663 */ 1664 if (SET_CYCLE_IF_REF(acactive)) { 1665 if (!force) { 1666 /* 1667 * The ace is kept on the active list, further 1668 * Tx's can still grab a reference on it; the 1669 * ace is reaped when all pending Tx's 1670 * referencing the AH complete. 1671 */ 1672 ret = B_FALSE; 1673 } else { 1674 /* 1675 * In the mcg trap case, we always pull the 1676 * AH from the active list. And also the port 1677 * up multi/unicast case. 1678 */ 1679 ASSERT(acactive->ac_chan == NULL); 1680 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1681 acactive->ac_mce = NULL; 1682 } 1683 } else { 1684 /* 1685 * Determined the ref count is 0, thus reclaim 1686 * immediately after pulling out the ace from 1687 * the active list. 1688 */ 1689 ASSERT(acactive->ac_chan == NULL); 1690 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1691 acactive->ac_mce = NULL; 1692 IBD_ACACHE_INSERT_FREE(state, acactive); 1693 } 1694 1695 } 1696 return (ret); 1697 } 1698 1699 /* 1700 * Helper function for async path record lookup. 
If we are trying to 1701 * Tx to a MCG, check our membership, possibly trying to join the 1702 * group if required. If that fails, try to send the packet to the 1703 * all router group (indicated by the redirect output), pointing 1704 * the input mac address to the router mcg address. 1705 */ 1706 static ibd_mce_t * 1707 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1708 { 1709 ib_gid_t mgid; 1710 ibd_mce_t *mce; 1711 ipoib_mac_t routermac; 1712 1713 *redirect = B_FALSE; 1714 ibd_n2h_gid(mac, &mgid); 1715 1716 /* 1717 * Check the FullMember+SendOnlyNonMember list. 1718 * Since we are the only one who manipulates the 1719 * id_mc_full list, no locks are needed. 1720 */ 1721 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1722 if (mce != NULL) { 1723 DPRINT(4, "ibd_async_mcache : already joined to group"); 1724 return (mce); 1725 } 1726 1727 /* 1728 * Not found; try to join(SendOnlyNonMember) and attach. 1729 */ 1730 DPRINT(4, "ibd_async_mcache : not joined to group"); 1731 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1732 NULL) { 1733 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1734 return (mce); 1735 } 1736 1737 /* 1738 * MCGroup not present; try to join the all-router group. If 1739 * any of the following steps succeed, we will be redirecting 1740 * to the all router group. 1741 */ 1742 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1743 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1744 return (NULL); 1745 *redirect = B_TRUE; 1746 ibd_n2h_gid(&routermac, &mgid); 1747 bcopy(&routermac, mac, IPOIB_ADDRL); 1748 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1749 mgid.gid_prefix, mgid.gid_guid); 1750 1751 /* 1752 * Are we already joined to the router group? 1753 */ 1754 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1755 DPRINT(4, "ibd_async_mcache : using already joined router" 1756 "group\n"); 1757 return (mce); 1758 } 1759 1760 /* 1761 * Can we join(SendOnlyNonMember) the router group? 1762 */ 1763 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1764 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1765 NULL) { 1766 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1767 return (mce); 1768 } 1769 1770 return (NULL); 1771 } 1772 1773 /* 1774 * Async path record lookup code. 1775 */ 1776 static void 1777 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1778 { 1779 ibd_ace_t *ce; 1780 ibd_mce_t *mce = NULL; 1781 ibt_path_attr_t path_attr; 1782 ibt_path_info_t path_info; 1783 ib_gid_t destgid; 1784 char ret = IBD_OP_NOTSTARTED; 1785 1786 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1787 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1788 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1789 htonl(mac->ipoib_gidsuff[1])); 1790 1791 /* 1792 * Check whether we are trying to transmit to a MCG. 1793 * In that case, we need to make sure we are a member of 1794 * the MCG. 1795 */ 1796 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1797 boolean_t redirected; 1798 1799 /* 1800 * If we can not find or join the group or even 1801 * redirect, error out. 1802 */ 1803 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1804 NULL) { 1805 state->id_ah_op = IBD_OP_ERRORED; 1806 return; 1807 } 1808 1809 /* 1810 * If we got redirected, we need to determine whether 1811 * the AH for the new mcg is in the cache already, and 1812 * not pull it in then; otherwise proceed to get the 1813 * path for the new mcg. 
There is no guarantee that 1814 * if the AH is currently in the cache, it will still be 1815 * there when we look in ibd_acache_lookup(), but that's 1816 * okay, we will come back here. 1817 */ 1818 if (redirected) { 1819 ret = IBD_OP_ROUTERED; 1820 DPRINT(4, "ibd_async_acache : redirected to " 1821 "%08X:%08X:%08X:%08X:%08X", 1822 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1823 htonl(mac->ipoib_gidpref[1]), 1824 htonl(mac->ipoib_gidsuff[0]), 1825 htonl(mac->ipoib_gidsuff[1])); 1826 1827 mutex_enter(&state->id_ac_mutex); 1828 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1829 state->id_ah_op = IBD_OP_ROUTERED; 1830 mutex_exit(&state->id_ac_mutex); 1831 DPRINT(4, "ibd_async_acache : router AH found"); 1832 return; 1833 } 1834 mutex_exit(&state->id_ac_mutex); 1835 } 1836 } 1837 1838 /* 1839 * Get an AH from the free list. 1840 */ 1841 mutex_enter(&state->id_ac_mutex); 1842 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1843 /* 1844 * No free ones; try to grab an unreferenced active 1845 * one. Maybe we need to make the active list LRU, 1846 * but that will create more work for Tx callbacks. 1847 * Is there a way of not having to pull out the 1848 * entry from the active list, but just indicate it 1849 * is being recycled? Yes, but that creates one more 1850 * check in the fast lookup path. 1851 */ 1852 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1853 /* 1854 * Pretty serious shortage now. 1855 */ 1856 state->id_ah_op = IBD_OP_NOTSTARTED; 1857 mutex_exit(&state->id_ac_mutex); 1858 DPRINT(10, "ibd_async_acache : failed to find AH " 1859 "slot\n"); 1860 return; 1861 } 1862 /* 1863 * We could check whether ac_mce points to a SendOnly 1864 * member and drop that membership now. Or do it lazily 1865 * at detach time. 1866 */ 1867 ce->ac_mce = NULL; 1868 } 1869 mutex_exit(&state->id_ac_mutex); 1870 ASSERT(ce->ac_mce == NULL); 1871 1872 /* 1873 * Update the entry. 1874 */ 1875 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1876 1877 bzero(&path_info, sizeof (path_info)); 1878 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1879 path_attr.pa_sgid = state->id_sgid; 1880 path_attr.pa_num_dgids = 1; 1881 ibd_n2h_gid(&ce->ac_mac, &destgid); 1882 path_attr.pa_dgids = &destgid; 1883 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1884 path_attr.pa_pkey = state->id_pkey; 1885 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1, 1886 &path_info, NULL) != IBT_SUCCESS) { 1887 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1888 goto error; 1889 } 1890 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1891 ntohl(ce->ac_mac.ipoib_qpn), 1892 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1893 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1894 goto error; 1895 } 1896 1897 /* 1898 * mce is set whenever an AH is being associated with a 1899 * MCG; this will come in handy when we leave the MCG. The 1900 * lock protects Tx fastpath from scanning the active list. 
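 *
 * Illustrative sketch only (not the actual Tx fastpath code in this
 * driver): one way a concurrent lookup can safely take a reference is
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	if ((ace = ibd_acache_find(state, &dst_mac, B_FALSE, 0)) != NULL)
 *		INC_REF(ace, 1);
 *	mutex_exit(&state->id_ac_mutex);
 *
 * which is why the insertion into the active list below is also done
 * under id_ac_mutex; "dst_mac" above is just a placeholder destination.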
1901 */ 1902 if (mce != NULL) 1903 ce->ac_mce = mce; 1904 1905 /* 1906 * initiate a RC mode connection for unicast address 1907 */ 1908 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && 1909 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { 1910 ASSERT(ce->ac_chan == NULL); 1911 DPRINT(10, "ibd_async_acache: call " 1912 "ibd_rc_try_connect(ace=%p)", ce); 1913 ibd_rc_try_connect(state, ce, &path_info); 1914 if (ce->ac_chan == NULL) { 1915 DPRINT(10, "ibd_async_acache: fail to setup RC" 1916 " channel"); 1917 state->rc_conn_fail++; 1918 goto error; 1919 } 1920 } 1921 1922 mutex_enter(&state->id_ac_mutex); 1923 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1924 state->id_ah_op = ret; 1925 mutex_exit(&state->id_ac_mutex); 1926 return; 1927 error: 1928 /* 1929 * We might want to drop SendOnly membership here if we 1930 * joined above. The lock protects Tx callbacks inserting 1931 * into the free list. 1932 */ 1933 mutex_enter(&state->id_ac_mutex); 1934 state->id_ah_op = IBD_OP_ERRORED; 1935 IBD_ACACHE_INSERT_FREE(state, ce); 1936 mutex_exit(&state->id_ac_mutex); 1937 } 1938 1939 /* 1940 * While restoring port's presence on the subnet on a port up, it is possible 1941 * that the port goes down again. 1942 */ 1943 static void 1944 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1945 { 1946 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1947 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1948 LINK_STATE_UP; 1949 ibd_mce_t *mce, *pmce; 1950 ibd_ace_t *ace, *pace; 1951 1952 DPRINT(10, "ibd_async_link(): %d", opcode); 1953 1954 /* 1955 * On a link up, revalidate the link speed/width. No point doing 1956 * this on a link down, since we will be unable to do SA operations, 1957 * defaulting to the lowest speed. Also notice that we update our 1958 * notion of speed before calling mac_link_update(), which will do 1959 * necessary higher level notifications for speed changes. 1960 */ 1961 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1962 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1963 state->id_link_speed = ibd_get_portspeed(state); 1964 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1965 } 1966 1967 /* 1968 * Do all the work required to establish our presence on 1969 * the subnet. 1970 */ 1971 if (opcode == IBD_LINK_UP_ABSENT) { 1972 /* 1973 * If in promiscuous mode ... 1974 */ 1975 if (state->id_prom_op == IBD_OP_COMPLETED) { 1976 /* 1977 * Drop all nonmembership. 1978 */ 1979 ibd_async_unsetprom(state); 1980 1981 /* 1982 * Then, try to regain nonmembership to all mcg's. 1983 */ 1984 ibd_async_setprom(state); 1985 1986 } 1987 1988 /* 1989 * Drop all sendonly membership (which also gets rid of the 1990 * AHs); try to reacquire all full membership. 1991 */ 1992 mce = list_head(&state->id_mc_full); 1993 while ((pmce = mce) != NULL) { 1994 mce = list_next(&state->id_mc_full, mce); 1995 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1996 ibd_leave_group(state, 1997 pmce->mc_info.mc_adds_vect.av_dgid, 1998 IB_MC_JSTATE_SEND_ONLY_NON); 1999 else 2000 ibd_reacquire_group(state, pmce); 2001 } 2002 2003 /* 2004 * Recycle all active AHs to free list (and if there are 2005 * pending posts, make sure they will go into the free list 2006 * once the Tx's complete). Grab the lock to prevent 2007 * concurrent Tx's as well as Tx cleanups. 
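 *
 * Condensed sketch of the walk below (illustration only; the real loop
 * also special-cases RC channels): with id_ac_mutex held,
 *
 *	ace = list_head(&state->id_ah_active);
 *	while ((pace = ace) != NULL) {
 *		ace = list_next(&state->id_ah_active, ace);
 *		(void) ibd_acache_recycle(state, &pace->ac_mac, B_TRUE);
 *	}
 *
 * The B_TRUE "force" argument pulls the ace off the active list even if
 * Tx's still hold references; such aces are reaped as the Tx's complete.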
2008 */ 2009 mutex_enter(&state->id_ac_mutex); 2010 ace = list_head(&state->id_ah_active); 2011 while ((pace = ace) != NULL) { 2012 boolean_t cycled; 2013 2014 ace = list_next(&state->id_ah_active, ace); 2015 mce = pace->ac_mce; 2016 if (pace->ac_chan != NULL) { 2017 ASSERT(mce == NULL); 2018 ASSERT(state->id_enable_rc == B_TRUE); 2019 if (pace->ac_chan->chan_state == 2020 IBD_RC_STATE_ACT_ESTAB) { 2021 INC_REF(pace, 1); 2022 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 2023 pace->ac_chan->chan_state = 2024 IBD_RC_STATE_ACT_CLOSING; 2025 ibd_rc_signal_act_close(state, pace); 2026 } else { 2027 state->rc_act_close_simultaneous++; 2028 DPRINT(40, "ibd_async_link: other " 2029 "thread is closing it, ace=%p, " 2030 "ac_chan=%p, chan_state=%d", 2031 pace, pace->ac_chan, 2032 pace->ac_chan->chan_state); 2033 } 2034 } else { 2035 cycled = ibd_acache_recycle(state, 2036 &pace->ac_mac, B_TRUE); 2037 } 2038 /* 2039 * If this is for an mcg, it must be for a fullmember, 2040 * since we got rid of send-only members above when 2041 * processing the mce list. 2042 */ 2043 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 2044 IB_MC_JSTATE_FULL))); 2045 2046 /* 2047 * Check if the fullmember mce needs to be torn down, 2048 * ie whether the DLPI disable has already been done. 2049 * If so, do some of the work of tx_cleanup, namely 2050 * causing leave (which will fail), detach and 2051 * mce-freeing. tx_cleanup will put the AH into free 2052 * list. The reason to duplicate some of this 2053 * tx_cleanup work is because we want to delete the 2054 * AH right now instead of waiting for tx_cleanup, to 2055 * force subsequent Tx's to reacquire an AH. 2056 */ 2057 if ((mce != NULL) && (mce->mc_fullreap)) 2058 ibd_async_reap_group(state, mce, 2059 mce->mc_info.mc_adds_vect.av_dgid, 2060 mce->mc_jstate); 2061 } 2062 mutex_exit(&state->id_ac_mutex); 2063 } 2064 2065 /* 2066 * mac handle is guaranteed to exist since driver does ibt_close_hca() 2067 * (which stops further events from being delivered) before 2068 * mac_unregister(). At this point, it is guaranteed that mac_register 2069 * has already been done. 2070 */ 2071 mutex_enter(&state->id_link_mutex); 2072 state->id_link_state = lstate; 2073 mac_link_update(state->id_mh, lstate); 2074 mutex_exit(&state->id_link_mutex); 2075 2076 ibd_async_done(state); 2077 } 2078 2079 /* 2080 * Check the pkey table to see if we can find the pkey we're looking for. 2081 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 2082 * failure. 2083 */ 2084 static int 2085 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 2086 uint16_t *pkix) 2087 { 2088 uint16_t ndx; 2089 2090 ASSERT(pkix != NULL); 2091 2092 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 2093 if (pkey_tbl[ndx] == pkey) { 2094 *pkix = ndx; 2095 return (0); 2096 } 2097 } 2098 return (-1); 2099 } 2100 2101 /* 2102 * Late HCA Initialization: 2103 * If plumb had succeeded without the availability of an active port or the 2104 * pkey, and either of their availability is now being indicated via PORT_UP 2105 * or PORT_CHANGE respectively, try a start of the interface. 2106 * 2107 * Normal Operation: 2108 * When the link is notified up, we need to do a few things, based 2109 * on the port's current p_init_type_reply claiming a reinit has been 2110 * done or not. The reinit steps are: 2111 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2112 * the old Pkey and GID0 are correct. 2113 * 2. Register for mcg traps (already done by ibmf). 2114 * 3. 
If PreservePresenceReply indicates the SM has restored port's presence 2115 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2116 * 4. Give up all sendonly memberships. 2117 * 5. Acquire all full memberships. 2118 * 6. In promiscuous mode, acquire all non memberships. 2119 * 7. Recycle all AHs to free list. 2120 */ 2121 static void 2122 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2123 { 2124 ibt_hca_portinfo_t *port_infop = NULL; 2125 ibt_status_t ibt_status; 2126 uint_t psize, port_infosz; 2127 ibd_link_op_t opcode; 2128 ibd_req_t *req; 2129 link_state_t new_link_state = LINK_STATE_UP; 2130 uint8_t itreply; 2131 uint16_t pkix; 2132 int ret; 2133 2134 /* 2135 * Let's not race with a plumb or an unplumb; if we detect a 2136 * pkey relocation event later on here, we may have to restart. 2137 */ 2138 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2139 2140 mutex_enter(&state->id_link_mutex); 2141 2142 /* 2143 * If the link state is unknown, a plumb has not yet been attempted 2144 * on the interface. Nothing to do. 2145 */ 2146 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2147 mutex_exit(&state->id_link_mutex); 2148 goto link_mod_return; 2149 } 2150 2151 /* 2152 * If link state is down because of plumb failure, and we are not in 2153 * late HCA init, and we were not successfully plumbed, nothing to do. 2154 */ 2155 if ((state->id_link_state == LINK_STATE_DOWN) && 2156 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) && 2157 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) { 2158 mutex_exit(&state->id_link_mutex); 2159 goto link_mod_return; 2160 } 2161 2162 /* 2163 * If this routine was called in response to a port down event, 2164 * we just need to see if this should be informed. 2165 */ 2166 if (code == IBT_ERROR_PORT_DOWN) { 2167 new_link_state = LINK_STATE_DOWN; 2168 goto update_link_state; 2169 } 2170 2171 /* 2172 * If it's not a port down event we've received, try to get the port 2173 * attributes first. If we fail here, the port is as good as down. 2174 * Otherwise, if the link went down by the time the handler gets 2175 * here, give up - we cannot even validate the pkey/gid since those 2176 * are not valid and this is as bad as a port down anyway. 2177 */ 2178 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2179 &port_infop, &psize, &port_infosz); 2180 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2181 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2182 new_link_state = LINK_STATE_DOWN; 2183 goto update_link_state; 2184 } 2185 2186 /* 2187 * If in the previous attempt, the pkey was not found either due to the 2188 * port state being down, or due to it's absence in the pkey table, 2189 * look for it now and try to start the interface. 2190 */ 2191 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) { 2192 mutex_exit(&state->id_link_mutex); 2193 if ((ret = ibd_start(state)) != 0) { 2194 DPRINT(10, "ibd_linkmod: cannot start from late HCA " 2195 "init, ret=%d", ret); 2196 } 2197 ibt_free_portinfo(port_infop, port_infosz); 2198 goto link_mod_return; 2199 } 2200 2201 /* 2202 * Check the SM InitTypeReply flags. If both NoLoadReply and 2203 * PreserveContentReply are 0, we don't know anything about the 2204 * data loaded into the port attributes, so we need to verify 2205 * if gid0 and pkey are still valid. 
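 *
 * Summarized outcome of that verification (see the code below for the
 * authoritative logic):
 *
 *	sgid prefix of GID0 changed       -> treat as link down
 *	pkey still at index id_pkix       -> link up, nothing more to do
 *	pkey relocated to another index   -> full ibd_undo_start()/
 *	                                     ibd_start() restart
 *	pkey no longer in the table       -> treat as link down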
2206 */ 2207 itreply = port_infop->p_init_type_reply; 2208 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2209 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2210 /* 2211 * Check to see if the subnet part of GID0 has changed. If 2212 * not, check the simple case first to see if the pkey 2213 * index is the same as before; finally check to see if the 2214 * pkey has been relocated to a different index in the table. 2215 */ 2216 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2217 if (bcmp(port_infop->p_sgid_tbl, 2218 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2219 2220 new_link_state = LINK_STATE_DOWN; 2221 2222 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2223 state->id_pkey) { 2224 2225 new_link_state = LINK_STATE_UP; 2226 2227 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2228 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2229 2230 ibt_free_portinfo(port_infop, port_infosz); 2231 mutex_exit(&state->id_link_mutex); 2232 2233 /* 2234 * Currently a restart is required if our pkey has moved 2235 * in the pkey table. If we get the ibt_recycle_ud() to 2236 * work as documented (expected), we may be able to 2237 * avoid a complete restart. Note that we've already 2238 * marked both the start and stop 'in-progress' flags, 2239 * so it is ok to go ahead and do this restart. 2240 */ 2241 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2242 if ((ret = ibd_start(state)) != 0) { 2243 DPRINT(10, "ibd_restart: cannot restart, " 2244 "ret=%d", ret); 2245 } 2246 2247 goto link_mod_return; 2248 } else { 2249 new_link_state = LINK_STATE_DOWN; 2250 } 2251 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2252 } 2253 2254 update_link_state: 2255 if (port_infop) { 2256 ibt_free_portinfo(port_infop, port_infosz); 2257 } 2258 2259 /* 2260 * If we're reporting a link up, check InitTypeReply to see if 2261 * the SM has ensured that the port's presence in mcg, traps, 2262 * etc. is intact. 2263 */ 2264 if (new_link_state == LINK_STATE_DOWN) { 2265 opcode = IBD_LINK_DOWN; 2266 } else { 2267 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2268 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2269 opcode = IBD_LINK_UP; 2270 } else { 2271 opcode = IBD_LINK_UP_ABSENT; 2272 } 2273 } 2274 2275 /* 2276 * If the old state is the same as the new state, and the SM indicated 2277 * no change in the port parameters, nothing to do. 2278 */ 2279 if ((state->id_link_state == new_link_state) && (opcode != 2280 IBD_LINK_UP_ABSENT)) { 2281 mutex_exit(&state->id_link_mutex); 2282 goto link_mod_return; 2283 } 2284 2285 /* 2286 * Ok, so there was a link state change; see if it's safe to ask 2287 * the async thread to do the work 2288 */ 2289 if (!ibd_async_safe(state)) { 2290 state->id_link_state = new_link_state; 2291 mutex_exit(&state->id_link_mutex); 2292 goto link_mod_return; 2293 } 2294 2295 mutex_exit(&state->id_link_mutex); 2296 2297 /* 2298 * Queue up a request for ibd_async_link() to handle this link 2299 * state change event 2300 */ 2301 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2302 req->rq_ptr = (void *)opcode; 2303 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2304 2305 link_mod_return: 2306 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2307 } 2308 2309 /* 2310 * For the port up/down events, IBTL guarantees there will not be concurrent 2311 * invocations of the handler. 
IBTL might coalesce link transition events, 2312 * and not invoke the handler for _each_ up/down transition, but it will 2313 * invoke the handler with last known state 2314 */ 2315 static void 2316 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2317 ibt_async_code_t code, ibt_async_event_t *event) 2318 { 2319 ibd_state_t *state = (ibd_state_t *)clnt_private; 2320 2321 switch (code) { 2322 case IBT_ERROR_CATASTROPHIC_CHAN: 2323 ibd_print_warn(state, "catastrophic channel error"); 2324 break; 2325 case IBT_ERROR_CQ: 2326 ibd_print_warn(state, "completion queue error"); 2327 break; 2328 case IBT_PORT_CHANGE_EVENT: 2329 /* 2330 * Events will be delivered to all instances that have 2331 * done ibt_open_hca() but not yet done ibt_close_hca(). 2332 * Only need to do work for our port; IBTF will deliver 2333 * events for other ports on the hca we have ibt_open_hca'ed 2334 * too. Note that id_port is initialized in ibd_attach() 2335 * before we do an ibt_open_hca() in ibd_attach(). 2336 */ 2337 ASSERT(state->id_hca_hdl == hca_hdl); 2338 if (state->id_port != event->ev_port) 2339 break; 2340 2341 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2342 IBT_PORT_CHANGE_PKEY) { 2343 ibd_link_mod(state, code); 2344 } 2345 break; 2346 case IBT_ERROR_PORT_DOWN: 2347 case IBT_CLNT_REREG_EVENT: 2348 case IBT_EVENT_PORT_UP: 2349 /* 2350 * Events will be delivered to all instances that have 2351 * done ibt_open_hca() but not yet done ibt_close_hca(). 2352 * Only need to do work for our port; IBTF will deliver 2353 * events for other ports on the hca we have ibt_open_hca'ed 2354 * too. Note that id_port is initialized in ibd_attach() 2355 * before we do an ibt_open_hca() in ibd_attach(). 2356 */ 2357 ASSERT(state->id_hca_hdl == hca_hdl); 2358 if (state->id_port != event->ev_port) 2359 break; 2360 2361 ibd_link_mod(state, code); 2362 break; 2363 2364 case IBT_HCA_ATTACH_EVENT: 2365 case IBT_HCA_DETACH_EVENT: 2366 /* 2367 * When a new card is plugged to the system, attach_event is 2368 * invoked. Additionally, a cfgadm needs to be run to make the 2369 * card known to the system, and an ifconfig needs to be run to 2370 * plumb up any ibd interfaces on the card. In the case of card 2371 * unplug, a cfgadm is run that will trigger any RCM scripts to 2372 * unplumb the ibd interfaces on the card; when the card is 2373 * actually unplugged, the detach_event is invoked; 2374 * additionally, if any ibd instances are still active on the 2375 * card (eg there were no associated RCM scripts), driver's 2376 * detach routine is invoked. 2377 */ 2378 break; 2379 default: 2380 break; 2381 } 2382 } 2383 2384 static int 2385 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2386 { 2387 mac_register_t *macp; 2388 int ret; 2389 2390 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2391 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2392 return (DDI_FAILURE); 2393 } 2394 2395 /* 2396 * Note that when we register with mac during attach, we don't 2397 * have the id_macaddr yet, so we'll simply be registering a 2398 * zero macaddr that we'll overwrite later during plumb (in 2399 * ibd_m_start()). Similar is the case with id_mtu - we'll 2400 * update the mac layer with the correct mtu during plumb. 
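 *
 * Sketch of that later fix-up (assumed to use the standard GLDv3
 * provider update calls; the actual updates are made from the plumb
 * path, not here):
 *
 *	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
 *	(void) mac_maxsdu_update(state->id_mh, new_max_sdu);
 *
 * where new_max_sdu stands for whatever the negotiated mode allows
 * (compare the m_max_sdu choices just below).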
2401 */ 2402 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2403 macp->m_driver = state; 2404 macp->m_dip = dip; 2405 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2406 macp->m_callbacks = &ibd_m_callbacks; 2407 macp->m_min_sdu = 0; 2408 if (state->id_type == IBD_PORT_DRIVER) { 2409 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 2410 } else if (state->id_enable_rc) { 2411 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2412 } else { 2413 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2414 } 2415 macp->m_priv_props = ibd_priv_props; 2416 2417 /* 2418 * Register ourselves with the GLDv3 interface 2419 */ 2420 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2421 mac_free(macp); 2422 DPRINT(10, 2423 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2424 return (DDI_FAILURE); 2425 } 2426 2427 mac_free(macp); 2428 return (DDI_SUCCESS); 2429 } 2430 2431 static int 2432 ibd_record_capab(ibd_state_t *state) 2433 { 2434 ibt_hca_attr_t hca_attrs; 2435 ibt_status_t ibt_status; 2436 2437 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2438 2439 /* 2440 * Query the HCA and fetch its attributes 2441 */ 2442 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2443 ASSERT(ibt_status == IBT_SUCCESS); 2444 2445 /* 2446 * 1. Set the Hardware Checksum capability. Currently we only consider 2447 * full checksum offload. 2448 */ 2449 if (state->id_enable_rc) { 2450 state->id_hwcksum_capab = 0; 2451 } else { 2452 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2453 == IBT_HCA_CKSUM_FULL) { 2454 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2455 } 2456 } 2457 2458 /* 2459 * 2. Set LSO policy, capability and maximum length 2460 */ 2461 if (state->id_enable_rc) { 2462 state->id_lso_capable = B_FALSE; 2463 state->id_lso_maxlen = 0; 2464 } else { 2465 if (hca_attrs.hca_max_lso_size > 0) { 2466 state->id_lso_capable = B_TRUE; 2467 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2468 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2469 else 2470 state->id_lso_maxlen = 2471 hca_attrs.hca_max_lso_size; 2472 } else { 2473 state->id_lso_capable = B_FALSE; 2474 state->id_lso_maxlen = 0; 2475 } 2476 } 2477 2478 /* 2479 * 3. Set Reserved L_Key capability 2480 */ 2481 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2482 state->id_hca_res_lkey_capab = 1; 2483 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2484 state->rc_enable_iov_map = B_TRUE; 2485 } else { 2486 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2487 state->rc_enable_iov_map = B_FALSE; 2488 } 2489 2490 /* 2491 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2492 * size information is provided by the hca 2493 */ 2494 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2495 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2496 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2497 } else { 2498 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2499 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2500 } 2501 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2502 state->id_max_sqseg = IBD_MAX_SQSEG; 2503 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2504 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2505 state->id_max_sqseg, IBD_MAX_SQSEG); 2506 } 2507 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2508 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2509 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2510 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2511 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2512 } 2513 2514 /* 2515 * Translating the virtual address regions into physical regions 2516 * for using the Reserved LKey feature results in a wr sgl that 2517 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2518 * we'll fix a high-water mark (65%) for when we should stop. 2519 */ 2520 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2521 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2522 2523 /* 2524 * 5. Set number of recv and send wqes after checking hca maximum 2525 * channel size. Store the max channel size in the state so that it 2526 * can be referred to when the swqe/rwqe change is requested via 2527 * dladm. 2528 */ 2529 2530 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz; 2531 2532 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe) 2533 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz; 2534 2535 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe - 2536 IBD_RWQE_MIN; 2537 2538 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe) 2539 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz; 2540 2541 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2542 2543 return (DDI_SUCCESS); 2544 } 2545 2546 static int 2547 ibd_part_busy(ibd_state_t *state) 2548 { 2549 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2550 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n"); 2551 return (DDI_FAILURE); 2552 } 2553 2554 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2555 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n"); 2556 return (DDI_FAILURE); 2557 } 2558 2559 return (DDI_SUCCESS); 2560 } 2561 2562 2563 static void 2564 ibd_part_unattach(ibd_state_t *state) 2565 { 2566 uint32_t progress = state->id_mac_state; 2567 ibt_status_t ret; 2568 2569 /* make sure rx resources are freed */ 2570 ibd_free_rx_rsrcs(state); 2571 2572 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 2573 ASSERT(state->id_enable_rc); 2574 ibd_rc_fini_srq_list(state); 2575 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 2576 } 2577 2578 if (progress & IBD_DRV_MAC_REGISTERED) { 2579 (void) mac_unregister(state->id_mh); 2580 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2581 } 2582 2583 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 2584 /* 2585 * No new async requests will be posted since the device 2586 * link state has been marked as unknown; completion handlers 2587 * have been turned off, so Tx handler will not cause any 2588 * more IBD_ASYNC_REAP requests. 2589 * 2590 * Queue a request for the async thread to exit, which will 2591 * be serviced after any pending ones. 
This can take a while, 2592 * specially if the SM is unreachable, since IBMF will slowly 2593 * timeout each SM request issued by the async thread. Reap 2594 * the thread before continuing on, we do not want it to be 2595 * lingering in modunloaded code. 2596 */ 2597 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 2598 thread_join(state->id_async_thrid); 2599 2600 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 2601 } 2602 2603 if (progress & IBD_DRV_REQ_LIST_INITED) { 2604 list_destroy(&state->id_req_list); 2605 mutex_destroy(&state->id_acache_req_lock); 2606 cv_destroy(&state->id_acache_req_cv); 2607 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED; 2608 } 2609 2610 if (progress & IBD_DRV_PD_ALLOCD) { 2611 if ((ret = ibt_free_pd(state->id_hca_hdl, 2612 state->id_pd_hdl)) != IBT_SUCCESS) { 2613 ibd_print_warn(state, "failed to free " 2614 "protection domain, ret=%d", ret); 2615 } 2616 state->id_pd_hdl = NULL; 2617 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2618 } 2619 2620 if (progress & IBD_DRV_HCA_OPENED) { 2621 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2622 IBT_SUCCESS) { 2623 ibd_print_warn(state, "failed to close " 2624 "HCA device, ret=%d", ret); 2625 } 2626 state->id_hca_hdl = NULL; 2627 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2628 } 2629 2630 mutex_enter(&ibd_gstate.ig_mutex); 2631 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2632 if ((ret = ibt_detach(state->id_ibt_hdl)) != 2633 IBT_SUCCESS) { 2634 ibd_print_warn(state, 2635 "ibt_detach() failed, ret=%d", ret); 2636 } 2637 state->id_ibt_hdl = NULL; 2638 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2639 ibd_gstate.ig_ibt_hdl_ref_cnt--; 2640 } 2641 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && 2642 (ibd_gstate.ig_ibt_hdl != NULL)) { 2643 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != 2644 IBT_SUCCESS) { 2645 ibd_print_warn(state, "ibt_detach(): global " 2646 "failed, ret=%d", ret); 2647 } 2648 ibd_gstate.ig_ibt_hdl = NULL; 2649 } 2650 mutex_exit(&ibd_gstate.ig_mutex); 2651 2652 if (progress & IBD_DRV_TXINTR_ADDED) { 2653 ddi_remove_softintr(state->id_tx); 2654 state->id_tx = NULL; 2655 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2656 } 2657 2658 if (progress & IBD_DRV_RXINTR_ADDED) { 2659 ddi_remove_softintr(state->id_rx); 2660 state->id_rx = NULL; 2661 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2662 } 2663 2664 #ifdef DEBUG 2665 if (progress & IBD_DRV_RC_PRIVATE_STATE) { 2666 kstat_delete(state->rc_ksp); 2667 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2668 } 2669 #endif 2670 2671 if (progress & IBD_DRV_STATE_INITIALIZED) { 2672 ibd_state_fini(state); 2673 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2674 } 2675 } 2676 2677 int 2678 ibd_part_attach(ibd_state_t *state, dev_info_t *dip) 2679 { 2680 ibt_status_t ret; 2681 int rv; 2682 kthread_t *kht; 2683 2684 /* 2685 * Initialize mutexes and condition variables 2686 */ 2687 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2688 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()"); 2689 return (DDI_FAILURE); 2690 } 2691 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2692 2693 /* 2694 * Allocate rx,tx softintr 2695 */ 2696 if (ibd_rx_softintr == 1) { 2697 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2698 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2699 DPRINT(10, "ibd_part_attach: failed in " 2700 "ddi_add_softintr(id_rx), ret=%d", rv); 2701 return (DDI_FAILURE); 2702 } 2703 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2704 } 2705 if (ibd_tx_softintr == 1) { 2706 if ((rv = 
ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2707 NULL, NULL, ibd_tx_recycle, 2708 (caddr_t)state)) != DDI_SUCCESS) { 2709 DPRINT(10, "ibd_part_attach: failed in " 2710 "ddi_add_softintr(id_tx), ret=%d", rv); 2711 return (DDI_FAILURE); 2712 } 2713 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2714 } 2715 2716 /* 2717 * Attach to IBTL 2718 */ 2719 mutex_enter(&ibd_gstate.ig_mutex); 2720 if (ibd_gstate.ig_ibt_hdl == NULL) { 2721 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2722 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { 2723 DPRINT(10, "ibd_part_attach: global: failed in " 2724 "ibt_attach(), ret=%d", ret); 2725 mutex_exit(&ibd_gstate.ig_mutex); 2726 return (DDI_FAILURE); 2727 } 2728 } 2729 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2730 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2731 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d", 2732 ret); 2733 mutex_exit(&ibd_gstate.ig_mutex); 2734 return (DDI_FAILURE); 2735 } 2736 ibd_gstate.ig_ibt_hdl_ref_cnt++; 2737 mutex_exit(&ibd_gstate.ig_mutex); 2738 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2739 2740 /* 2741 * Open the HCA 2742 */ 2743 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 2744 &state->id_hca_hdl)) != IBT_SUCCESS) { 2745 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d", 2746 ret); 2747 return (DDI_FAILURE); 2748 } 2749 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2750 2751 #ifdef DEBUG 2752 /* Initialize Driver Counters for Reliable Connected Mode */ 2753 if (state->id_enable_rc) { 2754 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2755 DPRINT(10, "ibd_part_attach: failed in " 2756 "ibd_rc_init_stats"); 2757 return (DDI_FAILURE); 2758 } 2759 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2760 } 2761 #endif 2762 2763 /* 2764 * Record capabilities 2765 */ 2766 (void) ibd_record_capab(state); 2767 2768 /* 2769 * Allocate a protection domain on the HCA 2770 */ 2771 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2772 &state->id_pd_hdl)) != IBT_SUCCESS) { 2773 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d", 2774 ret); 2775 return (DDI_FAILURE); 2776 } 2777 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2778 2779 2780 /* 2781 * We need to initialise the req_list that is required for the 2782 * operation of the async_thread. 2783 */ 2784 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 2785 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 2786 list_create(&state->id_req_list, sizeof (ibd_req_t), 2787 offsetof(ibd_req_t, rq_list)); 2788 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED; 2789 2790 /* 2791 * Create the async thread; thread_create never fails. 2792 */ 2793 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 2794 TS_RUN, minclsyspri); 2795 state->id_async_thrid = kht->t_did; 2796 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 2797 2798 return (DDI_SUCCESS); 2799 } 2800 2801 /* 2802 * Attach device to the IO framework. 2803 */ 2804 static int 2805 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2806 { 2807 int ret; 2808 2809 switch (cmd) { 2810 case DDI_ATTACH: 2811 ret = ibd_port_attach(dip); 2812 break; 2813 default: 2814 ret = DDI_FAILURE; 2815 break; 2816 } 2817 return (ret); 2818 } 2819 2820 /* 2821 * Detach device from the IO framework. 
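 *
 * Teardown here (and in ibd_part_unattach() above) is driven by the
 * IBD_DRV_* progress bits accumulated in id_mac_state during attach and
 * start: only the steps whose bits are set get undone, largely in the
 * reverse order of their setup.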
2822 */ 2823 static int 2824 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2825 { 2826 ibd_state_t *state; 2827 int instance; 2828 2829 /* 2830 * IBD doesn't support suspend/resume 2831 */ 2832 if (cmd != DDI_DETACH) 2833 return (DDI_FAILURE); 2834 2835 /* 2836 * Get the instance softstate 2837 */ 2838 instance = ddi_get_instance(dip); 2839 state = ddi_get_soft_state(ibd_list, instance); 2840 2841 /* 2842 * Release all resources we're holding still. Note that if we'd 2843 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2844 * so far, we should find all the flags we need in id_mac_state. 2845 */ 2846 return (ibd_port_unattach(state, dip)); 2847 } 2848 2849 /* 2850 * Pre ibt_attach() driver initialization 2851 */ 2852 static int 2853 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2854 { 2855 char buf[64]; 2856 2857 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2858 state->id_link_state = LINK_STATE_UNKNOWN; 2859 2860 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2861 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2862 state->id_trap_stop = B_TRUE; 2863 state->id_trap_inprog = 0; 2864 2865 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2866 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2867 state->id_dip = dip; 2868 2869 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2870 2871 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2872 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2873 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2874 state->id_tx_busy = 0; 2875 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2876 2877 state->id_rx_list.dl_bufs_outstanding = 0; 2878 state->id_rx_list.dl_cnt = 0; 2879 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2880 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2881 (void) sprintf(buf, "ibd_req%d_%x", ddi_get_instance(dip), 2882 state->id_pkey); 2883 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2884 0, NULL, NULL, NULL, NULL, NULL, 0); 2885 2886 /* For Reliable Connected Mode */ 2887 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2888 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2889 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2890 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2891 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2892 MUTEX_DRIVER, NULL); 2893 2894 /* 2895 * Make the default link mode as RC. If this fails during connection 2896 * setup, the link mode is automatically transitioned to UD. 2897 * Also set the RC MTU. 
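 *
 * For illustration only (exact property syntax may vary by release), an
 * administrator can later override this default per datalink with
 * dladm(1M), for example:
 *
 *	# dladm set-linkprop -p linkmode=ud <partition-link>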
2898 */ 2899 state->id_enable_rc = IBD_DEF_LINK_MODE; 2900 state->rc_mtu = IBD_DEF_RC_MAX_MTU; 2901 state->id_mtu = IBD_DEF_MAX_MTU; 2902 2903 /* Iniatialize all tunables to default */ 2904 state->id_lso_policy = IBD_DEF_LSO_POLICY; 2905 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS; 2906 state->id_num_ah = IBD_DEF_NUM_AH; 2907 state->id_hash_size = IBD_DEF_HASH_SIZE; 2908 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP; 2909 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS; 2910 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT; 2911 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC; 2912 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT; 2913 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC; 2914 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT; 2915 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC; 2916 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT; 2917 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC; 2918 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH; 2919 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH; 2920 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH; 2921 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE; 2922 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE; 2923 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE; 2924 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE; 2925 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ; 2926 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ; 2927 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH; 2928 2929 return (DDI_SUCCESS); 2930 } 2931 2932 /* 2933 * Post ibt_detach() driver deconstruction 2934 */ 2935 static void 2936 ibd_state_fini(ibd_state_t *state) 2937 { 2938 kmem_cache_destroy(state->id_req_kmc); 2939 2940 mutex_destroy(&state->id_rx_list.dl_mutex); 2941 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2942 2943 mutex_destroy(&state->id_txpost_lock); 2944 mutex_destroy(&state->id_tx_list.dl_mutex); 2945 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2946 mutex_destroy(&state->id_lso_lock); 2947 2948 mutex_destroy(&state->id_sched_lock); 2949 mutex_destroy(&state->id_scq_poll_lock); 2950 mutex_destroy(&state->id_rcq_poll_lock); 2951 2952 cv_destroy(&state->id_trap_cv); 2953 mutex_destroy(&state->id_trap_lock); 2954 mutex_destroy(&state->id_link_mutex); 2955 2956 /* For Reliable Connected Mode */ 2957 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2958 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2959 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2960 mutex_destroy(&state->rc_tx_large_bufs_lock); 2961 mutex_destroy(&state->rc_rx_lock); 2962 } 2963 2964 /* 2965 * Fetch link speed from SA for snmp ifspeed reporting. 2966 */ 2967 static uint64_t 2968 ibd_get_portspeed(ibd_state_t *state) 2969 { 2970 int ret; 2971 ibt_path_info_t path; 2972 ibt_path_attr_t path_attr; 2973 uint8_t num_paths; 2974 uint64_t ifspeed; 2975 2976 /* 2977 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2978 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2979 * 2000000000. Start with that as default. 2980 */ 2981 ifspeed = 2000000000; 2982 2983 bzero(&path_attr, sizeof (path_attr)); 2984 2985 /* 2986 * Get the port speed from Loopback path information. 
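 *
 * Worked example against the rate table below: a 4X QDR link reports
 * IBT_SRATE_40, which maps to a multiplier of 16, so the returned
 * ifspeed is 16 * 2000000000 = 32 Gbps of data rate (40 Gbps signalling
 * minus the 8b/10b encoding overhead).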
2987 */ 2988 path_attr.pa_dgids = &state->id_sgid; 2989 path_attr.pa_num_dgids = 1; 2990 path_attr.pa_sgid = state->id_sgid; 2991 2992 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2993 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2994 goto earlydone; 2995 2996 if (num_paths < 1) 2997 goto earlydone; 2998 2999 /* 3000 * In case SA does not return an expected value, report the default 3001 * speed as 1X. 3002 */ 3003 ret = 1; 3004 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 3005 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 3006 ret = 1; 3007 break; 3008 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 3009 ret = 4; 3010 break; 3011 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 3012 ret = 12; 3013 break; 3014 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 3015 ret = 2; 3016 break; 3017 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 3018 ret = 8; 3019 break; 3020 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 3021 ret = 16; 3022 break; 3023 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 3024 ret = 24; 3025 break; 3026 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 3027 ret = 32; 3028 break; 3029 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 3030 ret = 48; 3031 break; 3032 } 3033 3034 ifspeed *= ret; 3035 3036 earlydone: 3037 return (ifspeed); 3038 } 3039 3040 /* 3041 * Search input mcg list (id_mc_full or id_mc_non) for an entry 3042 * representing the input mcg mgid. 3043 */ 3044 static ibd_mce_t * 3045 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 3046 { 3047 ibd_mce_t *ptr = list_head(mlist); 3048 3049 /* 3050 * Do plain linear search. 3051 */ 3052 while (ptr != NULL) { 3053 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 3054 sizeof (ib_gid_t)) == 0) 3055 return (ptr); 3056 ptr = list_next(mlist, ptr); 3057 } 3058 return (NULL); 3059 } 3060 3061 /* 3062 * Execute IBA JOIN. 3063 */ 3064 static ibt_status_t 3065 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 3066 { 3067 ibt_mcg_attr_t mcg_attr; 3068 3069 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3070 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 3071 mcg_attr.mc_mgid = mgid; 3072 mcg_attr.mc_join_state = mce->mc_jstate; 3073 mcg_attr.mc_scope = state->id_scope; 3074 mcg_attr.mc_pkey = state->id_pkey; 3075 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 3076 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 3077 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 3078 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 3079 NULL, NULL)); 3080 } 3081 3082 /* 3083 * This code JOINs the port in the proper way (depending on the join 3084 * state) so that IBA fabric will forward mcg packets to/from the port. 3085 * It also attaches the QPN to the mcg so it can receive those mcg 3086 * packets. This code makes sure not to attach the mcg to the QP if 3087 * that has been previously done due to the mcg being joined with a 3088 * different join state, even though this is not required by SWG_0216, 3089 * refid 3610. 3090 */ 3091 static ibd_mce_t * 3092 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3093 { 3094 ibt_status_t ibt_status; 3095 ibd_mce_t *mce, *tmce, *omce = NULL; 3096 boolean_t do_attach = B_TRUE; 3097 3098 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 3099 jstate, mgid.gid_prefix, mgid.gid_guid); 3100 3101 /* 3102 * For enable_multicast Full member joins, we need to do some 3103 * extra work. 
If there is already an mce on the list that 3104 * indicates full membership, that means the membership has 3105 * not yet been dropped (since the disable_multicast was issued) 3106 * because there are pending Tx's to the mcg; in that case, just 3107 * mark the mce not to be reaped when the Tx completion queues 3108 * an async reap operation. 3109 * 3110 * If there is already an mce on the list indicating sendonly 3111 * membership, try to promote to full membership. Be careful 3112 * not to deallocate the old mce, since there might be an AH 3113 * pointing to it; instead, update the old mce with new data 3114 * that tracks the full membership. 3115 */ 3116 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 3117 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 3118 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 3119 ASSERT(omce->mc_fullreap); 3120 omce->mc_fullreap = B_FALSE; 3121 return (omce); 3122 } else { 3123 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 3124 } 3125 } 3126 3127 /* 3128 * Allocate the ibd_mce_t to track this JOIN. 3129 */ 3130 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 3131 mce->mc_fullreap = B_FALSE; 3132 mce->mc_jstate = jstate; 3133 3134 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 3135 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 3136 ibt_status); 3137 kmem_free(mce, sizeof (ibd_mce_t)); 3138 return (NULL); 3139 } 3140 3141 /* 3142 * Is an IBA attach required? Not if the interface is already joined 3143 * to the mcg in a different appropriate join state. 3144 */ 3145 if (jstate == IB_MC_JSTATE_NON) { 3146 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3147 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3148 do_attach = B_FALSE; 3149 } else if (jstate == IB_MC_JSTATE_FULL) { 3150 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3151 do_attach = B_FALSE; 3152 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3153 do_attach = B_FALSE; 3154 } 3155 3156 if (do_attach) { 3157 /* 3158 * Do the IBA attach. 3159 */ 3160 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 3161 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 3162 &mce->mc_info)) != IBT_SUCCESS) { 3163 DPRINT(10, "ibd_join_group : failed qp attachment " 3164 "%d\n", ibt_status); 3165 /* 3166 * NOTE that we should probably preserve the join info 3167 * in the list and later try to leave again at detach 3168 * time. 3169 */ 3170 (void) ibt_leave_mcg(state->id_sgid, mgid, 3171 state->id_sgid, jstate); 3172 kmem_free(mce, sizeof (ibd_mce_t)); 3173 return (NULL); 3174 } 3175 } 3176 3177 /* 3178 * Insert the ibd_mce_t in the proper list. 3179 */ 3180 if (jstate == IB_MC_JSTATE_NON) { 3181 IBD_MCACHE_INSERT_NON(state, mce); 3182 } else { 3183 /* 3184 * Set up the mc_req fields used for reaping the 3185 * mcg in case of delayed tx completion (see 3186 * ibd_tx_cleanup()). Also done for sendonly join in 3187 * case we are promoted to fullmembership later and 3188 * keep using the same mce. 3189 */ 3190 mce->mc_req.rq_gid = mgid; 3191 mce->mc_req.rq_ptr = mce; 3192 /* 3193 * Check whether this is the case of trying to join 3194 * full member, and we were already joined send only. 3195 * We try to drop our SendOnly membership, but it is 3196 * possible that the mcg does not exist anymore (and 3197 * the subnet trap never reached us), so the leave 3198 * operation might fail. 
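 *
 * That is also why the ibt_leave_mcg() call below deliberately discards
 * its return value (the (void) cast): a failed leave of a vanished mcg
 * is harmless here, and the promotion of the old mce to full membership
 * proceeds either way.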
3199 */ 3200 if (omce != NULL) { 3201 (void) ibt_leave_mcg(state->id_sgid, mgid, 3202 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 3203 omce->mc_jstate = IB_MC_JSTATE_FULL; 3204 bcopy(&mce->mc_info, &omce->mc_info, 3205 sizeof (ibt_mcg_info_t)); 3206 kmem_free(mce, sizeof (ibd_mce_t)); 3207 return (omce); 3208 } 3209 mutex_enter(&state->id_mc_mutex); 3210 IBD_MCACHE_INSERT_FULL(state, mce); 3211 mutex_exit(&state->id_mc_mutex); 3212 } 3213 3214 return (mce); 3215 } 3216 3217 /* 3218 * Called during port up event handling to attempt to reacquire full 3219 * membership to an mcg. Stripped down version of ibd_join_group(). 3220 * Note that it is possible that the mcg might have gone away, and 3221 * gets recreated at this point. 3222 */ 3223 static void 3224 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 3225 { 3226 ib_gid_t mgid; 3227 3228 /* 3229 * If the mc_fullreap flag is set, or this join fails, a subsequent 3230 * reap/leave is going to try to leave the group. We could prevent 3231 * that by adding a boolean flag into ibd_mce_t, if required. 3232 */ 3233 if (mce->mc_fullreap) 3234 return; 3235 3236 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3237 3238 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 3239 mgid.gid_guid); 3240 3241 /* While reacquiring, leave and then join the MCG */ 3242 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, 3243 mce->mc_jstate); 3244 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 3245 ibd_print_warn(state, "Failure on port up to rejoin " 3246 "multicast gid %016llx:%016llx", 3247 (u_longlong_t)mgid.gid_prefix, 3248 (u_longlong_t)mgid.gid_guid); 3249 } 3250 3251 /* 3252 * This code handles delayed Tx completion cleanups for mcg's to which 3253 * disable_multicast has been issued, regular mcg related cleanups during 3254 * disable_multicast, disable_promiscuous and mcg traps, as well as 3255 * cleanups during driver detach time. Depending on the join state, 3256 * it deletes the mce from the appropriate list and issues the IBA 3257 * leave/detach; except in the disable_multicast case when the mce 3258 * is left on the active list for a subsequent Tx completion cleanup. 3259 */ 3260 static void 3261 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3262 uint8_t jstate) 3263 { 3264 ibd_mce_t *tmce; 3265 boolean_t do_detach = B_TRUE; 3266 3267 /* 3268 * Before detaching, we must check whether the other list 3269 * contains the mcg; if we detach blindly, the consumer 3270 * who set up the other list will also stop receiving 3271 * traffic. 3272 */ 3273 if (jstate == IB_MC_JSTATE_FULL) { 3274 /* 3275 * The following check is only relevant while coming 3276 * from the Tx completion path in the reap case. 
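 *
 * As for the cross-list check mentioned above: when leaving via the
 * full list we look for the same mgid on the non list (and vice versa)
 * and skip the ibt_detach_mcg() if a match is found, so the surviving
 * membership keeps its QP attachment.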
3277 */ 3278 if (!mce->mc_fullreap) 3279 return; 3280 mutex_enter(&state->id_mc_mutex); 3281 IBD_MCACHE_PULLOUT_FULL(state, mce); 3282 mutex_exit(&state->id_mc_mutex); 3283 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3284 do_detach = B_FALSE; 3285 } else if (jstate == IB_MC_JSTATE_NON) { 3286 IBD_MCACHE_PULLOUT_NON(state, mce); 3287 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3288 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3289 do_detach = B_FALSE; 3290 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3291 mutex_enter(&state->id_mc_mutex); 3292 IBD_MCACHE_PULLOUT_FULL(state, mce); 3293 mutex_exit(&state->id_mc_mutex); 3294 do_detach = B_FALSE; 3295 } 3296 3297 /* 3298 * If we are reacting to a mcg trap and leaving our sendonly or 3299 * non membership, the mcg is possibly already gone, so attempting 3300 * to leave might fail. On the other hand, we must try to leave 3301 * anyway, since this might be a trap from long ago, and we could 3302 * have potentially sendonly joined to a recent incarnation of 3303 * the mcg and are about to loose track of this information. 3304 */ 3305 if (do_detach) { 3306 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3307 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3308 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3309 } 3310 3311 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3312 kmem_free(mce, sizeof (ibd_mce_t)); 3313 } 3314 3315 /* 3316 * Async code executed due to multicast and promiscuous disable requests 3317 * and mcg trap handling; also executed during driver detach. Mostly, a 3318 * leave and detach is done; except for the fullmember case when Tx 3319 * requests are pending, whence arrangements are made for subsequent 3320 * cleanup on Tx completion. 3321 */ 3322 static void 3323 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3324 { 3325 ipoib_mac_t mcmac; 3326 boolean_t recycled; 3327 ibd_mce_t *mce; 3328 3329 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3330 jstate, mgid.gid_prefix, mgid.gid_guid); 3331 3332 if (jstate == IB_MC_JSTATE_NON) { 3333 recycled = B_TRUE; 3334 mce = IBD_MCACHE_FIND_NON(state, mgid); 3335 /* 3336 * In case we are handling a mcg trap, we might not find 3337 * the mcg in the non list. 3338 */ 3339 if (mce == NULL) { 3340 return; 3341 } 3342 } else { 3343 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3344 3345 /* 3346 * In case we are handling a mcg trap, make sure the trap 3347 * is not arriving late; if we have an mce that indicates 3348 * that we are already a fullmember, that would be a clear 3349 * indication that the trap arrived late (ie, is for a 3350 * previous incarnation of the mcg). 3351 */ 3352 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3353 if ((mce == NULL) || (mce->mc_jstate == 3354 IB_MC_JSTATE_FULL)) { 3355 return; 3356 } 3357 } else { 3358 ASSERT(jstate == IB_MC_JSTATE_FULL); 3359 3360 /* 3361 * If join group failed, mce will be NULL here. 3362 * This is because in GLDv3 driver, set multicast 3363 * will always return success. 3364 */ 3365 if (mce == NULL) { 3366 return; 3367 } 3368 3369 mce->mc_fullreap = B_TRUE; 3370 } 3371 3372 /* 3373 * If no pending Tx's remain that reference the AH 3374 * for the mcg, recycle it from active to free list. 
3375 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3376 * so the last completing Tx will cause an async reap 3377 * operation to be invoked, at which time we will drop our 3378 * membership to the mcg so that the pending Tx's complete 3379 * successfully. Refer to comments on "AH and MCE active 3380 * list manipulation" at top of this file. The lock protects 3381 * against Tx fast path and Tx cleanup code. 3382 */ 3383 mutex_enter(&state->id_ac_mutex); 3384 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3385 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3386 IB_MC_JSTATE_SEND_ONLY_NON)); 3387 mutex_exit(&state->id_ac_mutex); 3388 } 3389 3390 if (recycled) { 3391 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3392 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3393 ibd_async_reap_group(state, mce, mgid, jstate); 3394 } 3395 } 3396 3397 /* 3398 * Find the broadcast address as defined by IPoIB; implicitly 3399 * determines the IBA scope, mtu, tclass etc of the link the 3400 * interface is going to be a member of. 3401 */ 3402 static ibt_status_t 3403 ibd_find_bgroup(ibd_state_t *state) 3404 { 3405 ibt_mcg_attr_t mcg_attr; 3406 uint_t numg; 3407 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3408 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3409 IB_MC_SCOPE_GLOBAL }; 3410 int i, mcgmtu; 3411 boolean_t found = B_FALSE; 3412 int ret; 3413 ibt_mcg_info_t mcg_info; 3414 3415 state->id_bgroup_created = B_FALSE; 3416 state->id_bgroup_present = B_FALSE; 3417 3418 query_bcast_grp: 3419 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3420 mcg_attr.mc_pkey = state->id_pkey; 3421 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3422 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3423 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3424 3425 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3426 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3427 3428 /* 3429 * Look for the IPoIB broadcast group. 3430 */ 3431 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3432 state->id_mgid.gid_prefix = 3433 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3434 ((uint64_t)state->id_scope << 48) | 3435 ((uint32_t)(state->id_pkey << 16))); 3436 mcg_attr.mc_mgid = state->id_mgid; 3437 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3438 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3439 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3440 found = B_TRUE; 3441 break; 3442 } 3443 } 3444 3445 if (!found) { 3446 if (state->id_create_broadcast_group) { 3447 /* 3448 * If we created the broadcast group, but failed to 3449 * find it, we can't do anything except leave the 3450 * one we created and return failure. 3451 */ 3452 if (state->id_bgroup_created) { 3453 ibd_print_warn(state, "IPoIB broadcast group " 3454 "absent. 
Unable to query after create."); 3455 goto find_bgroup_fail; 3456 } 3457 3458 /* 3459 * Create the ipoib broadcast group if it didn't exist 3460 */ 3461 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3462 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3463 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3464 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3465 mcg_attr.mc_pkey = state->id_pkey; 3466 mcg_attr.mc_flow = 0; 3467 mcg_attr.mc_sl = 0; 3468 mcg_attr.mc_tclass = 0; 3469 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3470 state->id_mgid.gid_prefix = 3471 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3472 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3473 ((uint32_t)(state->id_pkey << 16))); 3474 mcg_attr.mc_mgid = state->id_mgid; 3475 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3476 3477 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3478 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3479 ibd_print_warn(state, "IPoIB broadcast group " 3480 "absent, create failed: ret = %d\n", ret); 3481 state->id_bgroup_created = B_FALSE; 3482 return (IBT_FAILURE); 3483 } 3484 state->id_bgroup_created = B_TRUE; 3485 goto query_bcast_grp; 3486 } else { 3487 ibd_print_warn(state, "IPoIB broadcast group absent"); 3488 return (IBT_FAILURE); 3489 } 3490 } 3491 3492 /* 3493 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3494 */ 3495 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3496 if (state->id_mtu < mcgmtu) { 3497 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3498 "greater than port's maximum MTU %d", mcgmtu, 3499 state->id_mtu); 3500 ibt_free_mcg_info(state->id_mcinfo, 1); 3501 goto find_bgroup_fail; 3502 } 3503 state->id_mtu = mcgmtu; 3504 state->id_bgroup_present = B_TRUE; 3505 3506 return (IBT_SUCCESS); 3507 3508 find_bgroup_fail: 3509 if (state->id_bgroup_created) { 3510 (void) ibt_leave_mcg(state->id_sgid, 3511 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3512 IB_MC_JSTATE_FULL); 3513 } 3514 3515 return (IBT_FAILURE); 3516 } 3517 3518 static int 3519 ibd_alloc_tx_copybufs(ibd_state_t *state) 3520 { 3521 ibt_mr_attr_t mem_attr; 3522 3523 /* 3524 * Allocate one big chunk for all regular tx copy bufs 3525 */ 3526 state->id_tx_buf_sz = state->id_mtu; 3527 if (state->id_lso_policy && state->id_lso_capable && 3528 (state->id_ud_tx_copy_thresh > state->id_mtu)) { 3529 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh; 3530 } 3531 3532 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe * 3533 state->id_tx_buf_sz, KM_SLEEP); 3534 3535 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe * 3536 sizeof (ibd_swqe_t), KM_SLEEP); 3537 3538 /* 3539 * Do one memory registration on the entire txbuf area 3540 */ 3541 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3542 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz; 3543 mem_attr.mr_as = NULL; 3544 mem_attr.mr_flags = IBT_MR_SLEEP; 3545 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3546 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3547 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3548 kmem_free(state->id_tx_wqes, 3549 state->id_ud_num_swqe * sizeof (ibd_swqe_t)); 3550 kmem_free(state->id_tx_bufs, 3551 state->id_ud_num_swqe * state->id_tx_buf_sz); 3552 state->id_tx_bufs = NULL; 3553 return (DDI_FAILURE); 3554 } 3555 3556 return (DDI_SUCCESS); 3557 } 3558 3559 static int 3560 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3561 { 3562 ibt_mr_attr_t mem_attr; 3563 ibd_lsobuf_t *buflist; 3564 ibd_lsobuf_t *lbufp; 3565 ibd_lsobuf_t *tail; 3566 ibd_lsobkt_t *bktp; 3567 uint8_t 
*membase; 3568 uint8_t *memp; 3569 uint_t memsz; 3570 int i; 3571 3572 /* 3573 * Allocate the lso bucket 3574 */ 3575 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3576 3577 /* 3578 * Allocate the entire lso memory and register it 3579 */ 3580 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ; 3581 membase = kmem_zalloc(memsz, KM_SLEEP); 3582 3583 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3584 mem_attr.mr_len = memsz; 3585 mem_attr.mr_as = NULL; 3586 mem_attr.mr_flags = IBT_MR_SLEEP; 3587 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3588 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3589 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3590 kmem_free(membase, memsz); 3591 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3592 return (DDI_FAILURE); 3593 } 3594 3595 mutex_enter(&state->id_lso_lock); 3596 3597 /* 3598 * Now allocate the buflist. Note that the elements in the buflist and 3599 * the buffers in the lso memory have a permanent 1-1 relation, so we 3600 * can always derive the address of a buflist entry from the address of 3601 * an lso buffer. 3602 */ 3603 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t), 3604 KM_SLEEP); 3605 3606 /* 3607 * Set up the lso buf chain 3608 */ 3609 memp = membase; 3610 lbufp = buflist; 3611 for (i = 0; i < state->id_num_lso_bufs; i++) { 3612 lbufp->lb_isfree = 1; 3613 lbufp->lb_buf = memp; 3614 lbufp->lb_next = lbufp + 1; 3615 3616 tail = lbufp; 3617 3618 memp += IBD_LSO_BUFSZ; 3619 lbufp++; 3620 } 3621 tail->lb_next = NULL; 3622 3623 /* 3624 * Set up the LSO buffer information in ibd state 3625 */ 3626 bktp->bkt_bufl = buflist; 3627 bktp->bkt_free_head = buflist; 3628 bktp->bkt_mem = membase; 3629 bktp->bkt_nelem = state->id_num_lso_bufs; 3630 bktp->bkt_nfree = bktp->bkt_nelem; 3631 3632 state->id_lso = bktp; 3633 mutex_exit(&state->id_lso_lock); 3634 3635 return (DDI_SUCCESS); 3636 } 3637 3638 /* 3639 * Statically allocate Tx buffer list(s). 
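 *
 * A rough sketch of what the function below does (illustrative only,
 * names are hypothetical): the single registered copy-buffer region is
 * carved into one id_tx_buf_sz slice per swqe, and the swqes are
 * threaded onto id_tx_list as a LIFO free list, roughly:
 *
 *	for (i = 0; i < nswqe; i++) {
 *		buf = tx_bufs + (size_t)i * buf_sz;
 *		swqe[i].sgl_va = (ib_vaddr_t)(uintptr_t)buf;
 *		swqe[i].next = head;
 *		head = &swqe[i];
 *	}
 *
 * The driver's actual bookkeeping uses swqe_copybuf.ic_sgl and
 * SWQE_TO_WQE()/dl_head for the same purpose.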
3640 */ 3641 static int 3642 ibd_init_txlist(ibd_state_t *state) 3643 { 3644 ibd_swqe_t *swqe; 3645 ibt_lkey_t lkey; 3646 int i; 3647 uint_t len; 3648 uint8_t *bufaddr; 3649 3650 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3651 return (DDI_FAILURE); 3652 3653 if (state->id_lso_policy && state->id_lso_capable) { 3654 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3655 state->id_lso_capable = B_FALSE; 3656 } 3657 3658 mutex_enter(&state->id_tx_list.dl_mutex); 3659 state->id_tx_list.dl_head = NULL; 3660 state->id_tx_list.dl_pending_sends = B_FALSE; 3661 state->id_tx_list.dl_cnt = 0; 3662 mutex_exit(&state->id_tx_list.dl_mutex); 3663 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3664 state->id_tx_rel_list.dl_head = NULL; 3665 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3666 state->id_tx_rel_list.dl_cnt = 0; 3667 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3668 3669 /* 3670 * Allocate and setup the swqe list 3671 */ 3672 lkey = state->id_tx_mr_desc.md_lkey; 3673 bufaddr = state->id_tx_bufs; 3674 len = state->id_tx_buf_sz; 3675 swqe = state->id_tx_wqes; 3676 mutex_enter(&state->id_tx_list.dl_mutex); 3677 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) { 3678 swqe->swqe_next = NULL; 3679 swqe->swqe_im_mblk = NULL; 3680 3681 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3682 bufaddr; 3683 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3684 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3685 3686 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3687 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3688 swqe->w_swr.wr_trans = IBT_UD_SRV; 3689 3690 /* These are set in send */ 3691 swqe->w_swr.wr_nds = 0; 3692 swqe->w_swr.wr_sgl = NULL; 3693 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3694 3695 /* add to list */ 3696 state->id_tx_list.dl_cnt++; 3697 swqe->swqe_next = state->id_tx_list.dl_head; 3698 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3699 } 3700 mutex_exit(&state->id_tx_list.dl_mutex); 3701 3702 return (DDI_SUCCESS); 3703 } 3704 3705 static int 3706 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3707 uint32_t *nds_p) 3708 { 3709 ibd_lsobkt_t *bktp; 3710 ibd_lsobuf_t *lbufp; 3711 ibd_lsobuf_t *nextp; 3712 ibt_lkey_t lso_lkey; 3713 uint_t frag_sz; 3714 uint_t num_needed; 3715 int i; 3716 3717 ASSERT(sgl_p != NULL); 3718 ASSERT(nds_p != NULL); 3719 ASSERT(req_sz != 0); 3720 3721 /* 3722 * Determine how many bufs we'd need for the size requested 3723 */ 3724 num_needed = req_sz / IBD_LSO_BUFSZ; 3725 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3726 num_needed++; 3727 3728 mutex_enter(&state->id_lso_lock); 3729 3730 /* 3731 * If we don't have enough lso bufs, return failure 3732 */ 3733 ASSERT(state->id_lso != NULL); 3734 bktp = state->id_lso; 3735 if (bktp->bkt_nfree < num_needed) { 3736 mutex_exit(&state->id_lso_lock); 3737 return (-1); 3738 } 3739 3740 /* 3741 * Pick the first 'num_needed' bufs from the free list 3742 */ 3743 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3744 lbufp = bktp->bkt_free_head; 3745 for (i = 0; i < num_needed; i++) { 3746 ASSERT(lbufp->lb_isfree != 0); 3747 ASSERT(lbufp->lb_buf != NULL); 3748 3749 nextp = lbufp->lb_next; 3750 3751 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3752 sgl_p[i].ds_key = lso_lkey; 3753 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3754 3755 lbufp->lb_isfree = 0; 3756 lbufp->lb_next = NULL; 3757 3758 lbufp = nextp; 3759 } 3760 bktp->bkt_free_head = lbufp; 3761 3762 /* 3763 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3764 * to adjust the last sgl entry's length. 
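 * For illustration, assuming an LSO buffer size of 8192 bytes, a
 * request of req_sz = 20000 needs num_needed = 3 buffers
 * (20000 / 8192 = 2 with remainder frag_sz = 3616), so the third
 * sgl entry is trimmed from 8192 down to 3616 bytes below.
 *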
Since we know we need atleast 3765 * one, the i-1 use below is ok. 3766 */ 3767 if (frag_sz) { 3768 sgl_p[i-1].ds_len = frag_sz; 3769 } 3770 3771 /* 3772 * Update nfree count and return 3773 */ 3774 bktp->bkt_nfree -= num_needed; 3775 3776 mutex_exit(&state->id_lso_lock); 3777 3778 *nds_p = num_needed; 3779 3780 return (0); 3781 } 3782 3783 static void 3784 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3785 { 3786 ibd_lsobkt_t *bktp; 3787 ibd_lsobuf_t *lbufp; 3788 uint8_t *lso_mem_end; 3789 uint_t ndx; 3790 int i; 3791 3792 mutex_enter(&state->id_lso_lock); 3793 3794 bktp = state->id_lso; 3795 ASSERT(bktp != NULL); 3796 3797 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3798 for (i = 0; i < nds; i++) { 3799 uint8_t *va; 3800 3801 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3802 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3803 3804 /* 3805 * Figure out the buflist element this sgl buffer corresponds 3806 * to and put it back at the head 3807 */ 3808 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3809 lbufp = bktp->bkt_bufl + ndx; 3810 3811 ASSERT(lbufp->lb_isfree == 0); 3812 ASSERT(lbufp->lb_buf == va); 3813 3814 lbufp->lb_isfree = 1; 3815 lbufp->lb_next = bktp->bkt_free_head; 3816 bktp->bkt_free_head = lbufp; 3817 } 3818 bktp->bkt_nfree += nds; 3819 3820 mutex_exit(&state->id_lso_lock); 3821 } 3822 3823 static void 3824 ibd_free_tx_copybufs(ibd_state_t *state) 3825 { 3826 /* 3827 * Unregister txbuf mr 3828 */ 3829 if (ibt_deregister_mr(state->id_hca_hdl, 3830 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3831 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3832 } 3833 state->id_tx_mr_hdl = NULL; 3834 3835 /* 3836 * Free txbuf memory 3837 */ 3838 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * 3839 sizeof (ibd_swqe_t)); 3840 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * 3841 state->id_tx_buf_sz); 3842 state->id_tx_wqes = NULL; 3843 state->id_tx_bufs = NULL; 3844 } 3845 3846 static void 3847 ibd_free_tx_lsobufs(ibd_state_t *state) 3848 { 3849 ibd_lsobkt_t *bktp; 3850 3851 mutex_enter(&state->id_lso_lock); 3852 3853 if ((bktp = state->id_lso) == NULL) { 3854 mutex_exit(&state->id_lso_lock); 3855 return; 3856 } 3857 3858 /* 3859 * First, free the buflist 3860 */ 3861 ASSERT(bktp->bkt_bufl != NULL); 3862 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3863 3864 /* 3865 * Unregister the LSO memory and free it 3866 */ 3867 ASSERT(bktp->bkt_mr_hdl != NULL); 3868 if (ibt_deregister_mr(state->id_hca_hdl, 3869 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3870 DPRINT(10, 3871 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3872 } 3873 ASSERT(bktp->bkt_mem); 3874 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3875 3876 /* 3877 * Finally free the bucket 3878 */ 3879 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3880 state->id_lso = NULL; 3881 3882 mutex_exit(&state->id_lso_lock); 3883 } 3884 3885 /* 3886 * Free the statically allocated Tx buffer list. 
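 *
 * This is the inverse of ibd_init_txlist(): both Tx lists are emptied
 * while holding their mutexes (acquired in the same id_tx_list ->
 * id_tx_rel_list order used elsewhere in the driver), after which the
 * LSO bucket and the single registered copy-buffer region are released
 * via ibd_free_tx_lsobufs() and ibd_free_tx_copybufs().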
3887 */ 3888 static void 3889 ibd_fini_txlist(ibd_state_t *state) 3890 { 3891 /* 3892 * Free the allocated swqes 3893 */ 3894 mutex_enter(&state->id_tx_list.dl_mutex); 3895 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3896 state->id_tx_list.dl_head = NULL; 3897 state->id_tx_list.dl_pending_sends = B_FALSE; 3898 state->id_tx_list.dl_cnt = 0; 3899 state->id_tx_rel_list.dl_head = NULL; 3900 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3901 state->id_tx_rel_list.dl_cnt = 0; 3902 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3903 mutex_exit(&state->id_tx_list.dl_mutex); 3904 3905 ibd_free_tx_lsobufs(state); 3906 ibd_free_tx_copybufs(state); 3907 } 3908 3909 /* 3910 * post a list of rwqes, NULL terminated. 3911 */ 3912 static void 3913 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3914 { 3915 uint_t i; 3916 uint_t num_posted; 3917 ibt_status_t ibt_status; 3918 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3919 3920 while (rwqe) { 3921 /* Post up to IBD_RX_POST_CNT receive work requests */ 3922 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3923 wrs[i] = rwqe->w_rwr; 3924 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3925 if (rwqe == NULL) { 3926 i++; 3927 break; 3928 } 3929 } 3930 3931 /* 3932 * If posting fails for some reason, we'll never receive 3933 * completion intimation, so we'll need to cleanup. But 3934 * we need to make sure we don't clean up nodes whose 3935 * wrs have been successfully posted. We assume that the 3936 * hca driver returns on the first failure to post and 3937 * therefore the first 'num_posted' entries don't need 3938 * cleanup here. 3939 */ 3940 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3941 3942 num_posted = 0; 3943 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3944 &num_posted); 3945 if (ibt_status != IBT_SUCCESS) { 3946 /* This cannot happen unless the device has an error. */ 3947 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3948 "posting multiple wrs failed: " 3949 "requested=%d, done=%d, ret=%d", 3950 IBD_RX_POST_CNT, num_posted, ibt_status); 3951 atomic_add_32(&state->id_rx_list.dl_cnt, 3952 num_posted - i); 3953 } 3954 } 3955 } 3956 3957 /* 3958 * Grab a list of rwqes from the array of lists, and post the list. 3959 */ 3960 static void 3961 ibd_post_recv_intr(ibd_state_t *state) 3962 { 3963 ibd_rx_queue_t *rxp; 3964 ibd_rwqe_t *list; 3965 3966 /* rotate through the rx_queue array, expecting an adequate number */ 3967 state->id_rx_post_queue_index = 3968 (state->id_rx_post_queue_index + 1) & 3969 (state->id_rx_nqueues - 1); 3970 3971 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3972 mutex_enter(&rxp->rx_post_lock); 3973 list = WQE_TO_RWQE(rxp->rx_head); 3974 rxp->rx_head = NULL; 3975 rxp->rx_cnt = 0; 3976 mutex_exit(&rxp->rx_post_lock); 3977 ibd_post_recv_list(state, list); 3978 } 3979 3980 /* macro explained below */ 3981 #define RX_QUEUE_HASH(rwqe) \ 3982 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3983 3984 /* 3985 * Add a rwqe to one of the the Rx lists. If the list is large enough 3986 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3987 * 3988 * Note: one of 2^N lists is chosen via a hash. This is done 3989 * because using one list is contentious. If the first list is busy 3990 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3991 * 3992 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3993 * even distribution of mapping rwqes to the 2^N queues. 
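 *
 * A minimal model of the hash (illustrative only, name is hypothetical;
 * the queue count must be a power of two for the mask to work, which
 * id_rx_nqueues is, being 1 << IBD_LOG_RX_POST):
 *
 *	static uint_t
 *	rx_queue_hash(const void *rwqe, uint_t nqueues)
 *	{
 *		return (((uintptr_t)rwqe >> 8) & (nqueues - 1));
 *	}
 *
 * With nqueues = 8, rwqe addresses 0x1000, 0x1100 and 0x1200 hash to
 * queues 0, 1 and 2; the "rwqe + 16" retry in ibd_post_recv() below
 * simply offsets the address so that a different queue is chosen when
 * the first one is busy.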
3994 */ 3995 static void 3996 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3997 { 3998 ibd_rx_queue_t *rxp; 3999 4000 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 4001 4002 if (!mutex_tryenter(&rxp->rx_post_lock)) { 4003 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 4004 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 4005 mutex_enter(&rxp->rx_post_lock); 4006 } 4007 rwqe->rwqe_next = rxp->rx_head; 4008 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 4009 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 4010 4011 /* only call ibt_post_recv() every Nth time through here */ 4012 if ((active & (state->id_rx_nqueues - 1)) == 0) { 4013 rxp->rx_head = NULL; 4014 rxp->rx_cnt = 0; 4015 mutex_exit(&rxp->rx_post_lock); 4016 ibd_post_recv_list(state, rwqe); 4017 return; 4018 } 4019 } 4020 rxp->rx_head = RWQE_TO_WQE(rwqe); 4021 mutex_exit(&rxp->rx_post_lock); 4022 } 4023 4024 static int 4025 ibd_alloc_rx_copybufs(ibd_state_t *state) 4026 { 4027 ibt_mr_attr_t mem_attr; 4028 int i; 4029 4030 /* 4031 * Allocate one big chunk for all regular rx copy bufs 4032 */ 4033 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 4034 4035 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe * 4036 state->id_rx_buf_sz, KM_SLEEP); 4037 4038 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe * 4039 sizeof (ibd_rwqe_t), KM_SLEEP); 4040 4041 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 4042 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 4043 sizeof (ibd_rx_queue_t), KM_SLEEP); 4044 for (i = 0; i < state->id_rx_nqueues; i++) { 4045 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4046 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 4047 } 4048 4049 /* 4050 * Do one memory registration on the entire rxbuf area 4051 */ 4052 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 4053 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz; 4054 mem_attr.mr_as = NULL; 4055 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 4056 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 4057 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 4058 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 4059 kmem_free(state->id_rx_wqes, 4060 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); 4061 kmem_free(state->id_rx_bufs, 4062 state->id_ud_num_rwqe * state->id_rx_buf_sz); 4063 state->id_rx_bufs = NULL; 4064 state->id_rx_wqes = NULL; 4065 return (DDI_FAILURE); 4066 } 4067 4068 return (DDI_SUCCESS); 4069 } 4070 4071 /* 4072 * Allocate the statically allocated Rx buffer list. 4073 */ 4074 static int 4075 ibd_init_rxlist(ibd_state_t *state) 4076 { 4077 ibd_rwqe_t *rwqe, *next; 4078 ibd_wqe_t *list; 4079 ibt_lkey_t lkey; 4080 int i; 4081 uint_t len; 4082 uint8_t *bufaddr; 4083 4084 mutex_enter(&state->id_rx_free_list.dl_mutex); 4085 if (state->id_rx_free_list.dl_head != NULL) { 4086 /* rx rsrcs were never freed. 
Just repost them */ 4087 len = state->id_rx_buf_sz; 4088 list = state->id_rx_free_list.dl_head; 4089 state->id_rx_free_list.dl_head = NULL; 4090 state->id_rx_free_list.dl_cnt = 0; 4091 mutex_exit(&state->id_rx_free_list.dl_mutex); 4092 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4093 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4094 if ((rwqe->rwqe_im_mblk = desballoc( 4095 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 4096 &rwqe->w_freemsg_cb)) == NULL) { 4097 /* allow freemsg_cb to free the rwqes */ 4098 if (atomic_dec_32_nv(&state->id_running) != 0) { 4099 cmn_err(CE_WARN, "ibd_init_rxlist: " 4100 "id_running was not 1\n"); 4101 } 4102 DPRINT(10, "ibd_init_rxlist : " 4103 "failed in desballoc()"); 4104 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4105 rwqe = next) { 4106 next = WQE_TO_RWQE(rwqe->rwqe_next); 4107 if (rwqe->rwqe_im_mblk) { 4108 atomic_inc_32(&state-> 4109 id_rx_list. 4110 dl_bufs_outstanding); 4111 freemsg(rwqe->rwqe_im_mblk); 4112 } else 4113 ibd_free_rwqe(state, rwqe); 4114 } 4115 atomic_inc_32(&state->id_running); 4116 return (DDI_FAILURE); 4117 } 4118 } 4119 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4120 return (DDI_SUCCESS); 4121 } 4122 mutex_exit(&state->id_rx_free_list.dl_mutex); 4123 4124 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 4125 return (DDI_FAILURE); 4126 4127 /* 4128 * Allocate and setup the rwqe list 4129 */ 4130 len = state->id_rx_buf_sz; 4131 lkey = state->id_rx_mr_desc.md_lkey; 4132 rwqe = state->id_rx_wqes; 4133 bufaddr = state->id_rx_bufs; 4134 list = NULL; 4135 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) { 4136 rwqe->w_state = state; 4137 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 4138 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 4139 4140 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 4141 4142 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 4143 &rwqe->w_freemsg_cb)) == NULL) { 4144 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 4145 /* allow freemsg_cb to free the rwqes */ 4146 if (atomic_dec_32_nv(&state->id_running) != 0) { 4147 cmn_err(CE_WARN, "ibd_init_rxlist: " 4148 "id_running was not 1\n"); 4149 } 4150 DPRINT(10, "ibd_init_rxlist : " 4151 "failed in desballoc()"); 4152 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 4153 rwqe = next) { 4154 next = WQE_TO_RWQE(rwqe->rwqe_next); 4155 freemsg(rwqe->rwqe_im_mblk); 4156 } 4157 atomic_inc_32(&state->id_running); 4158 4159 /* remove reference to free'd rwqes */ 4160 mutex_enter(&state->id_rx_free_list.dl_mutex); 4161 state->id_rx_free_list.dl_head = NULL; 4162 state->id_rx_free_list.dl_cnt = 0; 4163 mutex_exit(&state->id_rx_free_list.dl_mutex); 4164 4165 ibd_fini_rxlist(state); 4166 return (DDI_FAILURE); 4167 } 4168 4169 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 4170 rwqe->rwqe_copybuf.ic_sgl.ds_va = 4171 (ib_vaddr_t)(uintptr_t)bufaddr; 4172 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 4173 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 4174 rwqe->w_rwr.wr_nds = 1; 4175 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 4176 4177 rwqe->rwqe_next = list; 4178 list = RWQE_TO_WQE(rwqe); 4179 } 4180 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 4181 4182 return (DDI_SUCCESS); 4183 } 4184 4185 static void 4186 ibd_free_rx_copybufs(ibd_state_t *state) 4187 { 4188 int i; 4189 4190 /* 4191 * Unregister rxbuf mr 4192 */ 4193 if (ibt_deregister_mr(state->id_hca_hdl, 4194 state->id_rx_mr_hdl) != IBT_SUCCESS) { 4195 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 4196 } 4197 state->id_rx_mr_hdl = NULL; 4198 4199 /* 4200 * Free rxbuf memory 4201 */ 4202 for (i = 0; i < 
state->id_rx_nqueues; i++) { 4203 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4204 mutex_destroy(&rxp->rx_post_lock); 4205 } 4206 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 4207 sizeof (ibd_rx_queue_t)); 4208 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * 4209 sizeof (ibd_rwqe_t)); 4210 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * 4211 state->id_rx_buf_sz); 4212 state->id_rx_queues = NULL; 4213 state->id_rx_wqes = NULL; 4214 state->id_rx_bufs = NULL; 4215 } 4216 4217 static void 4218 ibd_free_rx_rsrcs(ibd_state_t *state) 4219 { 4220 mutex_enter(&state->id_rx_free_list.dl_mutex); 4221 if (state->id_rx_free_list.dl_head == NULL) { 4222 /* already freed */ 4223 mutex_exit(&state->id_rx_free_list.dl_mutex); 4224 return; 4225 } 4226 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe); 4227 ibd_free_rx_copybufs(state); 4228 state->id_rx_free_list.dl_cnt = 0; 4229 state->id_rx_free_list.dl_head = NULL; 4230 mutex_exit(&state->id_rx_free_list.dl_mutex); 4231 } 4232 4233 /* 4234 * Free the statically allocated Rx buffer list. 4235 */ 4236 static void 4237 ibd_fini_rxlist(ibd_state_t *state) 4238 { 4239 ibd_rwqe_t *rwqe; 4240 int i; 4241 4242 /* run through the rx_queue's, calling freemsg() */ 4243 for (i = 0; i < state->id_rx_nqueues; i++) { 4244 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4245 mutex_enter(&rxp->rx_post_lock); 4246 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 4247 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4248 freemsg(rwqe->rwqe_im_mblk); 4249 rxp->rx_cnt--; 4250 } 4251 rxp->rx_head = NULL; 4252 mutex_exit(&rxp->rx_post_lock); 4253 } 4254 4255 /* cannot free rx resources unless gld returned everything */ 4256 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 4257 ibd_free_rx_rsrcs(state); 4258 } 4259 4260 /* 4261 * Free an allocated recv wqe. 4262 */ 4263 /* ARGSUSED */ 4264 static void 4265 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 4266 { 4267 /* 4268 * desballoc() failed (no memory). 4269 * 4270 * This rwqe is placed on a free list so that it 4271 * can be reinstated when memory is available. 4272 * 4273 * NOTE: no code currently exists to reinstate 4274 * these "lost" rwqes. 4275 */ 4276 mutex_enter(&state->id_rx_free_list.dl_mutex); 4277 state->id_rx_free_list.dl_cnt++; 4278 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4279 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4280 mutex_exit(&state->id_rx_free_list.dl_mutex); 4281 } 4282 4283 /* 4284 * IBA Rx completion queue handler. Guaranteed to be single 4285 * threaded and nonreentrant for this CQ. 4286 */ 4287 /* ARGSUSED */ 4288 static void 4289 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4290 { 4291 ibd_state_t *state = (ibd_state_t *)arg; 4292 4293 atomic_inc_64(&state->id_num_intrs); 4294 4295 if (ibd_rx_softintr == 1) { 4296 mutex_enter(&state->id_rcq_poll_lock); 4297 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4298 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4299 mutex_exit(&state->id_rcq_poll_lock); 4300 return; 4301 } else { 4302 mutex_exit(&state->id_rcq_poll_lock); 4303 ddi_trigger_softintr(state->id_rx); 4304 } 4305 } else 4306 (void) ibd_intr((caddr_t)state); 4307 } 4308 4309 /* 4310 * CQ handler for Tx completions, when the Tx CQ is in 4311 * interrupt driven mode. 
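 *
 * The structure mirrors ibd_rcq_handler() above: the interrupt is
 * counted in id_num_intrs, and if a poll is already in progress the
 * handler merely records IBD_REDO_CQ_POLLING so the poller makes
 * another pass; otherwise it triggers the Tx softint, or calls
 * ibd_tx_recycle() directly when ibd_tx_softintr is disabled.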
4312 */ 4313 /* ARGSUSED */ 4314 static void 4315 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4316 { 4317 ibd_state_t *state = (ibd_state_t *)arg; 4318 4319 atomic_inc_64(&state->id_num_intrs); 4320 4321 if (ibd_tx_softintr == 1) { 4322 mutex_enter(&state->id_scq_poll_lock); 4323 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4324 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4325 mutex_exit(&state->id_scq_poll_lock); 4326 return; 4327 } else { 4328 mutex_exit(&state->id_scq_poll_lock); 4329 ddi_trigger_softintr(state->id_tx); 4330 } 4331 } else 4332 (void) ibd_tx_recycle((caddr_t)state); 4333 } 4334 4335 /* 4336 * Multicast group create/delete trap handler. These will be delivered 4337 * on a kernel thread (handling can thus block) and can be invoked 4338 * concurrently. The handler can be invoked anytime after it is 4339 * registered and before ibt_detach(). 4340 */ 4341 /* ARGSUSED */ 4342 static void 4343 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4344 ibt_subnet_event_t *event) 4345 { 4346 ibd_state_t *state = (ibd_state_t *)arg; 4347 ibd_req_t *req; 4348 4349 /* 4350 * The trap handler will get invoked once for every event for 4351 * every port. The input "gid" is the GID0 of the port the 4352 * trap came in on; we just need to act on traps that came 4353 * to our port, meaning the port on which the ipoib interface 4354 * resides. Since ipoib uses GID0 of the port, we just match 4355 * the gids to check whether we need to handle the trap. 4356 */ 4357 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4358 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4359 return; 4360 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4361 4362 DPRINT(10, "ibd_notices_handler : %d\n", code); 4363 4364 switch (code) { 4365 case IBT_SM_EVENT_UNAVAILABLE: 4366 /* 4367 * If we are in promiscuous mode or have 4368 * sendnonmembers, we need to print a warning 4369 * message right now. Else, just store the 4370 * information, print when we enter promiscuous 4371 * mode or attempt nonmember send. We might 4372 * also want to stop caching sendnonmember. 4373 */ 4374 ibd_print_warn(state, "IBA multicast support " 4375 "degraded due to unavailability of multicast " 4376 "traps"); 4377 break; 4378 case IBT_SM_EVENT_AVAILABLE: 4379 /* 4380 * If we printed a warning message above or 4381 * while trying to nonmember send or get into 4382 * promiscuous mode, print an okay message. 4383 */ 4384 ibd_print_warn(state, "IBA multicast support " 4385 "restored due to availability of multicast " 4386 "traps"); 4387 break; 4388 case IBT_SM_EVENT_MCG_CREATED: 4389 case IBT_SM_EVENT_MCG_DELETED: 4390 /* 4391 * If it is a "deleted" event and we are in late hca 4392 * init, nothing to do. 4393 */ 4394 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4395 IBD_DRV_IN_LATE_HCA_INIT) && (code == 4396 IBT_SM_EVENT_MCG_DELETED)) { 4397 break; 4398 } 4399 /* 4400 * Common processing of creation/deletion traps. 4401 * First check if the instance is being 4402 * [de]initialized; back off then, without doing 4403 * anything more, since we are not sure if the 4404 * async thread is around, or whether we might 4405 * be racing with the detach code in ibd_m_stop() 4406 * that scans the mcg list. 
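 *
 * The matching teardown in ibd_undo_start() deregisters the subnet
 * notice handler, sets id_trap_stop and then waits under id_trap_lock
 * for id_trap_inprog to drain, so a trap that makes it past
 * ibd_async_safe() here is presumably accounted for until
 * ibd_async_done() runs at the end of ibd_async_trap().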
4407 */ 4408 if (!ibd_async_safe(state)) 4409 return; 4410 4411 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4412 req->rq_gid = event->sm_notice_gid; 4413 req->rq_ptr = (void *)code; 4414 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4415 break; 4416 } 4417 } 4418 4419 static void 4420 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4421 { 4422 ib_gid_t mgid = req->rq_gid; 4423 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4424 int ret; 4425 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff; 4426 4427 DPRINT(10, "ibd_async_trap : %d\n", code); 4428 4429 /* 4430 * Check if we have already joined the IPoIB broadcast group for our 4431 * PKEY. If joined, perform the rest of the operation. 4432 * Else, the interface is not initialised. Do the initialisation here 4433 * by calling ibd_start() and return. 4434 */ 4435 4436 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4437 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) && 4438 (code == IBT_SM_EVENT_MCG_CREATED)) { 4439 /* 4440 * If we are in late HCA init and a notification for the 4441 * creation of a MCG came in, check if it is the IPoIB MCG for 4442 * this pkey. If not, return. 4443 */ 4444 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey != 4445 state->id_pkey)) { 4446 ibd_async_done(state); 4447 return; 4448 } 4449 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4450 /* 4451 * Check if there is still a necessity to start the interface. 4452 * It is possible that the user attempted unplumb at just about 4453 * the same time, and if unplumb succeeded, we have nothing to 4454 * do. 4455 */ 4456 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4457 IBD_DRV_IN_LATE_HCA_INIT) && 4458 ((ret = ibd_start(state)) != 0)) { 4459 DPRINT(10, "ibd_async_trap: cannot start from late HCA " 4460 "init, ret=%d", ret); 4461 } 4462 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4463 ibd_async_done(state); 4464 return; 4465 } 4466 4467 /* 4468 * Atomically search the nonmember and sendonlymember lists and 4469 * delete. 4470 */ 4471 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4472 4473 if (state->id_prom_op == IBD_OP_COMPLETED) { 4474 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4475 4476 /* 4477 * If in promiscuous mode, try to join/attach to the new 4478 * mcg. Given the unreliable out-of-order mode of trap 4479 * delivery, we can never be sure whether it is a problem 4480 * if the join fails. Thus, we warn the admin of a failure 4481 * if this was a creation trap. Note that the trap might 4482 * actually be reporting a long past event, and the mcg 4483 * might already have been deleted, thus we might be warning 4484 * in vain. 4485 */ 4486 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4487 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4488 ibd_print_warn(state, "IBA promiscuous mode missed " 4489 "new multicast gid %016llx:%016llx", 4490 (u_longlong_t)mgid.gid_prefix, 4491 (u_longlong_t)mgid.gid_guid); 4492 } 4493 4494 /* 4495 * Free the request slot allocated by the subnet event thread. 4496 */ 4497 ibd_async_done(state); 4498 } 4499 4500 /* 4501 * GLDv3 entry point to get capabilities. 
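 *
 * Only two capabilities are reported, and only for partition objects:
 * MAC_CAPAB_HCKSUM (full checksum offload, advertised as
 * HCK_FULLCKSUM | HCKSUM_INET_FULL_V4 when the HCA has
 * IBT_HCA_CKSUM_FULL) and MAC_CAPAB_LSO (basic TCP/IPv4, with lso_max
 * set to id_lso_maxlen - 1). Any other query, or any query against a
 * port driver instance, returns B_FALSE so the mac layer falls back
 * to software.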
4502 */ 4503 static boolean_t 4504 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4505 { 4506 ibd_state_t *state = arg; 4507 4508 if (state->id_type == IBD_PORT_DRIVER) 4509 return (B_FALSE); 4510 4511 switch (cap) { 4512 case MAC_CAPAB_HCKSUM: { 4513 uint32_t *txflags = cap_data; 4514 4515 /* 4516 * We either do full checksum or not do it at all 4517 */ 4518 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4519 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4520 else 4521 return (B_FALSE); 4522 break; 4523 } 4524 4525 case MAC_CAPAB_LSO: { 4526 mac_capab_lso_t *cap_lso = cap_data; 4527 4528 /* 4529 * In addition to the capability and policy, since LSO 4530 * relies on hw checksum, we'll not enable LSO if we 4531 * don't have hw checksum. Of course, if the HCA doesn't 4532 * provide the reserved lkey capability, enabling LSO will 4533 * actually affect performance adversely, so we'll disable 4534 * LSO even for that case. 4535 */ 4536 if (!state->id_lso_policy || !state->id_lso_capable) 4537 return (B_FALSE); 4538 4539 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4540 return (B_FALSE); 4541 4542 if (state->id_hca_res_lkey_capab == 0) { 4543 ibd_print_warn(state, "no reserved-lkey capability, " 4544 "disabling LSO"); 4545 return (B_FALSE); 4546 } 4547 4548 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4549 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4550 break; 4551 } 4552 4553 default: 4554 return (B_FALSE); 4555 } 4556 4557 return (B_TRUE); 4558 } 4559 4560 /* 4561 * callback function for set/get of properties 4562 */ 4563 static int 4564 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4565 uint_t pr_valsize, const void *pr_val) 4566 { 4567 ibd_state_t *state = arg; 4568 int err = 0; 4569 uint32_t link_mode; 4570 4571 /* Cannot set properties on a port driver */ 4572 if (state->id_type == IBD_PORT_DRIVER) { 4573 return (ENOTSUP); 4574 } 4575 4576 switch (pr_num) { 4577 case MAC_PROP_IB_LINKMODE: 4578 if (state->id_mac_state & IBD_DRV_STARTED) { 4579 err = EBUSY; 4580 break; 4581 } 4582 if (pr_val == NULL) { 4583 err = EINVAL; 4584 break; 4585 } 4586 bcopy(pr_val, &link_mode, sizeof (link_mode)); 4587 if (link_mode != IBD_LINK_MODE_UD && 4588 link_mode != IBD_LINK_MODE_RC) { 4589 err = EINVAL; 4590 } else { 4591 if (link_mode == IBD_LINK_MODE_RC) { 4592 if (state->id_enable_rc) { 4593 return (0); 4594 } 4595 state->id_enable_rc = 1; 4596 /* inform MAC framework of new MTU */ 4597 err = mac_maxsdu_update(state->id_mh, 4598 state->rc_mtu - IPOIB_HDRSIZE); 4599 } else { 4600 if (!state->id_enable_rc) { 4601 return (0); 4602 } 4603 state->id_enable_rc = 0; 4604 err = mac_maxsdu_update(state->id_mh, 4605 state->id_mtu - IPOIB_HDRSIZE); 4606 } 4607 (void) ibd_record_capab(state); 4608 mac_capab_update(state->id_mh); 4609 } 4610 break; 4611 case MAC_PROP_PRIVATE: 4612 err = ibd_set_priv_prop(state, pr_name, 4613 pr_valsize, pr_val); 4614 break; 4615 default: 4616 err = ENOTSUP; 4617 break; 4618 } 4619 return (err); 4620 } 4621 4622 static int 4623 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4624 uint_t pr_valsize, void *pr_val) 4625 { 4626 ibd_state_t *state = arg; 4627 int err = 0; 4628 4629 switch (pr_num) { 4630 case MAC_PROP_MTU: 4631 break; 4632 default: 4633 if (state->id_type == IBD_PORT_DRIVER) { 4634 return (ENOTSUP); 4635 } 4636 break; 4637 } 4638 4639 switch (pr_num) { 4640 case MAC_PROP_IB_LINKMODE: 4641 *(uint_t *)pr_val = state->id_enable_rc; 4642 break; 4643 case MAC_PROP_PRIVATE: 4644 err = 
ibd_get_priv_prop(state, pr_name, pr_valsize, 4645 pr_val); 4646 break; 4647 default: 4648 err = ENOTSUP; 4649 break; 4650 } 4651 return (err); 4652 } 4653 4654 static void 4655 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4656 mac_prop_info_handle_t prh) 4657 { 4658 ibd_state_t *state = arg; 4659 4660 switch (pr_num) { 4661 case MAC_PROP_IB_LINKMODE: { 4662 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE); 4663 break; 4664 } 4665 case MAC_PROP_MTU: { 4666 uint32_t min, max; 4667 if (state->id_type == IBD_PORT_DRIVER) { 4668 min = 1500; 4669 max = IBD_DEF_RC_MAX_SDU; 4670 } else if (state->id_enable_rc) { 4671 min = max = IBD_DEF_RC_MAX_SDU; 4672 } else { 4673 min = max = state->id_mtu - IPOIB_HDRSIZE; 4674 } 4675 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4676 mac_prop_info_set_range_uint32(prh, min, max); 4677 break; 4678 } 4679 case MAC_PROP_PRIVATE: { 4680 char valstr[64]; 4681 int value; 4682 4683 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 4684 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4685 return; 4686 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4687 value = IBD_DEF_COALESCE_COMPLETIONS; 4688 } else if (strcmp(pr_name, 4689 "_ibd_create_broadcast_group") == 0) { 4690 value = IBD_DEF_CREATE_BCAST_GROUP; 4691 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4692 value = IBD_DEF_HASH_SIZE; 4693 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4694 value = IBD_DEF_LSO_POLICY; 4695 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4696 value = IBD_DEF_NUM_AH; 4697 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4698 value = IBD_DEF_NUM_LSO_BUFS; 4699 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4700 value = IBD_DEF_RC_ENABLE_SRQ; 4701 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4702 value = IBD_DEF_RC_NUM_RWQE; 4703 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4704 value = IBD_DEF_RC_NUM_SRQ; 4705 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4706 value = IBD_DEF_RC_NUM_SWQE; 4707 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4708 value = IBD_DEF_RC_RX_COMP_COUNT; 4709 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4710 value = IBD_DEF_RC_RX_COMP_USEC; 4711 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4712 value = IBD_DEF_RC_RX_COPY_THRESH; 4713 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4714 value = IBD_DEF_RC_RX_RWQE_THRESH; 4715 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4716 value = IBD_DEF_RC_TX_COMP_COUNT; 4717 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4718 value = IBD_DEF_RC_TX_COMP_USEC; 4719 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4720 value = IBD_DEF_RC_TX_COPY_THRESH; 4721 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4722 value = IBD_DEF_UD_NUM_RWQE; 4723 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4724 value = IBD_DEF_UD_NUM_SWQE; 4725 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4726 value = IBD_DEF_UD_RX_COMP_COUNT; 4727 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4728 value = IBD_DEF_UD_RX_COMP_USEC; 4729 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4730 value = IBD_DEF_UD_TX_COMP_COUNT; 4731 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4732 value = IBD_DEF_UD_TX_COMP_USEC; 4733 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4734 value = IBD_DEF_UD_TX_COPY_THRESH; 4735 } else { 4736 return; 4737 } 4738 4739 (void) snprintf(valstr, sizeof 
(valstr), "%d", value); 4740 mac_prop_info_set_default_str(prh, valstr); 4741 break; 4742 } 4743 } /* switch (pr_num) */ 4744 } 4745 4746 /* ARGSUSED2 */ 4747 static int 4748 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name, 4749 uint_t pr_valsize, const void *pr_val) 4750 { 4751 int err = 0; 4752 long result; 4753 4754 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4755 if (pr_val == NULL) { 4756 return (EINVAL); 4757 } 4758 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4759 if (result < 0 || result > 1) { 4760 err = EINVAL; 4761 } else { 4762 state->id_allow_coalesce_comp_tuning = (result == 1) ? 4763 B_TRUE: B_FALSE; 4764 } 4765 return (err); 4766 } 4767 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 4768 if (state->id_mac_state & IBD_DRV_STARTED) { 4769 return (EBUSY); 4770 } 4771 if (pr_val == NULL) { 4772 return (EINVAL); 4773 } 4774 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4775 if (result < 0 || result > 1) { 4776 err = EINVAL; 4777 } else { 4778 state->id_create_broadcast_group = (result == 1) ? 4779 B_TRUE: B_FALSE; 4780 } 4781 return (err); 4782 } 4783 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4784 if (state->id_mac_state & IBD_DRV_STARTED) { 4785 return (EBUSY); 4786 } 4787 if (pr_val == NULL) { 4788 return (EINVAL); 4789 } 4790 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4791 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) { 4792 err = EINVAL; 4793 } else { 4794 state->id_hash_size = (uint32_t)result; 4795 } 4796 return (err); 4797 } 4798 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4799 if (state->id_mac_state & IBD_DRV_STARTED) { 4800 return (EBUSY); 4801 } 4802 if (pr_val == NULL) { 4803 return (EINVAL); 4804 } 4805 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4806 if (result < 0 || result > 1) { 4807 err = EINVAL; 4808 } else { 4809 state->id_lso_policy = (result == 1) ? 4810 B_TRUE: B_FALSE; 4811 } 4812 mac_capab_update(state->id_mh); 4813 return (err); 4814 } 4815 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4816 if (state->id_mac_state & IBD_DRV_STARTED) { 4817 return (EBUSY); 4818 } 4819 if (pr_val == NULL) { 4820 return (EINVAL); 4821 } 4822 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4823 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) { 4824 err = EINVAL; 4825 } else { 4826 state->id_num_ah = (uint32_t)result; 4827 } 4828 return (err); 4829 } 4830 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4831 if (state->id_mac_state & IBD_DRV_STARTED) { 4832 return (EBUSY); 4833 } 4834 if (!state->id_lso_policy || !state->id_lso_capable) { 4835 return (EINVAL); 4836 } 4837 if (pr_val == NULL) { 4838 return (EINVAL); 4839 } 4840 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4841 if (result < IBD_MIN_NUM_LSO_BUFS || 4842 result > IBD_MAX_NUM_LSO_BUFS) { 4843 err = EINVAL; 4844 } else { 4845 state->id_num_lso_bufs = (uint32_t)result; 4846 } 4847 return (err); 4848 } 4849 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4850 if (state->id_mac_state & IBD_DRV_STARTED) { 4851 return (EBUSY); 4852 } 4853 if (pr_val == NULL) { 4854 return (EINVAL); 4855 } 4856 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4857 if (result < 0 || result > 1) { 4858 err = EINVAL; 4859 } else { 4860 state->rc_enable_srq = (result == 1) ? 
4861 B_TRUE: B_FALSE; 4862 } 4863 if (!state->rc_enable_srq) { 4864 state->id_rc_num_srq = 0; 4865 } 4866 return (err); 4867 } 4868 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4869 if (state->id_mac_state & IBD_DRV_STARTED) { 4870 return (EBUSY); 4871 } 4872 if (pr_val == NULL) { 4873 return (EINVAL); 4874 } 4875 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4876 if (result < IBD_MIN_RC_NUM_RWQE || 4877 result > IBD_MAX_RC_NUM_RWQE) { 4878 err = EINVAL; 4879 } else { 4880 state->id_rc_num_rwqe = (uint32_t)result; 4881 if (state->id_allow_coalesce_comp_tuning && 4882 state->id_rc_rx_comp_count > state->id_rc_num_rwqe) 4883 state->id_rc_rx_comp_count = 4884 state->id_rc_num_rwqe; 4885 if (state->id_rc_num_srq > state->id_rc_num_rwqe) 4886 state->id_rc_num_srq = 4887 state->id_rc_num_rwqe - 1; 4888 /* 4889 * If rx_rwqe_threshold is greater than the number of 4890 * rwqes, pull it back to 25% of number of rwqes. 4891 */ 4892 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe) 4893 state->id_rc_rx_rwqe_thresh = 4894 (state->id_rc_num_rwqe >> 2); 4895 4896 } 4897 return (err); 4898 } 4899 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4900 if (state->id_mac_state & IBD_DRV_STARTED) { 4901 return (EBUSY); 4902 } 4903 if (pr_val == NULL) { 4904 return (EINVAL); 4905 } 4906 if (!state->rc_enable_srq) 4907 return (EINVAL); 4908 4909 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4910 if (result < IBD_MIN_RC_NUM_SRQ || 4911 result >= state->id_rc_num_rwqe) { 4912 err = EINVAL; 4913 } else 4914 state->id_rc_num_srq = (uint32_t)result; 4915 return (err); 4916 } 4917 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4918 if (state->id_mac_state & IBD_DRV_STARTED) { 4919 return (EBUSY); 4920 } 4921 if (pr_val == NULL) { 4922 return (EINVAL); 4923 } 4924 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4925 if (result < IBD_MIN_RC_NUM_SWQE || 4926 result > IBD_MAX_RC_NUM_SWQE) { 4927 err = EINVAL; 4928 } else { 4929 state->id_rc_num_swqe = (uint32_t)result; 4930 if (state->id_allow_coalesce_comp_tuning && 4931 state->id_rc_tx_comp_count > state->id_rc_num_swqe) 4932 state->id_rc_tx_comp_count = 4933 state->id_rc_num_swqe; 4934 } 4935 return (err); 4936 } 4937 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4938 if (!state->id_allow_coalesce_comp_tuning) { 4939 return (ENOTSUP); 4940 } 4941 if (pr_val == NULL) { 4942 return (EINVAL); 4943 } 4944 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4945 if (result < 1 || result > state->id_rc_num_rwqe) { 4946 err = EINVAL; 4947 } else { 4948 state->id_rc_rx_comp_count = (uint32_t)result; 4949 } 4950 return (err); 4951 } 4952 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4953 if (!state->id_allow_coalesce_comp_tuning) { 4954 return (ENOTSUP); 4955 } 4956 if (pr_val == NULL) { 4957 return (EINVAL); 4958 } 4959 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4960 if (result < 1) { 4961 err = EINVAL; 4962 } else { 4963 state->id_rc_rx_comp_usec = (uint32_t)result; 4964 } 4965 return (err); 4966 } 4967 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4968 if (state->id_mac_state & IBD_DRV_STARTED) { 4969 return (EBUSY); 4970 } 4971 if (pr_val == NULL) { 4972 return (EINVAL); 4973 } 4974 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4975 if (result < IBD_MIN_RC_RX_COPY_THRESH || 4976 result > state->rc_mtu) { 4977 err = EINVAL; 4978 } else { 4979 state->id_rc_rx_copy_thresh = (uint32_t)result; 4980 } 4981 return (err); 4982 } 4983 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4984 if 
(state->id_mac_state & IBD_DRV_STARTED) { 4985 return (EBUSY); 4986 } 4987 if (pr_val == NULL) { 4988 return (EINVAL); 4989 } 4990 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4991 if (result < IBD_MIN_RC_RX_RWQE_THRESH || 4992 result >= state->id_rc_num_rwqe) { 4993 err = EINVAL; 4994 } else { 4995 state->id_rc_rx_rwqe_thresh = (uint32_t)result; 4996 } 4997 return (err); 4998 } 4999 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 5000 if (!state->id_allow_coalesce_comp_tuning) { 5001 return (ENOTSUP); 5002 } 5003 if (pr_val == NULL) { 5004 return (EINVAL); 5005 } 5006 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5007 if (result < 1 || result > state->id_rc_num_swqe) { 5008 err = EINVAL; 5009 } else { 5010 state->id_rc_tx_comp_count = (uint32_t)result; 5011 } 5012 return (err); 5013 } 5014 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5015 if (!state->id_allow_coalesce_comp_tuning) { 5016 return (ENOTSUP); 5017 } 5018 if (pr_val == NULL) { 5019 return (EINVAL); 5020 } 5021 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5022 if (result < 1) 5023 err = EINVAL; 5024 else { 5025 state->id_rc_tx_comp_usec = (uint32_t)result; 5026 } 5027 return (err); 5028 } 5029 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5030 if (state->id_mac_state & IBD_DRV_STARTED) { 5031 return (EBUSY); 5032 } 5033 if (pr_val == NULL) { 5034 return (EINVAL); 5035 } 5036 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5037 if (result < IBD_MIN_RC_TX_COPY_THRESH || 5038 result > state->rc_mtu) { 5039 err = EINVAL; 5040 } else { 5041 state->id_rc_tx_copy_thresh = (uint32_t)result; 5042 } 5043 return (err); 5044 } 5045 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5046 if (state->id_mac_state & IBD_DRV_STARTED) { 5047 return (EBUSY); 5048 } 5049 if (pr_val == NULL) { 5050 return (EINVAL); 5051 } 5052 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5053 if (result < IBD_MIN_UD_NUM_RWQE || 5054 result > IBD_MAX_UD_NUM_RWQE) { 5055 err = EINVAL; 5056 } else { 5057 if (result > state->id_hca_max_chan_sz) { 5058 state->id_ud_num_rwqe = 5059 state->id_hca_max_chan_sz; 5060 } else { 5061 state->id_ud_num_rwqe = (uint32_t)result; 5062 } 5063 if (state->id_allow_coalesce_comp_tuning && 5064 state->id_ud_rx_comp_count > state->id_ud_num_rwqe) 5065 state->id_ud_rx_comp_count = 5066 state->id_ud_num_rwqe; 5067 } 5068 return (err); 5069 } 5070 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5071 if (state->id_mac_state & IBD_DRV_STARTED) { 5072 return (EBUSY); 5073 } 5074 if (pr_val == NULL) { 5075 return (EINVAL); 5076 } 5077 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5078 if (result < IBD_MIN_UD_NUM_SWQE || 5079 result > IBD_MAX_UD_NUM_SWQE) { 5080 err = EINVAL; 5081 } else { 5082 if (result > state->id_hca_max_chan_sz) { 5083 state->id_ud_num_swqe = 5084 state->id_hca_max_chan_sz; 5085 } else { 5086 state->id_ud_num_swqe = (uint32_t)result; 5087 } 5088 if (state->id_allow_coalesce_comp_tuning && 5089 state->id_ud_tx_comp_count > state->id_ud_num_swqe) 5090 state->id_ud_tx_comp_count = 5091 state->id_ud_num_swqe; 5092 } 5093 return (err); 5094 } 5095 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5096 if (!state->id_allow_coalesce_comp_tuning) { 5097 return (ENOTSUP); 5098 } 5099 if (pr_val == NULL) { 5100 return (EINVAL); 5101 } 5102 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5103 if (result < 1 || result > state->id_ud_num_rwqe) { 5104 err = EINVAL; 5105 } else { 5106 state->id_ud_rx_comp_count = (uint32_t)result; 5107 } 5108 return (err); 5109 } 
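	/*
	 * Note: the private-property handlers above and below share one
	 * pattern -- most reject changes while the instance is started
	 * (EBUSY), then parse the string with ddi_strtol(), range-check
	 * the result and store it.  The completion-moderation knobs
	 * instead require _ibd_coalesce_completions to have been enabled
	 * (ENOTSUP otherwise) and may be tuned at runtime.  An
	 * illustrative invocation (link name is a placeholder):
	 *
	 *	# dladm set-linkprop -p _ibd_ud_rx_comp_usec=10 <ibd-link>
	 */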
5110 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5111 if (!state->id_allow_coalesce_comp_tuning) { 5112 return (ENOTSUP); 5113 } 5114 if (pr_val == NULL) { 5115 return (EINVAL); 5116 } 5117 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5118 if (result < 1) { 5119 err = EINVAL; 5120 } else { 5121 state->id_ud_rx_comp_usec = (uint32_t)result; 5122 } 5123 return (err); 5124 } 5125 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5126 if (!state->id_allow_coalesce_comp_tuning) { 5127 return (ENOTSUP); 5128 } 5129 if (pr_val == NULL) { 5130 return (EINVAL); 5131 } 5132 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5133 if (result < 1 || result > state->id_ud_num_swqe) { 5134 err = EINVAL; 5135 } else { 5136 state->id_ud_tx_comp_count = (uint32_t)result; 5137 } 5138 return (err); 5139 } 5140 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5141 if (!state->id_allow_coalesce_comp_tuning) { 5142 return (ENOTSUP); 5143 } 5144 if (pr_val == NULL) { 5145 return (EINVAL); 5146 } 5147 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5148 if (result < 1) { 5149 err = EINVAL; 5150 } else { 5151 state->id_ud_tx_comp_usec = (uint32_t)result; 5152 } 5153 return (err); 5154 } 5155 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5156 if (state->id_mac_state & IBD_DRV_STARTED) { 5157 return (EBUSY); 5158 } 5159 if (pr_val == NULL) { 5160 return (EINVAL); 5161 } 5162 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 5163 if (result < IBD_MIN_UD_TX_COPY_THRESH || 5164 result > IBD_MAX_UD_TX_COPY_THRESH) { 5165 err = EINVAL; 5166 } else { 5167 state->id_ud_tx_copy_thresh = (uint32_t)result; 5168 } 5169 return (err); 5170 } 5171 return (ENOTSUP); 5172 } 5173 5174 static int 5175 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, 5176 void *pr_val) 5177 { 5178 int err = ENOTSUP; 5179 int value; 5180 5181 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 5182 value = state->id_bgroup_present; 5183 err = 0; 5184 goto done; 5185 } 5186 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 5187 value = state->id_allow_coalesce_comp_tuning; 5188 err = 0; 5189 goto done; 5190 } 5191 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 5192 value = state->id_create_broadcast_group; 5193 err = 0; 5194 goto done; 5195 } 5196 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 5197 value = state->id_hash_size; 5198 err = 0; 5199 goto done; 5200 } 5201 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 5202 value = state->id_lso_policy; 5203 err = 0; 5204 goto done; 5205 } 5206 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 5207 value = state->id_num_ah; 5208 err = 0; 5209 goto done; 5210 } 5211 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 5212 value = state->id_num_lso_bufs; 5213 err = 0; 5214 goto done; 5215 } 5216 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 5217 value = state->rc_enable_srq; 5218 err = 0; 5219 goto done; 5220 } 5221 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 5222 value = state->id_rc_num_rwqe; 5223 err = 0; 5224 goto done; 5225 } 5226 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 5227 value = state->id_rc_num_srq; 5228 err = 0; 5229 goto done; 5230 } 5231 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 5232 value = state->id_rc_num_swqe; 5233 err = 0; 5234 goto done; 5235 } 5236 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 5237 value = state->id_rc_rx_comp_count; 5238 err = 0; 5239 goto done; 5240 } 5241 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 5242 value = state->id_rc_rx_comp_usec; 5243 err = 0; 5244 
goto done; 5245 } 5246 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 5247 value = state->id_rc_rx_copy_thresh; 5248 err = 0; 5249 goto done; 5250 } 5251 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 5252 value = state->id_rc_rx_rwqe_thresh; 5253 err = 0; 5254 goto done; 5255 } 5256 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 5257 value = state->id_rc_tx_comp_count; 5258 err = 0; 5259 goto done; 5260 } 5261 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 5262 value = state->id_rc_tx_comp_usec; 5263 err = 0; 5264 goto done; 5265 } 5266 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 5267 value = state->id_rc_tx_copy_thresh; 5268 err = 0; 5269 goto done; 5270 } 5271 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 5272 value = state->id_ud_num_rwqe; 5273 err = 0; 5274 goto done; 5275 } 5276 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 5277 value = state->id_ud_num_swqe; 5278 err = 0; 5279 goto done; 5280 } 5281 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 5282 value = state->id_ud_rx_comp_count; 5283 err = 0; 5284 goto done; 5285 } 5286 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 5287 value = state->id_ud_rx_comp_usec; 5288 err = 0; 5289 goto done; 5290 } 5291 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 5292 value = state->id_ud_tx_comp_count; 5293 err = 0; 5294 goto done; 5295 } 5296 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 5297 value = state->id_ud_tx_comp_usec; 5298 err = 0; 5299 goto done; 5300 } 5301 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 5302 value = state->id_ud_tx_copy_thresh; 5303 err = 0; 5304 goto done; 5305 } 5306 done: 5307 if (err == 0) { 5308 (void) snprintf(pr_val, pr_valsize, "%d", value); 5309 } 5310 return (err); 5311 } 5312 5313 static int 5314 ibd_get_port_details(ibd_state_t *state) 5315 { 5316 ibt_hca_portinfo_t *port_infop; 5317 ibt_status_t ret; 5318 uint_t psize, port_infosz; 5319 5320 mutex_enter(&state->id_link_mutex); 5321 5322 /* 5323 * Query for port information 5324 */ 5325 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 5326 &port_infop, &psize, &port_infosz); 5327 if ((ret != IBT_SUCCESS) || (psize != 1)) { 5328 mutex_exit(&state->id_link_mutex); 5329 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 5330 "failed, ret=%d", ret); 5331 return (ENETDOWN); 5332 } 5333 5334 /* 5335 * If the link is active, verify the pkey 5336 */ 5337 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) { 5338 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 5339 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 5340 state->id_link_state = LINK_STATE_DOWN; 5341 } else { 5342 state->id_link_state = LINK_STATE_UP; 5343 } 5344 state->id_mtu = (128 << port_infop->p_mtu); 5345 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5346 state->id_sgid = *port_infop->p_sgid_tbl; 5347 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 5348 /* 5349 * Now that the port is active, record the port speed 5350 */ 5351 state->id_link_speed = ibd_get_portspeed(state); 5352 } else { 5353 /* Make sure that these are handled in PORT_UP/CHANGE */ 5354 state->id_mtu = 0; 5355 state->id_link_state = LINK_STATE_DOWN; 5356 state->id_link_speed = 0; 5357 } 5358 mutex_exit(&state->id_link_mutex); 5359 ibt_free_portinfo(port_infop, port_infosz); 5360 5361 return (0); 5362 } 5363 5364 static int 5365 ibd_alloc_cqs(ibd_state_t *state) 5366 { 5367 ibt_hca_attr_t hca_attrs; 5368 ibt_cq_attr_t cq_attr; 5369 ibt_status_t ret; 5370 uint32_t real_size; 5371 uint_t num_rwqe_change = 0; 5372 uint_t 
num_swqe_change = 0; 5373 5374 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 5375 ASSERT(ret == IBT_SUCCESS); 5376 5377 /* 5378 * Allocate Rx/combined CQ: 5379 * Theoretically, there is no point in having more than #rwqe 5380 * plus #swqe cqe's, except that the CQ will be signaled for 5381 * overflow when the last wqe completes, if none of the previous 5382 * cqe's have been polled. Thus, we allocate just a few less wqe's 5383 * to make sure such overflow does not occur. 5384 */ 5385 cq_attr.cq_sched = NULL; 5386 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 5387 5388 /* 5389 * Allocate Receive CQ. 5390 */ 5391 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) { 5392 cq_attr.cq_size = state->id_ud_num_rwqe + 1; 5393 } else { 5394 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5395 num_rwqe_change = state->id_ud_num_rwqe; 5396 state->id_ud_num_rwqe = cq_attr.cq_size - 1; 5397 } 5398 5399 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5400 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 5401 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 5402 "failed, ret=%d\n", ret); 5403 return (DDI_FAILURE); 5404 } 5405 5406 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count, 5407 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) { 5408 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 5409 "moderation failed, ret=%d\n", ret); 5410 } 5411 5412 /* make the #rx wc's the same as max rx chain size */ 5413 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 5414 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 5415 state->id_rxwcs_size, KM_SLEEP); 5416 5417 /* 5418 * Allocate Send CQ. 5419 */ 5420 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) { 5421 cq_attr.cq_size = state->id_ud_num_swqe + 1; 5422 } else { 5423 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5424 num_swqe_change = state->id_ud_num_swqe; 5425 state->id_ud_num_swqe = cq_attr.cq_size - 1; 5426 } 5427 5428 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5429 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 5430 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 5431 "failed, ret=%d\n", ret); 5432 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 5433 state->id_rxwcs_size); 5434 (void) ibt_free_cq(state->id_rcq_hdl); 5435 return (DDI_FAILURE); 5436 } 5437 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count, 5438 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) { 5439 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 5440 "moderation failed, ret=%d\n", ret); 5441 } 5442 5443 state->id_txwcs_size = IBD_TX_POLL_THRESH; 5444 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 5445 state->id_txwcs_size, KM_SLEEP); 5446 5447 /* 5448 * Print message in case we could not allocate as many wqe's 5449 * as was requested. 
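 *
 * For example (hypothetical sizes): with an hca_max_cq_sz of 4096 and
 * a requested id_ud_num_rwqe of 8000, the Rx CQ is sized to 4096,
 * id_ud_num_rwqe is pulled back to 4095, and the warning below reads
 * "Setting #rwqe = 4095 instead of default 8000".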
5450 */ 5451 if (num_rwqe_change) { 5452 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 5453 "%d", state->id_ud_num_rwqe, num_rwqe_change); 5454 } 5455 if (num_swqe_change) { 5456 ibd_print_warn(state, "Setting #swqe = %d instead of default " 5457 "%d", state->id_ud_num_swqe, num_swqe_change); 5458 } 5459 5460 return (DDI_SUCCESS); 5461 } 5462 5463 static int 5464 ibd_setup_ud_channel(ibd_state_t *state) 5465 { 5466 ibt_ud_chan_alloc_args_t ud_alloc_attr; 5467 ibt_ud_chan_query_attr_t ud_chan_attr; 5468 ibt_status_t ret; 5469 5470 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 5471 if (state->id_hca_res_lkey_capab) 5472 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 5473 if (state->id_lso_policy && state->id_lso_capable) 5474 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 5475 5476 ud_alloc_attr.ud_hca_port_num = state->id_port; 5477 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 5478 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 5479 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe; 5480 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe; 5481 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 5482 ud_alloc_attr.ud_scq = state->id_scq_hdl; 5483 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 5484 ud_alloc_attr.ud_pd = state->id_pd_hdl; 5485 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 5486 ud_alloc_attr.ud_clone_chan = NULL; 5487 5488 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 5489 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 5490 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 5491 "failed, ret=%d\n", ret); 5492 return (DDI_FAILURE); 5493 } 5494 5495 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 5496 &ud_chan_attr)) != IBT_SUCCESS) { 5497 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 5498 "failed, ret=%d\n", ret); 5499 (void) ibt_free_channel(state->id_chnl_hdl); 5500 return (DDI_FAILURE); 5501 } 5502 5503 state->id_qpnum = ud_chan_attr.ud_qpn; 5504 5505 return (DDI_SUCCESS); 5506 } 5507 5508 static int 5509 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 5510 { 5511 uint32_t progress = state->id_mac_state; 5512 uint_t attempts; 5513 ibt_status_t ret; 5514 ib_gid_t mgid; 5515 ibd_mce_t *mce; 5516 uint8_t jstate; 5517 5518 if (atomic_dec_32_nv(&state->id_running) != 0) 5519 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 5520 5521 /* 5522 * Before we try to stop/undo whatever we did in ibd_start(), 5523 * we need to mark the link state appropriately to prevent the 5524 * ip layer from using this instance for any new transfers. Note 5525 * that if the original state of the link was "up" when we're 5526 * here, we'll set the final link state to "unknown", to behave 5527 * in the same fashion as other ethernet drivers. 
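 *
 * The remainder of this function unwinds ibd_start() step by step:
 * each IBD_DRV_* bit recorded in id_mac_state is tested in the
 * "progress" snapshot taken above and cleared from id_mac_state once
 * the corresponding resource has been released.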
5528 */ 5529 mutex_enter(&state->id_link_mutex); 5530 if (cur_link_state == LINK_STATE_DOWN) { 5531 state->id_link_state = cur_link_state; 5532 } else { 5533 state->id_link_state = LINK_STATE_UNKNOWN; 5534 } 5535 mutex_exit(&state->id_link_mutex); 5536 bzero(&state->id_macaddr, sizeof (ipoib_mac_t)); 5537 mac_link_update(state->id_mh, state->id_link_state); 5538 5539 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 5540 if (progress & IBD_DRV_STARTED) { 5541 state->id_mac_state &= (~IBD_DRV_STARTED); 5542 } 5543 5544 if (progress & IBD_DRV_IN_LATE_HCA_INIT) { 5545 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT); 5546 } 5547 5548 /* Stop listen under Reliable Connected Mode */ 5549 if (progress & IBD_DRV_RC_LISTEN) { 5550 ASSERT(state->id_enable_rc); 5551 if (state->rc_listen_hdl != NULL) { 5552 ibd_rc_stop_listen(state); 5553 } 5554 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 5555 } 5556 5557 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 5558 (void) ibd_rc_close_all_chan(state); 5559 } 5560 5561 /* 5562 * First, stop receive interrupts; this stops the driver from 5563 * handing up buffers to higher layers. Wait for receive buffers 5564 * to be returned and give up after 1 second. 5565 */ 5566 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 5567 attempts = 10; 5568 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 5569 0) > 0) { 5570 delay(drv_usectohz(100000)); 5571 if (--attempts == 0) { 5572 /* 5573 * There are pending bufs with the network 5574 * layer and we have no choice but to wait 5575 * for them to be done with. Reap all the 5576 * Tx/Rx completions that were posted since 5577 * we turned off the notification and 5578 * return failure. 5579 */ 5580 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 5581 DPRINT(2, "ibd_undo_start: " 5582 "reclaiming failed"); 5583 break; 5584 } 5585 } 5586 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 5587 } 5588 5589 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 5590 ibd_rc_fini_tx_largebuf_list(state); 5591 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 5592 } 5593 5594 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 5595 ASSERT(state->id_enable_rc); 5596 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 5597 ibd_rc_fini_srq_list(state); 5598 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 5599 } else { 5600 cmn_err(CE_CONT, "ibd_undo_start: srq bufs " 5601 "outstanding\n"); 5602 } 5603 } 5604 5605 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 5606 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 5607 5608 mutex_enter(&state->id_trap_lock); 5609 state->id_trap_stop = B_TRUE; 5610 while (state->id_trap_inprog > 0) 5611 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 5612 mutex_exit(&state->id_trap_lock); 5613 5614 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 5615 } 5616 5617 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 5618 /* 5619 * Flushing the channel ensures that all pending WQE's 5620 * are marked with flush_error and handed to the CQ. It 5621 * does not guarantee the invocation of the CQ handler. 5622 * This call is guaranteed to return successfully for 5623 * UD QPNs. 5624 */ 5625 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 5626 IBT_SUCCESS) { 5627 DPRINT(10, "ibd_undo_start: flush_channel " 5628 "failed, ret=%d", ret); 5629 } 5630 5631 /* 5632 * Give some time for the TX CQ handler to process the 5633 * completions. 
5634 */ 5635 mutex_enter(&state->id_tx_list.dl_mutex); 5636 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5637 attempts = 10; 5638 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 5639 != state->id_ud_num_swqe) { 5640 if (--attempts == 0) 5641 break; 5642 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5643 mutex_exit(&state->id_tx_list.dl_mutex); 5644 delay(drv_usectohz(100000)); 5645 mutex_enter(&state->id_tx_list.dl_mutex); 5646 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5647 } 5648 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5649 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 5650 state->id_ud_num_swqe) { 5651 cmn_err(CE_WARN, "tx resources not freed\n"); 5652 } 5653 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5654 mutex_exit(&state->id_tx_list.dl_mutex); 5655 5656 attempts = 10; 5657 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5658 if (--attempts == 0) 5659 break; 5660 delay(drv_usectohz(100000)); 5661 } 5662 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 5663 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5664 cmn_err(CE_WARN, "rx resources not freed\n"); 5665 } 5666 5667 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 5668 } 5669 5670 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 5671 /* 5672 * Drop all residual full/non membership. This includes full 5673 * membership to the broadcast group, and any nonmembership 5674 * acquired during transmits. We do this after the Tx completion 5675 * handlers are done, since those might result in some late 5676 * leaves; this also eliminates a potential race with that 5677 * path wrt the mc full list insert/delete. Trap handling 5678 * has also been suppressed at this point. Thus, no locks 5679 * are required while traversing the mc full list. 
5680 */ 5681 DPRINT(2, "ibd_undo_start: clear full cache entries"); 5682 mce = list_head(&state->id_mc_full); 5683 while (mce != NULL) { 5684 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5685 jstate = mce->mc_jstate; 5686 mce = list_next(&state->id_mc_full, mce); 5687 ibd_leave_group(state, mgid, jstate); 5688 } 5689 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 5690 } 5691 5692 if (progress & IBD_DRV_RXLIST_ALLOCD) { 5693 ibd_fini_rxlist(state); 5694 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 5695 } 5696 5697 if (progress & IBD_DRV_TXLIST_ALLOCD) { 5698 ibd_fini_txlist(state); 5699 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 5700 } 5701 5702 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 5703 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 5704 IBT_SUCCESS) { 5705 DPRINT(10, "ibd_undo_start: free_channel " 5706 "failed, ret=%d", ret); 5707 } 5708 5709 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 5710 } 5711 5712 if (progress & IBD_DRV_CQS_ALLOCD) { 5713 kmem_free(state->id_txwcs, 5714 sizeof (ibt_wc_t) * state->id_txwcs_size); 5715 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 5716 IBT_SUCCESS) { 5717 DPRINT(10, "ibd_undo_start: free_cq(scq) " 5718 "failed, ret=%d", ret); 5719 } 5720 5721 kmem_free(state->id_rxwcs, 5722 sizeof (ibt_wc_t) * state->id_rxwcs_size); 5723 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 5724 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 5725 "ret=%d", ret); 5726 } 5727 5728 state->id_txwcs = NULL; 5729 state->id_rxwcs = NULL; 5730 state->id_scq_hdl = NULL; 5731 state->id_rcq_hdl = NULL; 5732 5733 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 5734 } 5735 5736 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 5737 mutex_enter(&state->id_ac_mutex); 5738 mod_hash_destroy_hash(state->id_ah_active_hash); 5739 mutex_exit(&state->id_ac_mutex); 5740 ibd_acache_fini(state); 5741 5742 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 5743 } 5744 5745 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 5746 /* 5747 * If we'd created the ipoib broadcast group and had 5748 * successfully joined it, leave it now 5749 */ 5750 if (state->id_bgroup_created) { 5751 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 5752 jstate = IB_MC_JSTATE_FULL; 5753 (void) ibt_leave_mcg(state->id_sgid, mgid, 5754 state->id_sgid, jstate); 5755 } 5756 ibt_free_mcg_info(state->id_mcinfo, 1); 5757 5758 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 5759 } 5760 5761 return (DDI_SUCCESS); 5762 } 5763 5764 /* 5765 * These pair of routines are used to set/clear the condition that 5766 * the caller is likely to do something to change the id_mac_state. 5767 * If there's already someone doing either a start or a stop (possibly 5768 * due to the async handler detecting a pkey relocation event, a plumb 5769 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 5770 * that's done. 5771 */ 5772 static void 5773 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 5774 { 5775 mutex_enter(&state->id_macst_lock); 5776 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 5777 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 5778 5779 state->id_mac_state |= flag; 5780 mutex_exit(&state->id_macst_lock); 5781 } 5782 5783 static void 5784 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 5785 { 5786 mutex_enter(&state->id_macst_lock); 5787 state->id_mac_state &= (~flag); 5788 cv_signal(&state->id_macst_cv); 5789 mutex_exit(&state->id_macst_lock); 5790 } 5791 5792 /* 5793 * GLDv3 entry point to start hardware. 
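 * A minimal sketch of the bracketing pattern used by the start/stop
 * entry points (names as in this file):
 *
 *	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
 *	ret = ibd_start(state);
 *	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);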
5794 */ 5795 /*ARGSUSED*/ 5796 static int 5797 ibd_m_start(void *arg) 5798 { 5799 ibd_state_t *state = arg; 5800 int ret; 5801 5802 if (state->id_type == IBD_PORT_DRIVER) 5803 return (EINVAL); 5804 5805 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5806 if (state->id_mac_state & IBD_DRV_IN_DELETION) { 5807 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5808 return (EIO); 5809 } 5810 5811 ret = ibd_start(state); 5812 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5813 return (ret); 5814 } 5815 5816 static int 5817 ibd_start(ibd_state_t *state) 5818 { 5819 int err; 5820 ibt_status_t ret; 5821 int late_hca_init = 0; 5822 5823 if (state->id_mac_state & IBD_DRV_STARTED) 5824 return (DDI_SUCCESS); 5825 5826 /* 5827 * We do not increment the running flag when calling ibd_start() as 5828 * a result of some event which moves the state away from late HCA 5829 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability. 5830 */ 5831 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 5832 (atomic_inc_32_nv(&state->id_running) != 1)) { 5833 DPRINT(10, "ibd_start: id_running is non-zero"); 5834 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 5835 atomic_dec_32(&state->id_running); 5836 return (EINVAL); 5837 } 5838 5839 /* 5840 * Get port details; if we fail here, something bad happened. 5841 * Fail plumb. 5842 */ 5843 if ((err = ibd_get_port_details(state)) != 0) { 5844 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 5845 goto start_fail; 5846 } 5847 /* 5848 * If state->id_link_state is DOWN, it indicates that either the port 5849 * is down, or the pkey is not available. In both cases, resort to late 5850 * initialization. Register for subnet notices, and return success. 5851 */ 5852 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 5853 if (state->id_link_state == LINK_STATE_DOWN) { 5854 late_hca_init = 1; 5855 goto late_hca_init_return; 5856 } 5857 5858 /* 5859 * Find the IPoIB broadcast group 5860 */ 5861 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 5862 /* Resort to late initialization */ 5863 late_hca_init = 1; 5864 goto reg_snet_notices; 5865 } 5866 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 5867 5868 /* 5869 * Initialize per-interface caches and lists; if we fail here, 5870 * it is most likely due to a lack of resources 5871 */ 5872 if (ibd_acache_init(state) != DDI_SUCCESS) { 5873 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 5874 err = ENOMEM; 5875 goto start_fail; 5876 } 5877 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 5878 5879 /* 5880 * Allocate send and receive completion queues 5881 */ 5882 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 5883 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 5884 err = ENOMEM; 5885 goto start_fail; 5886 } 5887 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 5888 5889 /* 5890 * Setup a UD channel 5891 */ 5892 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 5893 err = ENOMEM; 5894 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 5895 goto start_fail; 5896 } 5897 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 5898 5899 /* 5900 * Allocate and initialize the tx buffer list 5901 */ 5902 if (ibd_init_txlist(state) != DDI_SUCCESS) { 5903 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 5904 err = ENOMEM; 5905 goto start_fail; 5906 } 5907 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 5908 5909 /* 5910 * Create the send cq handler here 5911 */ 5912 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 5913 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 5914 
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5915 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 5916 "failed, ret=%d", ret); 5917 err = EINVAL; 5918 goto start_fail; 5919 } 5920 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 5921 5922 /* 5923 * Allocate and initialize the rx buffer list 5924 */ 5925 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 5926 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 5927 err = ENOMEM; 5928 goto start_fail; 5929 } 5930 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 5931 5932 /* 5933 * Join IPoIB broadcast group 5934 */ 5935 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 5936 DPRINT(10, "ibd_start: ibd_join_group() failed"); 5937 err = ENOTACTIVE; 5938 goto start_fail; 5939 } 5940 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 5941 5942 /* 5943 * When we did mac_register() in ibd_attach(), we didn't register 5944 * the real macaddr and we didn't have the true port mtu. Now that 5945 * we're almost ready, set the local mac address and broadcast 5946 * addresses and update gldv3 about the real values of these 5947 * parameters. 5948 */ 5949 if (state->id_enable_rc) { 5950 ibd_h2n_mac(&state->id_macaddr, 5951 IBD_MAC_ADDR_RC + state->id_qpnum, 5952 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5953 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, 5954 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5955 } else { 5956 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 5957 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5958 } 5959 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 5960 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 5961 5962 if (!state->id_enable_rc) { 5963 (void) mac_maxsdu_update(state->id_mh, state->id_mtu 5964 - IPOIB_HDRSIZE); 5965 } 5966 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 5967 5968 /* 5969 * Setup the receive cq handler 5970 */ 5971 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 5972 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 5973 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5974 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 5975 "failed, ret=%d", ret); 5976 err = EINVAL; 5977 goto start_fail; 5978 } 5979 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 5980 5981 reg_snet_notices: 5982 /* 5983 * In case of normal initialization sequence, 5984 * Setup the subnet notices handler after we've initialized the acache/ 5985 * mcache and started the async thread, both of which are required for 5986 * the trap handler to function properly. 5987 * 5988 * Now that the async thread has been started (and we've already done 5989 * a mac_register() during attach so mac_tx_update() can be called 5990 * if necessary without any problem), we can enable the trap handler 5991 * to queue requests to the async thread. 5992 * 5993 * In case of late hca initialization, the subnet notices handler will 5994 * only handle MCG created/deleted event. The action performed as part 5995 * of handling these events is to start the interface. So, the 5996 * acache/mcache initialization is not a necessity in such cases for 5997 * registering the subnet notices handler. Also, if we are in 5998 * ibd_start() as a result of, say, some event handling after entering 5999 * late hca initialization phase no need to register again. 
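 * For example, if ibd_start() first ran in the late HCA initialization
 * path and registered the handler, a later call triggered by an
 * MCG_CREATED or PORT_CHANGE event finds IBD_DRV_SM_NOTICES_REGISTERED
 * already set and skips the ibt_register_subnet_notices() call below.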
6000 */ 6001 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) { 6002 ibt_register_subnet_notices(state->id_ibt_hdl, 6003 ibd_snet_notices_handler, state); 6004 mutex_enter(&state->id_trap_lock); 6005 state->id_trap_stop = B_FALSE; 6006 mutex_exit(&state->id_trap_lock); 6007 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 6008 } 6009 6010 late_hca_init_return: 6011 if (late_hca_init == 1) { 6012 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT; 6013 /* 6014 * In case of late initialization, mark the link state as down, 6015 * immaterial of the actual link state as reported in the 6016 * port_info. 6017 */ 6018 state->id_link_state = LINK_STATE_DOWN; 6019 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 6020 mac_link_update(state->id_mh, state->id_link_state); 6021 return (DDI_SUCCESS); 6022 } 6023 6024 if (state->id_enable_rc) { 6025 if (state->rc_enable_srq) { 6026 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 6027 if (ibd_rc_repost_srq_free_list(state) != 6028 IBT_SUCCESS) { 6029 err = ENOMEM; 6030 goto start_fail; 6031 } 6032 } else { 6033 /* Allocate SRQ resource */ 6034 if (ibd_rc_init_srq_list(state) != 6035 IBT_SUCCESS) { 6036 err = ENOMEM; 6037 goto start_fail; 6038 } 6039 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 6040 } 6041 } 6042 6043 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 6044 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 6045 "failed"); 6046 err = ENOMEM; 6047 goto start_fail; 6048 } 6049 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 6050 6051 /* RC: begin to listen only after everything is available */ 6052 if (ibd_rc_listen(state) != IBT_SUCCESS) { 6053 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 6054 err = EINVAL; 6055 goto start_fail; 6056 } 6057 state->id_mac_state |= IBD_DRV_RC_LISTEN; 6058 } 6059 6060 /* 6061 * Indicate link status to GLDv3 and higher layers. By default, 6062 * we assume we are in up state (which must have been true at 6063 * least at the time the broadcast mcg's were probed); if there 6064 * were any up/down transitions till the time we come here, the 6065 * async handler will have updated last known state, which we 6066 * use to tell GLDv3. The async handler will not send any 6067 * notifications to GLDv3 till we reach here in the initialization 6068 * sequence. 6069 */ 6070 mac_link_update(state->id_mh, state->id_link_state); 6071 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT; 6072 state->id_mac_state |= IBD_DRV_STARTED; 6073 6074 return (DDI_SUCCESS); 6075 6076 start_fail: 6077 /* 6078 * If we ran into a problem during ibd_start() and ran into 6079 * some other problem during undoing our partial work, we can't 6080 * do anything about it. Ignore any errors we might get from 6081 * ibd_undo_start() and just return the original error we got. 6082 */ 6083 (void) ibd_undo_start(state, LINK_STATE_DOWN); 6084 return (err); 6085 } 6086 6087 /* 6088 * GLDv3 entry point to stop hardware from receiving packets. 6089 */ 6090 /*ARGSUSED*/ 6091 static void 6092 ibd_m_stop(void *arg) 6093 { 6094 ibd_state_t *state = (ibd_state_t *)arg; 6095 6096 if (state->id_type == IBD_PORT_DRIVER) 6097 return; 6098 6099 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 6100 6101 (void) ibd_undo_start(state, state->id_link_state); 6102 6103 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 6104 } 6105 6106 /* 6107 * GLDv3 entry point to modify device's mac address. We do not 6108 * allow address modifications. 
6109 */ 6110 static int 6111 ibd_m_unicst(void *arg, const uint8_t *macaddr) 6112 { 6113 ibd_state_t *state = arg; 6114 6115 if (state->id_type == IBD_PORT_DRIVER) 6116 return (EINVAL); 6117 6118 /* 6119 * Don't bother even comparing the macaddr if we haven't 6120 * completed ibd_m_start(). 6121 */ 6122 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6123 return (0); 6124 6125 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 6126 return (0); 6127 else 6128 return (EINVAL); 6129 } 6130 6131 /* 6132 * The blocking part of the IBA join/leave operations are done out 6133 * of here on the async thread. 6134 */ 6135 static void 6136 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 6137 { 6138 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 6139 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 6140 6141 if (op == IBD_ASYNC_JOIN) { 6142 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 6143 ibd_print_warn(state, "Join multicast group failed :" 6144 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 6145 } 6146 } else { 6147 /* 6148 * Here, we must search for the proper mcg_info and 6149 * use that to leave the group. 6150 */ 6151 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 6152 } 6153 } 6154 6155 /* 6156 * GLDv3 entry point for multicast enable/disable requests. 6157 * This function queues the operation to the async thread and 6158 * return success for a valid multicast address. 6159 */ 6160 static int 6161 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 6162 { 6163 ibd_state_t *state = (ibd_state_t *)arg; 6164 ipoib_mac_t maddr, *mcast; 6165 ib_gid_t mgid; 6166 ibd_req_t *req; 6167 6168 if (state->id_type == IBD_PORT_DRIVER) 6169 return (EINVAL); 6170 6171 /* 6172 * If we haven't completed ibd_m_start(), async thread wouldn't 6173 * have been started and id_bcaddr wouldn't be set, so there's 6174 * no point in continuing. 6175 */ 6176 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6177 return (0); 6178 6179 /* 6180 * The incoming multicast address might not be aligned properly 6181 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 6182 * it to look like one though, to get the offsets of the mc gid, 6183 * since we know we are not going to dereference any values with 6184 * the ipoib_mac_t pointer. 6185 */ 6186 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 6187 mcast = &maddr; 6188 6189 /* 6190 * Check validity of MCG address. We could additionally check 6191 * that a enable/disable is not being issued on the "broadcast" 6192 * mcg, but since this operation is only invokable by privileged 6193 * programs anyway, we allow the flexibility to those dlpi apps. 6194 * Note that we do not validate the "scope" of the IBA mcg. 6195 */ 6196 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 6197 return (EINVAL); 6198 6199 /* 6200 * fill in multicast pkey and scope 6201 */ 6202 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 6203 6204 /* 6205 * If someone is trying to JOIN/LEAVE the broadcast group, we do 6206 * nothing (i.e. we stay JOINed to the broadcast group done in 6207 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 6208 * requires to be joined to broadcast groups at all times. 6209 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 6210 * depends on this. 
6211 */ 6212 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6213 return (0); 6214 6215 ibd_n2h_gid(mcast, &mgid); 6216 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6217 if (req == NULL) 6218 return (ENOMEM); 6219 6220 req->rq_gid = mgid; 6221 6222 if (add) { 6223 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 6224 mgid.gid_prefix, mgid.gid_guid); 6225 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 6226 } else { 6227 DPRINT(1, "ibd_m_multicst : unset_multicast : " 6228 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 6229 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 6230 } 6231 return (0); 6232 } 6233 6234 /* 6235 * The blocking part of the IBA promiscuous operations are done 6236 * out of here on the async thread. The dlpireq parameter indicates 6237 * whether this invocation is due to a dlpi request or due to 6238 * a port up/down event. 6239 */ 6240 static void 6241 ibd_async_unsetprom(ibd_state_t *state) 6242 { 6243 ibd_mce_t *mce = list_head(&state->id_mc_non); 6244 ib_gid_t mgid; 6245 6246 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 6247 6248 while (mce != NULL) { 6249 mgid = mce->mc_info.mc_adds_vect.av_dgid; 6250 mce = list_next(&state->id_mc_non, mce); 6251 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 6252 } 6253 state->id_prom_op = IBD_OP_NOTSTARTED; 6254 } 6255 6256 /* 6257 * The blocking part of the IBA promiscuous operations are done 6258 * out of here on the async thread. The dlpireq parameter indicates 6259 * whether this invocation is due to a dlpi request or due to 6260 * a port up/down event. 6261 */ 6262 static void 6263 ibd_async_setprom(ibd_state_t *state) 6264 { 6265 ibt_mcg_attr_t mcg_attr; 6266 ibt_mcg_info_t *mcg_info; 6267 ib_gid_t mgid; 6268 uint_t numg; 6269 int i; 6270 char ret = IBD_OP_COMPLETED; 6271 6272 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 6273 6274 /* 6275 * Obtain all active MC groups on the IB fabric with 6276 * specified criteria (scope + Pkey + Qkey + mtu). 6277 */ 6278 bzero(&mcg_attr, sizeof (mcg_attr)); 6279 mcg_attr.mc_pkey = state->id_pkey; 6280 mcg_attr.mc_scope = state->id_scope; 6281 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 6282 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 6283 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 6284 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 6285 IBT_SUCCESS) { 6286 ibd_print_warn(state, "Could not get list of IBA multicast " 6287 "groups"); 6288 ret = IBD_OP_ERRORED; 6289 goto done; 6290 } 6291 6292 /* 6293 * Iterate over the returned mcg's and join as NonMember 6294 * to the IP mcg's. 6295 */ 6296 for (i = 0; i < numg; i++) { 6297 /* 6298 * Do a NonMember JOIN on the MC group. 6299 */ 6300 mgid = mcg_info[i].mc_adds_vect.av_dgid; 6301 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 6302 ibd_print_warn(state, "IBA promiscuous mode missed " 6303 "multicast gid %016llx:%016llx", 6304 (u_longlong_t)mgid.gid_prefix, 6305 (u_longlong_t)mgid.gid_guid); 6306 } 6307 6308 ibt_free_mcg_info(mcg_info, numg); 6309 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 6310 done: 6311 state->id_prom_op = ret; 6312 } 6313 6314 /* 6315 * GLDv3 entry point for multicast promiscuous enable/disable requests. 6316 * GLDv3 assumes phys state receives more packets than multi state, 6317 * which is not true for IPoIB. Thus, treat the multi and phys 6318 * promiscuous states the same way to work with GLDv3's assumption. 
6319 */ 6320 static int 6321 ibd_m_promisc(void *arg, boolean_t on) 6322 { 6323 ibd_state_t *state = (ibd_state_t *)arg; 6324 ibd_req_t *req; 6325 6326 if (state->id_type == IBD_PORT_DRIVER) 6327 return (EINVAL); 6328 6329 /* 6330 * Async thread wouldn't have been started if we haven't 6331 * passed ibd_m_start() 6332 */ 6333 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6334 return (0); 6335 6336 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6337 if (req == NULL) 6338 return (ENOMEM); 6339 if (on) { 6340 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 6341 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 6342 } else { 6343 DPRINT(1, "ibd_m_promisc : unset_promisc"); 6344 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 6345 } 6346 6347 return (0); 6348 } 6349 6350 /* 6351 * GLDv3 entry point for gathering statistics. 6352 */ 6353 static int 6354 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 6355 { 6356 ibd_state_t *state = (ibd_state_t *)arg; 6357 6358 switch (stat) { 6359 case MAC_STAT_IFSPEED: 6360 *val = state->id_link_speed; 6361 break; 6362 case MAC_STAT_MULTIRCV: 6363 *val = state->id_multi_rcv; 6364 break; 6365 case MAC_STAT_BRDCSTRCV: 6366 *val = state->id_brd_rcv; 6367 break; 6368 case MAC_STAT_MULTIXMT: 6369 *val = state->id_multi_xmt; 6370 break; 6371 case MAC_STAT_BRDCSTXMT: 6372 *val = state->id_brd_xmt; 6373 break; 6374 case MAC_STAT_RBYTES: 6375 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 6376 + state->rc_rcv_copy_byte; 6377 break; 6378 case MAC_STAT_IPACKETS: 6379 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 6380 + state->rc_rcv_copy_pkt; 6381 break; 6382 case MAC_STAT_OBYTES: 6383 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 6384 break; 6385 case MAC_STAT_OPACKETS: 6386 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 6387 state->rc_xmt_fragmented_pkt + 6388 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 6389 break; 6390 case MAC_STAT_OERRORS: 6391 *val = state->id_ah_error; /* failed AH translation */ 6392 break; 6393 case MAC_STAT_IERRORS: 6394 *val = 0; 6395 break; 6396 case MAC_STAT_NOXMTBUF: 6397 *val = state->id_tx_short + state->rc_swqe_short + 6398 state->rc_xmt_buf_short; 6399 break; 6400 case MAC_STAT_NORCVBUF: 6401 default: 6402 return (ENOTSUP); 6403 } 6404 6405 return (0); 6406 } 6407 6408 static void 6409 ibd_async_txsched(ibd_state_t *state) 6410 { 6411 ibd_resume_transmission(state); 6412 } 6413 6414 static void 6415 ibd_resume_transmission(ibd_state_t *state) 6416 { 6417 int flag; 6418 int met_thresh = 0; 6419 int thresh = 0; 6420 int ret = -1; 6421 6422 mutex_enter(&state->id_sched_lock); 6423 if (state->id_sched_needed & IBD_RSRC_SWQE) { 6424 mutex_enter(&state->id_tx_list.dl_mutex); 6425 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6426 met_thresh = state->id_tx_list.dl_cnt + 6427 state->id_tx_rel_list.dl_cnt; 6428 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6429 mutex_exit(&state->id_tx_list.dl_mutex); 6430 thresh = IBD_FREE_SWQES_THRESH; 6431 flag = IBD_RSRC_SWQE; 6432 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 6433 ASSERT(state->id_lso != NULL); 6434 mutex_enter(&state->id_lso_lock); 6435 met_thresh = state->id_lso->bkt_nfree; 6436 thresh = IBD_FREE_LSOS_THRESH; 6437 mutex_exit(&state->id_lso_lock); 6438 flag = IBD_RSRC_LSOBUF; 6439 if (met_thresh > thresh) 6440 state->id_sched_lso_cnt++; 6441 } 6442 if (met_thresh > thresh) { 6443 state->id_sched_needed &= ~flag; 6444 state->id_sched_cnt++; 6445 ret = 0; 6446 } 6447 mutex_exit(&state->id_sched_lock); 6448 6449 if (ret == 0) 
6450 mac_tx_update(state->id_mh); 6451 } 6452 6453 /* 6454 * Release the send wqe back into free list. 6455 */ 6456 static void 6457 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 6458 { 6459 /* 6460 * Add back on Tx list for reuse. 6461 */ 6462 ASSERT(tail->swqe_next == NULL); 6463 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6464 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 6465 tail->swqe_next = state->id_tx_rel_list.dl_head; 6466 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 6467 state->id_tx_rel_list.dl_cnt += n; 6468 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6469 } 6470 6471 /* 6472 * Acquire a send wqe from free list. 6473 * Returns error number and send wqe pointer. 6474 */ 6475 static ibd_swqe_t * 6476 ibd_acquire_swqe(ibd_state_t *state) 6477 { 6478 ibd_swqe_t *wqe; 6479 6480 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6481 if (state->id_tx_rel_list.dl_head != NULL) { 6482 /* transfer id_tx_rel_list to id_tx_list */ 6483 state->id_tx_list.dl_head = 6484 state->id_tx_rel_list.dl_head; 6485 state->id_tx_list.dl_cnt = 6486 state->id_tx_rel_list.dl_cnt; 6487 state->id_tx_list.dl_pending_sends = B_FALSE; 6488 6489 /* clear id_tx_rel_list */ 6490 state->id_tx_rel_list.dl_head = NULL; 6491 state->id_tx_rel_list.dl_cnt = 0; 6492 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6493 6494 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 6495 state->id_tx_list.dl_cnt -= 1; 6496 state->id_tx_list.dl_head = wqe->swqe_next; 6497 } else { /* no free swqe */ 6498 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6499 state->id_tx_list.dl_pending_sends = B_TRUE; 6500 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 6501 state->id_tx_short++; 6502 wqe = NULL; 6503 } 6504 return (wqe); 6505 } 6506 6507 static int 6508 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 6509 ibt_ud_dest_hdl_t ud_dest) 6510 { 6511 mblk_t *nmp; 6512 int iph_len, tcph_len; 6513 ibt_wr_lso_t *lso; 6514 uintptr_t ip_start, tcp_start; 6515 uint8_t *dst; 6516 uint_t pending, mblen; 6517 6518 /* 6519 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 6520 * we need to adjust it here for lso. 6521 */ 6522 lso = &(node->w_swr.wr.ud_lso); 6523 lso->lso_ud_dest = ud_dest; 6524 lso->lso_mss = mss; 6525 6526 /* 6527 * Calculate the LSO header size and set it in the UD LSO structure. 6528 * Note that the only assumption we make is that each of the IPoIB, 6529 * IP and TCP headers will be contained in a single mblk fragment; 6530 * together, the headers may span multiple mblk fragments. 6531 */ 6532 nmp = mp; 6533 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 6534 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 6535 ip_start = (uintptr_t)nmp->b_cont->b_rptr 6536 + (ip_start - (uintptr_t)(nmp->b_wptr)); 6537 nmp = nmp->b_cont; 6538 6539 } 6540 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 6541 6542 tcp_start = ip_start + iph_len; 6543 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 6544 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 6545 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 6546 nmp = nmp->b_cont; 6547 } 6548 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 6549 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 6550 6551 /* 6552 * If the lso header fits entirely within a single mblk fragment, 6553 * we'll avoid an additional copy of the lso header here and just 6554 * pass the b_rptr of the mblk directly. 6555 * 6556 * If this isn't true, we'd have to allocate for it explicitly. 
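 * For example (illustrative sizes): with the 4 byte IPoIB header, a
 * 20 byte IP header and a 32 byte TCP header carrying timestamps,
 * lso_hdr_sz works out to 56; if the first mblk holds at least that
 * many bytes the header is referenced in place, otherwise a private
 * buffer is allocated below and the header copied into it.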
6557 */ 6558 if (lso->lso_hdr_sz <= MBLKL(mp)) { 6559 lso->lso_hdr = mp->b_rptr; 6560 } else { 6561 /* On work completion, remember to free this allocated hdr */ 6562 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 6563 if (lso->lso_hdr == NULL) { 6564 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 6565 "sz = %d", lso->lso_hdr_sz); 6566 lso->lso_hdr_sz = 0; 6567 lso->lso_mss = 0; 6568 return (-1); 6569 } 6570 } 6571 6572 /* 6573 * Copy in the lso header only if we need to 6574 */ 6575 if (lso->lso_hdr != mp->b_rptr) { 6576 dst = lso->lso_hdr; 6577 pending = lso->lso_hdr_sz; 6578 6579 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 6580 mblen = MBLKL(nmp); 6581 if (pending > mblen) { 6582 bcopy(nmp->b_rptr, dst, mblen); 6583 dst += mblen; 6584 pending -= mblen; 6585 } else { 6586 bcopy(nmp->b_rptr, dst, pending); 6587 break; 6588 } 6589 } 6590 } 6591 6592 return (0); 6593 } 6594 6595 static void 6596 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 6597 { 6598 ibt_wr_lso_t *lso; 6599 6600 if ((!node) || (!mp)) 6601 return; 6602 6603 /* 6604 * Free any header space that we might've allocated if we 6605 * did an LSO 6606 */ 6607 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 6608 lso = &(node->w_swr.wr.ud_lso); 6609 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 6610 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 6611 lso->lso_hdr = NULL; 6612 lso->lso_hdr_sz = 0; 6613 } 6614 } 6615 } 6616 6617 static void 6618 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 6619 { 6620 uint_t i; 6621 uint_t num_posted; 6622 uint_t n_wrs; 6623 ibt_status_t ibt_status; 6624 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 6625 ibd_swqe_t *tx_head, *elem; 6626 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 6627 6628 /* post the one request, then check for more */ 6629 ibt_status = ibt_post_send(state->id_chnl_hdl, 6630 &node->w_swr, 1, NULL); 6631 if (ibt_status != IBT_SUCCESS) { 6632 ibd_print_warn(state, "ibd_post_send: " 6633 "posting one wr failed: ret=%d", ibt_status); 6634 ibd_tx_cleanup(state, node); 6635 } 6636 6637 tx_head = NULL; 6638 for (;;) { 6639 if (tx_head == NULL) { 6640 mutex_enter(&state->id_txpost_lock); 6641 tx_head = state->id_tx_head; 6642 if (tx_head == NULL) { 6643 state->id_tx_busy = 0; 6644 mutex_exit(&state->id_txpost_lock); 6645 return; 6646 } 6647 state->id_tx_head = NULL; 6648 mutex_exit(&state->id_txpost_lock); 6649 } 6650 6651 /* 6652 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 6653 * at a time if possible, and keep posting them. 6654 */ 6655 for (n_wrs = 0, elem = tx_head; 6656 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 6657 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 6658 nodes[n_wrs] = elem; 6659 wrs[n_wrs] = elem->w_swr; 6660 } 6661 tx_head = elem; 6662 6663 ASSERT(n_wrs != 0); 6664 6665 /* 6666 * If posting fails for some reason, we'll never receive 6667 * completion intimation, so we'll need to cleanup. But 6668 * we need to make sure we don't clean up nodes whose 6669 * wrs have been successfully posted. We assume that the 6670 * hca driver returns on the first failure to post and 6671 * therefore the first 'num_posted' entries don't need 6672 * cleanup here. 
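 * For instance, if n_wrs is 8 and the failing ibt_post_send() reports
 * num_posted as 3, only nodes[3] through nodes[7] are cleaned up here;
 * the first three wrs were accepted and will be reclaimed through
 * their normal completions.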
6673 */ 6674 num_posted = 0; 6675 ibt_status = ibt_post_send(state->id_chnl_hdl, 6676 wrs, n_wrs, &num_posted); 6677 if (ibt_status != IBT_SUCCESS) { 6678 ibd_print_warn(state, "ibd_post_send: " 6679 "posting multiple wrs failed: " 6680 "requested=%d, done=%d, ret=%d", 6681 n_wrs, num_posted, ibt_status); 6682 6683 for (i = num_posted; i < n_wrs; i++) 6684 ibd_tx_cleanup(state, nodes[i]); 6685 } 6686 } 6687 } 6688 6689 static int 6690 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 6691 uint_t lsohdr_sz) 6692 { 6693 ibt_wr_ds_t *sgl; 6694 ibt_status_t ibt_status; 6695 mblk_t *nmp; 6696 mblk_t *data_mp; 6697 uchar_t *bufp; 6698 size_t blksize; 6699 size_t skip; 6700 size_t avail; 6701 uint_t pktsize; 6702 uint_t frag_len; 6703 uint_t pending_hdr; 6704 int nmblks; 6705 int i; 6706 6707 /* 6708 * Let's skip ahead to the data if this is LSO 6709 */ 6710 data_mp = mp; 6711 pending_hdr = 0; 6712 if (lsohdr_sz) { 6713 pending_hdr = lsohdr_sz; 6714 for (nmp = mp; nmp; nmp = nmp->b_cont) { 6715 frag_len = nmp->b_wptr - nmp->b_rptr; 6716 if (frag_len > pending_hdr) 6717 break; 6718 pending_hdr -= frag_len; 6719 } 6720 data_mp = nmp; /* start of data past lso header */ 6721 ASSERT(data_mp != NULL); 6722 } 6723 6724 /* 6725 * Calculate the size of message data and number of msg blocks 6726 */ 6727 pktsize = 0; 6728 for (nmblks = 0, nmp = data_mp; nmp != NULL; 6729 nmp = nmp->b_cont, nmblks++) { 6730 pktsize += MBLKL(nmp); 6731 } 6732 pktsize -= pending_hdr; 6733 6734 /* 6735 * We only do ibt_map_mem_iov() if the pktsize is above the 6736 * "copy-threshold", and if the number of mp fragments is less than 6737 * the maximum acceptable. 6738 */ 6739 if ((state->id_hca_res_lkey_capab) && 6740 (pktsize > state->id_ud_tx_copy_thresh) && 6741 (nmblks < state->id_max_sqseg_hiwm)) { 6742 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6743 ibt_iov_attr_t iov_attr; 6744 6745 iov_attr.iov_as = NULL; 6746 iov_attr.iov = iov_arr; 6747 iov_attr.iov_buf = NULL; 6748 iov_attr.iov_list_len = nmblks; 6749 iov_attr.iov_wr_nds = state->id_max_sqseg; 6750 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 6751 iov_attr.iov_flags = IBT_IOV_SLEEP; 6752 6753 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 6754 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 6755 iov_arr[i].iov_len = MBLKL(nmp); 6756 if (i == 0) { 6757 iov_arr[i].iov_addr += pending_hdr; 6758 iov_arr[i].iov_len -= pending_hdr; 6759 } 6760 } 6761 6762 node->w_buftype = IBD_WQE_MAPPED; 6763 node->w_swr.wr_sgl = node->w_sgl; 6764 6765 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 6766 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 6767 if (ibt_status != IBT_SUCCESS) { 6768 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 6769 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 6770 goto ibd_copy_path; 6771 } 6772 6773 return (0); 6774 } 6775 6776 ibd_copy_path: 6777 if (pktsize <= state->id_tx_buf_sz) { 6778 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6779 node->w_swr.wr_nds = 1; 6780 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6781 node->w_buftype = IBD_WQE_TXBUF; 6782 6783 /* 6784 * Even though this is the copy path for transfers less than 6785 * id_tx_buf_sz, it could still be an LSO packet. If so, it 6786 * is possible the first data mblk fragment (data_mp) still 6787 * contains part of the LSO header that we need to skip. 
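 * As an example, if a (hypothetical) 56 byte LSO header ends 10 bytes
 * into data_mp, pending_hdr is 10 at this point; the first bcopy below
 * therefore starts 10 bytes into that fragment and copies
 * MBLKL(nmp) - 10 bytes, after which pending_hdr drops to zero and the
 * remaining fragments are copied whole.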
6788 */ 6789 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6790 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 6791 blksize = MBLKL(nmp) - pending_hdr; 6792 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 6793 bufp += blksize; 6794 pending_hdr = 0; 6795 } 6796 6797 return (0); 6798 } 6799 6800 /* 6801 * Copy path for transfers greater than id_tx_buf_sz 6802 */ 6803 node->w_swr.wr_sgl = node->w_sgl; 6804 if (ibd_acquire_lsobufs(state, pktsize, 6805 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 6806 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 6807 return (-1); 6808 } 6809 node->w_buftype = IBD_WQE_LSOBUF; 6810 6811 /* 6812 * Copy the larger-than-id_tx_buf_sz packet into a set of 6813 * fixed-sized, pre-mapped LSO buffers. Note that we might 6814 * need to skip part of the LSO header in the first fragment 6815 * as before. 6816 */ 6817 nmp = data_mp; 6818 skip = pending_hdr; 6819 for (i = 0; i < node->w_swr.wr_nds; i++) { 6820 sgl = node->w_swr.wr_sgl + i; 6821 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 6822 avail = IBD_LSO_BUFSZ; 6823 while (nmp && avail) { 6824 blksize = MBLKL(nmp) - skip; 6825 if (blksize > avail) { 6826 bcopy(nmp->b_rptr + skip, bufp, avail); 6827 skip += avail; 6828 avail = 0; 6829 } else { 6830 bcopy(nmp->b_rptr + skip, bufp, blksize); 6831 skip = 0; 6832 avail -= blksize; 6833 bufp += blksize; 6834 nmp = nmp->b_cont; 6835 } 6836 } 6837 } 6838 6839 return (0); 6840 } 6841 6842 /* 6843 * Schedule a completion queue polling to reap the resource we're 6844 * short on. If we implement the change to reap tx completions 6845 * in a separate thread, we'll need to wake up that thread here. 6846 */ 6847 static int 6848 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 6849 { 6850 ibd_req_t *req; 6851 6852 mutex_enter(&state->id_sched_lock); 6853 state->id_sched_needed |= resource_type; 6854 mutex_exit(&state->id_sched_lock); 6855 6856 /* 6857 * If we are asked to queue a work entry, we need to do it 6858 */ 6859 if (q_flag) { 6860 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6861 if (req == NULL) 6862 return (-1); 6863 6864 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 6865 } 6866 6867 return (0); 6868 } 6869 6870 /* 6871 * The passed in packet has this format: 6872 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 6873 */ 6874 static boolean_t 6875 ibd_send(ibd_state_t *state, mblk_t *mp) 6876 { 6877 ibd_ace_t *ace; 6878 ibd_swqe_t *node; 6879 ipoib_mac_t *dest; 6880 ib_header_info_t *ipibp; 6881 ip6_t *ip6h; 6882 uint_t pktsize; 6883 uint32_t mss; 6884 uint32_t hckflags; 6885 uint32_t lsoflags = 0; 6886 uint_t lsohdr_sz = 0; 6887 int ret, len; 6888 boolean_t dofree = B_FALSE; 6889 boolean_t rc; 6890 /* if (rc_chan == NULL) send by UD; else send by RC; */ 6891 ibd_rc_chan_t *rc_chan; 6892 int nmblks; 6893 mblk_t *nmp; 6894 6895 /* 6896 * If we aren't done with the device initialization and start, 6897 * we shouldn't be here. 6898 */ 6899 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6900 return (B_FALSE); 6901 6902 /* 6903 * Obtain an address handle for the destination. 
6904 */ 6905 ipibp = (ib_header_info_t *)mp->b_rptr; 6906 dest = (ipoib_mac_t *)&ipibp->ib_dst; 6907 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6908 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 6909 6910 rc_chan = NULL; 6911 ace = ibd_acache_lookup(state, dest, &ret, 1); 6912 if (state->id_enable_rc && (ace != NULL) && 6913 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 6914 if (ace->ac_chan == NULL) { 6915 state->rc_null_conn++; 6916 } else { 6917 if (ace->ac_chan->chan_state == 6918 IBD_RC_STATE_ACT_ESTAB) { 6919 rc_chan = ace->ac_chan; 6920 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 6921 node = WQE_TO_SWQE( 6922 rc_chan->tx_wqe_list.dl_head); 6923 if (node != NULL) { 6924 rc_chan->tx_wqe_list.dl_cnt -= 1; 6925 rc_chan->tx_wqe_list.dl_head = 6926 node->swqe_next; 6927 } else { 6928 node = ibd_rc_acquire_swqes(rc_chan); 6929 } 6930 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 6931 6932 if (node == NULL) { 6933 state->rc_swqe_short++; 6934 mutex_enter(&state->id_sched_lock); 6935 state->id_sched_needed |= 6936 IBD_RSRC_RC_SWQE; 6937 mutex_exit(&state->id_sched_lock); 6938 ibd_dec_ref_ace(state, ace); 6939 return (B_FALSE); 6940 } 6941 } else { 6942 state->rc_no_estab_conn++; 6943 } 6944 } 6945 } 6946 6947 if (rc_chan == NULL) { 6948 mutex_enter(&state->id_tx_list.dl_mutex); 6949 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 6950 if (node != NULL) { 6951 state->id_tx_list.dl_cnt -= 1; 6952 state->id_tx_list.dl_head = node->swqe_next; 6953 } else { 6954 node = ibd_acquire_swqe(state); 6955 } 6956 mutex_exit(&state->id_tx_list.dl_mutex); 6957 if (node == NULL) { 6958 /* 6959 * If we don't have an swqe available, schedule a 6960 * transmit completion queue cleanup and hold off on 6961 * sending more packets until we have some free swqes 6962 */ 6963 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 6964 if (ace != NULL) { 6965 ibd_dec_ref_ace(state, ace); 6966 } 6967 return (B_FALSE); 6968 } 6969 6970 /* 6971 * If a poll cannot be scheduled, we have no choice but 6972 * to drop this packet 6973 */ 6974 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 6975 if (ace != NULL) { 6976 ibd_dec_ref_ace(state, ace); 6977 } 6978 return (B_TRUE); 6979 } 6980 } 6981 6982 /* 6983 * Initialize the commonly used fields in swqe to NULL to protect 6984 * against ibd_tx_cleanup accidentally misinterpreting these on a 6985 * failure. 6986 */ 6987 node->swqe_im_mblk = NULL; 6988 node->w_swr.wr_nds = 0; 6989 node->w_swr.wr_sgl = NULL; 6990 node->w_swr.wr_opcode = IBT_WRC_SEND; 6991 6992 /* 6993 * Calculate the size of message data and number of msg blocks 6994 */ 6995 pktsize = 0; 6996 for (nmblks = 0, nmp = mp; nmp != NULL; 6997 nmp = nmp->b_cont, nmblks++) { 6998 pktsize += MBLKL(nmp); 6999 } 7000 7001 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 7002 atomic_inc_64(&state->id_brd_xmt); 7003 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 7004 atomic_inc_64(&state->id_multi_xmt); 7005 7006 if (ace != NULL) { 7007 node->w_ahandle = ace; 7008 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 7009 } else { 7010 DPRINT(5, 7011 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 7012 ((ret == EFAULT) ? 
"failed" : "queued"), 7013 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 7014 htonl(dest->ipoib_gidpref[1]), 7015 htonl(dest->ipoib_gidsuff[0]), 7016 htonl(dest->ipoib_gidsuff[1])); 7017 state->rc_ace_not_found++; 7018 node->w_ahandle = NULL; 7019 7020 /* 7021 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 7022 * can not find a path for the specific dest address. We 7023 * should get rid of this kind of packet. We also should get 7024 * rid of the packet if we cannot schedule a poll via the 7025 * async thread. For the normal case, ibd will return the 7026 * packet to upper layer and wait for AH creating. 7027 * 7028 * Note that we always queue a work slot entry for the async 7029 * thread when we fail AH lookup (even in intr mode); this is 7030 * due to the convoluted way the code currently looks for AH. 7031 */ 7032 if (ret == EFAULT) { 7033 dofree = B_TRUE; 7034 rc = B_TRUE; 7035 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 7036 dofree = B_TRUE; 7037 rc = B_TRUE; 7038 } else { 7039 dofree = B_FALSE; 7040 rc = B_FALSE; 7041 } 7042 goto ibd_send_fail; 7043 } 7044 7045 /* 7046 * For ND6 packets, padding is at the front of the source lladdr. 7047 * Insert the padding at front. 7048 */ 7049 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 7050 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 7051 if (!pullupmsg(mp, IPV6_HDR_LEN + 7052 sizeof (ib_header_info_t))) { 7053 DPRINT(10, "ibd_send: pullupmsg failure "); 7054 dofree = B_TRUE; 7055 rc = B_TRUE; 7056 goto ibd_send_fail; 7057 } 7058 ipibp = (ib_header_info_t *)mp->b_rptr; 7059 } 7060 ip6h = (ip6_t *)((uchar_t *)ipibp + 7061 sizeof (ib_header_info_t)); 7062 len = ntohs(ip6h->ip6_plen); 7063 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 7064 mblk_t *pad; 7065 7066 pad = allocb(4, 0); 7067 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 7068 linkb(mp, pad); 7069 if (MBLKL(mp) < sizeof (ib_header_info_t) + 7070 IPV6_HDR_LEN + len + 4) { 7071 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 7072 IPV6_HDR_LEN + len + 4)) { 7073 DPRINT(10, "ibd_send: pullupmsg " 7074 "failure "); 7075 dofree = B_TRUE; 7076 rc = B_TRUE; 7077 goto ibd_send_fail; 7078 } 7079 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 7080 sizeof (ib_header_info_t)); 7081 } 7082 7083 /* LINTED: E_CONSTANT_CONDITION */ 7084 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 7085 } 7086 } 7087 7088 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t)); 7089 mp->b_rptr += sizeof (ib_addrs_t); 7090 pktsize -= sizeof (ib_addrs_t); 7091 7092 if (rc_chan) { /* send in RC mode */ 7093 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 7094 ibt_iov_attr_t iov_attr; 7095 uint_t i; 7096 size_t blksize; 7097 uchar_t *bufp; 7098 ibd_rc_tx_largebuf_t *lbufp; 7099 7100 atomic_add_64(&state->rc_xmt_bytes, pktsize); 7101 7102 /* 7103 * Upper layer does Tx checksum, we don't need do any 7104 * checksum here. 7105 */ 7106 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV); 7107 7108 /* 7109 * We only do ibt_map_mem_iov() if the pktsize is above 7110 * the "copy-threshold", and if the number of mp 7111 * fragments is less than the maximum acceptable. 7112 */ 7113 if (pktsize <= state->id_rc_tx_copy_thresh) { 7114 atomic_inc_64(&state->rc_xmt_small_pkt); 7115 /* 7116 * Only process unicast packet in Reliable Connected 7117 * mode. 
7118 */ 7119 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 7120 node->w_swr.wr_nds = 1; 7121 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 7122 node->w_buftype = IBD_WQE_TXBUF; 7123 7124 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 7125 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7126 blksize = MBLKL(nmp); 7127 bcopy(nmp->b_rptr, bufp, blksize); 7128 bufp += blksize; 7129 } 7130 freemsg(mp); 7131 ASSERT(node->swqe_im_mblk == NULL); 7132 } else { 7133 if ((state->rc_enable_iov_map) && 7134 (nmblks < state->rc_max_sqseg_hiwm)) { 7135 7136 /* do ibt_map_mem_iov() */ 7137 iov_attr.iov_as = NULL; 7138 iov_attr.iov = iov_arr; 7139 iov_attr.iov_buf = NULL; 7140 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 7141 iov_attr.iov_lso_hdr_sz = 0; 7142 iov_attr.iov_flags = IBT_IOV_SLEEP; 7143 7144 i = 0; 7145 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7146 iov_arr[i].iov_len = MBLKL(nmp); 7147 if (iov_arr[i].iov_len != 0) { 7148 iov_arr[i].iov_addr = (caddr_t) 7149 (void *)nmp->b_rptr; 7150 i++; 7151 } 7152 } 7153 iov_attr.iov_list_len = i; 7154 node->w_swr.wr_sgl = node->w_sgl; 7155 7156 ret = ibt_map_mem_iov(state->id_hca_hdl, 7157 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 7158 &node->w_mi_hdl); 7159 if (ret != IBT_SUCCESS) { 7160 atomic_inc_64( 7161 &state->rc_xmt_map_fail_pkt); 7162 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 7163 ") failed, nmblks=%d, real_nmblks" 7164 "=%d, ret=0x%x", nmblks, i, ret); 7165 goto ibd_rc_large_copy; 7166 } 7167 7168 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 7169 node->w_buftype = IBD_WQE_MAPPED; 7170 node->swqe_im_mblk = mp; 7171 } else { 7172 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 7173 ibd_rc_large_copy: 7174 mutex_enter(&state->rc_tx_large_bufs_lock); 7175 if (state->rc_tx_largebuf_nfree == 0) { 7176 state->rc_xmt_buf_short++; 7177 mutex_exit 7178 (&state->rc_tx_large_bufs_lock); 7179 mutex_enter(&state->id_sched_lock); 7180 state->id_sched_needed |= 7181 IBD_RSRC_RC_TX_LARGEBUF; 7182 mutex_exit(&state->id_sched_lock); 7183 dofree = B_FALSE; 7184 rc = B_FALSE; 7185 /* 7186 * If we don't have Tx large bufs, 7187 * return failure. 
node->w_buftype 7188 * should not be IBD_WQE_RC_COPYBUF, 7189 * otherwise it will cause problem 7190 * in ibd_rc_tx_cleanup() 7191 */ 7192 node->w_buftype = IBD_WQE_TXBUF; 7193 goto ibd_send_fail; 7194 } 7195 7196 lbufp = state->rc_tx_largebuf_free_head; 7197 ASSERT(lbufp->lb_buf != NULL); 7198 state->rc_tx_largebuf_free_head = 7199 lbufp->lb_next; 7200 lbufp->lb_next = NULL; 7201 /* Update nfree count */ 7202 state->rc_tx_largebuf_nfree --; 7203 mutex_exit(&state->rc_tx_large_bufs_lock); 7204 bufp = lbufp->lb_buf; 7205 node->w_sgl[0].ds_va = 7206 (ib_vaddr_t)(uintptr_t)bufp; 7207 node->w_sgl[0].ds_key = 7208 state->rc_tx_mr_desc.md_lkey; 7209 node->w_sgl[0].ds_len = pktsize; 7210 node->w_swr.wr_sgl = node->w_sgl; 7211 node->w_swr.wr_nds = 1; 7212 node->w_buftype = IBD_WQE_RC_COPYBUF; 7213 node->w_rc_tx_largebuf = lbufp; 7214 7215 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 7216 blksize = MBLKL(nmp); 7217 if (blksize != 0) { 7218 bcopy(nmp->b_rptr, bufp, 7219 blksize); 7220 bufp += blksize; 7221 } 7222 } 7223 freemsg(mp); 7224 ASSERT(node->swqe_im_mblk == NULL); 7225 } 7226 } 7227 7228 node->swqe_next = NULL; 7229 mutex_enter(&rc_chan->tx_post_lock); 7230 if (rc_chan->tx_busy) { 7231 if (rc_chan->tx_head) { 7232 rc_chan->tx_tail->swqe_next = 7233 SWQE_TO_WQE(node); 7234 } else { 7235 rc_chan->tx_head = node; 7236 } 7237 rc_chan->tx_tail = node; 7238 mutex_exit(&rc_chan->tx_post_lock); 7239 } else { 7240 rc_chan->tx_busy = 1; 7241 mutex_exit(&rc_chan->tx_post_lock); 7242 ibd_rc_post_send(rc_chan, node); 7243 } 7244 7245 return (B_TRUE); 7246 } /* send by RC */ 7247 7248 if ((state->id_enable_rc) && (pktsize > state->id_mtu)) { 7249 /* 7250 * Too long pktsize. The packet size from GLD should <= 7251 * state->id_mtu + sizeof (ib_addrs_t) 7252 */ 7253 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) { 7254 ibd_req_t *req; 7255 7256 mutex_enter(&ace->tx_too_big_mutex); 7257 if (ace->tx_too_big_ongoing) { 7258 mutex_exit(&ace->tx_too_big_mutex); 7259 state->rc_xmt_reenter_too_long_pkt++; 7260 dofree = B_TRUE; 7261 } else { 7262 ace->tx_too_big_ongoing = B_TRUE; 7263 mutex_exit(&ace->tx_too_big_mutex); 7264 state->rc_xmt_icmp_too_long_pkt++; 7265 7266 req = kmem_cache_alloc(state->id_req_kmc, 7267 KM_NOSLEEP); 7268 if (req == NULL) { 7269 ibd_print_warn(state, "ibd_send: alloc " 7270 "ibd_req_t fail"); 7271 /* Drop it. */ 7272 dofree = B_TRUE; 7273 } else { 7274 req->rq_ptr = mp; 7275 req->rq_ptr2 = ace; 7276 ibd_queue_work_slot(state, req, 7277 IBD_ASYNC_RC_TOO_BIG); 7278 dofree = B_FALSE; 7279 } 7280 } 7281 } else { 7282 ibd_print_warn(state, "Reliable Connected mode is on. " 7283 "Multicast packet length %d > %d is too long to " 7284 "send packet (%d > %d), drop it", 7285 pktsize, state->id_mtu); 7286 state->rc_xmt_drop_too_long_pkt++; 7287 /* Drop it. */ 7288 dofree = B_TRUE; 7289 } 7290 rc = B_TRUE; 7291 goto ibd_send_fail; 7292 } 7293 7294 atomic_add_64(&state->id_xmt_bytes, pktsize); 7295 atomic_inc_64(&state->id_xmt_pkt); 7296 7297 /* 7298 * Do LSO and checksum related work here. For LSO send, adjust the 7299 * ud destination, the opcode and the LSO header information to the 7300 * work request. 
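 * For example, when mac_lso_get() below reports HW_LSO with an mss of,
 * say, 1460 (hypothetical), the opcode is switched to IBT_WRC_SEND_LSO
 * and ibd_setup_lso() fills in the LSO header details; otherwise a
 * plain IBT_WRC_SEND is used and lsohdr_sz stays zero.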
7301 */ 7302 mac_lso_get(mp, &mss, &lsoflags); 7303 if ((lsoflags & HW_LSO) != HW_LSO) { 7304 node->w_swr.wr_opcode = IBT_WRC_SEND; 7305 lsohdr_sz = 0; 7306 } else { 7307 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 7308 /* 7309 * The routine can only fail if there's no memory; we 7310 * can only drop the packet if this happens 7311 */ 7312 ibd_print_warn(state, 7313 "ibd_send: no memory, lso posting failed"); 7314 dofree = B_TRUE; 7315 rc = B_TRUE; 7316 goto ibd_send_fail; 7317 } 7318 7319 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 7320 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 7321 } 7322 7323 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); 7324 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 7325 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 7326 else 7327 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 7328 7329 /* 7330 * Prepare the sgl for posting; the routine can only fail if there's 7331 * no lso buf available for posting. If this is the case, we should 7332 * probably resched for lso bufs to become available and then try again. 7333 */ 7334 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 7335 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 7336 dofree = B_TRUE; 7337 rc = B_TRUE; 7338 } else { 7339 dofree = B_FALSE; 7340 rc = B_FALSE; 7341 } 7342 goto ibd_send_fail; 7343 } 7344 node->swqe_im_mblk = mp; 7345 7346 /* 7347 * Queue the wqe to hardware; since we can now simply queue a 7348 * post instead of doing it serially, we cannot assume anything 7349 * about the 'node' after ibd_post_send() returns. 7350 */ 7351 node->swqe_next = NULL; 7352 7353 mutex_enter(&state->id_txpost_lock); 7354 if (state->id_tx_busy) { 7355 if (state->id_tx_head) { 7356 state->id_tx_tail->swqe_next = 7357 SWQE_TO_WQE(node); 7358 } else { 7359 state->id_tx_head = node; 7360 } 7361 state->id_tx_tail = node; 7362 mutex_exit(&state->id_txpost_lock); 7363 } else { 7364 state->id_tx_busy = 1; 7365 mutex_exit(&state->id_txpost_lock); 7366 ibd_post_send(state, node); 7367 } 7368 7369 return (B_TRUE); 7370 7371 ibd_send_fail: 7372 if (node && mp) 7373 ibd_free_lsohdr(node, mp); 7374 7375 if (dofree) 7376 freemsg(mp); 7377 7378 if (node != NULL) { 7379 if (rc_chan) { 7380 ibd_rc_tx_cleanup(node); 7381 } else { 7382 ibd_tx_cleanup(state, node); 7383 } 7384 } 7385 7386 return (rc); 7387 } 7388 7389 /* 7390 * GLDv3 entry point for transmitting datagram. 7391 */ 7392 static mblk_t * 7393 ibd_m_tx(void *arg, mblk_t *mp) 7394 { 7395 ibd_state_t *state = (ibd_state_t *)arg; 7396 mblk_t *next; 7397 7398 if (state->id_type == IBD_PORT_DRIVER) { 7399 freemsgchain(mp); 7400 return (NULL); 7401 } 7402 7403 if ((state->id_link_state != LINK_STATE_UP) || 7404 !(state->id_mac_state & IBD_DRV_STARTED)) { 7405 freemsgchain(mp); 7406 mp = NULL; 7407 } 7408 7409 while (mp != NULL) { 7410 next = mp->b_next; 7411 mp->b_next = NULL; 7412 if (ibd_send(state, mp) == B_FALSE) { 7413 /* Send fail */ 7414 mp->b_next = next; 7415 break; 7416 } 7417 mp = next; 7418 } 7419 7420 return (mp); 7421 } 7422 7423 /* 7424 * this handles Tx and Rx completions. With separate CQs, this handles 7425 * only Rx completions. 
7426 */ 7427 static uint_t 7428 ibd_intr(caddr_t arg) 7429 { 7430 ibd_state_t *state = (ibd_state_t *)arg; 7431 7432 ibd_poll_rcq(state, state->id_rcq_hdl); 7433 7434 return (DDI_INTR_CLAIMED); 7435 } 7436 7437 /* 7438 * Poll and fully drain the send cq 7439 */ 7440 static void 7441 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7442 { 7443 ibt_wc_t *wcs = state->id_txwcs; 7444 uint_t numwcs = state->id_txwcs_size; 7445 ibd_wqe_t *wqe; 7446 ibd_swqe_t *head, *tail; 7447 ibt_wc_t *wc; 7448 uint_t num_polled; 7449 int i; 7450 7451 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7452 head = tail = NULL; 7453 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7454 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 7455 if (wc->wc_status != IBT_WC_SUCCESS) { 7456 /* 7457 * Channel being torn down. 7458 */ 7459 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7460 DPRINT(5, "ibd_drain_scq: flush error"); 7461 DPRINT(10, "ibd_drain_scq: Bad " 7462 "status %d", wc->wc_status); 7463 } else { 7464 DPRINT(10, "ibd_drain_scq: " 7465 "unexpected wc_status %d", 7466 wc->wc_status); 7467 } 7468 /* 7469 * Fallthrough to invoke the Tx handler to 7470 * release held resources, e.g., AH refcount. 7471 */ 7472 } 7473 /* 7474 * Add this swqe to the list to be cleaned up. 7475 */ 7476 if (head) 7477 tail->swqe_next = wqe; 7478 else 7479 head = WQE_TO_SWQE(wqe); 7480 tail = WQE_TO_SWQE(wqe); 7481 } 7482 tail->swqe_next = NULL; 7483 ibd_tx_cleanup_list(state, head, tail); 7484 7485 /* 7486 * Resume any blocked transmissions if possible 7487 */ 7488 ibd_resume_transmission(state); 7489 } 7490 } 7491 7492 /* 7493 * Poll and fully drain the receive cq 7494 */ 7495 static void 7496 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7497 { 7498 ibt_wc_t *wcs = state->id_rxwcs; 7499 uint_t numwcs = state->id_rxwcs_size; 7500 ibd_rwqe_t *rwqe; 7501 ibt_wc_t *wc; 7502 uint_t num_polled; 7503 int i; 7504 mblk_t *head, *tail, *mp; 7505 7506 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7507 head = tail = NULL; 7508 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7509 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 7510 if (wc->wc_status != IBT_WC_SUCCESS) { 7511 /* 7512 * Channel being torn down. 7513 */ 7514 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7515 DPRINT(5, "ibd_drain_rcq: " 7516 "expected flushed rwqe"); 7517 } else { 7518 DPRINT(5, "ibd_drain_rcq: " 7519 "unexpected wc_status %d", 7520 wc->wc_status); 7521 } 7522 atomic_inc_32( 7523 &state->id_rx_list.dl_bufs_outstanding); 7524 freemsg(rwqe->rwqe_im_mblk); 7525 continue; 7526 } 7527 mp = ibd_process_rx(state, rwqe, wc); 7528 if (mp == NULL) 7529 continue; 7530 7531 /* 7532 * Add this mp to the list to send to the nw layer. 7533 */ 7534 if (head) 7535 tail->b_next = mp; 7536 else 7537 head = mp; 7538 tail = mp; 7539 } 7540 if (head) 7541 mac_rx(state->id_mh, state->id_rh, head); 7542 7543 /* 7544 * Account for #rwqes polled. 7545 * Post more here, if less than one fourth full. 7546 */ 7547 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 7548 (state->id_ud_num_rwqe / 4)) 7549 ibd_post_recv_intr(state); 7550 } 7551 } 7552 7553 /* 7554 * Common code for interrupt handling as well as for polling 7555 * for all completed wqe's while detaching. 
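 * Only one thread drains the send CQ at a time: a caller that finds
 * IBD_CQ_POLLING already set just records IBD_REDO_CQ_POLLING and
 * returns, while the active poller re-drains the CQ before it clears
 * IBD_CQ_POLLING.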
7556 */ 7557 static void 7558 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7559 { 7560 int flag, redo_flag; 7561 int redo = 1; 7562 7563 flag = IBD_CQ_POLLING; 7564 redo_flag = IBD_REDO_CQ_POLLING; 7565 7566 mutex_enter(&state->id_scq_poll_lock); 7567 if (state->id_scq_poll_busy & flag) { 7568 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 7569 state->id_scq_poll_busy |= redo_flag; 7570 mutex_exit(&state->id_scq_poll_lock); 7571 return; 7572 } 7573 state->id_scq_poll_busy |= flag; 7574 mutex_exit(&state->id_scq_poll_lock); 7575 7576 /* 7577 * In some cases (eg detaching), this code can be invoked on 7578 * any cpu after disabling cq notification (thus no concurrency 7579 * exists). Apart from that, the following applies normally: 7580 * Transmit completion handling could be from any cpu if 7581 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 7582 * is interrupt driven. 7583 */ 7584 7585 /* 7586 * Poll and drain the CQ 7587 */ 7588 ibd_drain_scq(state, cq_hdl); 7589 7590 /* 7591 * Enable CQ notifications and redrain the cq to catch any 7592 * completions we might have missed after the ibd_drain_scq() 7593 * above and before the ibt_enable_cq_notify() that follows. 7594 * Finally, service any new requests to poll the cq that 7595 * could've come in after the ibt_enable_cq_notify(). 7596 */ 7597 do { 7598 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 7599 IBT_SUCCESS) { 7600 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7601 } 7602 7603 ibd_drain_scq(state, cq_hdl); 7604 7605 mutex_enter(&state->id_scq_poll_lock); 7606 if (state->id_scq_poll_busy & redo_flag) 7607 state->id_scq_poll_busy &= ~redo_flag; 7608 else { 7609 state->id_scq_poll_busy &= ~flag; 7610 redo = 0; 7611 } 7612 mutex_exit(&state->id_scq_poll_lock); 7613 7614 } while (redo); 7615 } 7616 7617 /* 7618 * Common code for interrupt handling as well as for polling 7619 * for all completed wqe's while detaching. 7620 */ 7621 static void 7622 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 7623 { 7624 int flag, redo_flag; 7625 int redo = 1; 7626 7627 flag = IBD_CQ_POLLING; 7628 redo_flag = IBD_REDO_CQ_POLLING; 7629 7630 mutex_enter(&state->id_rcq_poll_lock); 7631 if (state->id_rcq_poll_busy & flag) { 7632 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 7633 state->id_rcq_poll_busy |= redo_flag; 7634 mutex_exit(&state->id_rcq_poll_lock); 7635 return; 7636 } 7637 state->id_rcq_poll_busy |= flag; 7638 mutex_exit(&state->id_rcq_poll_lock); 7639 7640 /* 7641 * Poll and drain the CQ 7642 */ 7643 ibd_drain_rcq(state, rcq); 7644 7645 /* 7646 * Enable CQ notifications and redrain the cq to catch any 7647 * completions we might have missed after the ibd_drain_cq() 7648 * above and before the ibt_enable_cq_notify() that follows. 7649 * Finally, service any new requests to poll the cq that 7650 * could've come in after the ibt_enable_cq_notify(). 7651 */ 7652 do { 7653 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 7654 IBT_SUCCESS) { 7655 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7656 } 7657 7658 ibd_drain_rcq(state, rcq); 7659 7660 mutex_enter(&state->id_rcq_poll_lock); 7661 if (state->id_rcq_poll_busy & redo_flag) 7662 state->id_rcq_poll_busy &= ~redo_flag; 7663 else { 7664 state->id_rcq_poll_busy &= ~flag; 7665 redo = 0; 7666 } 7667 mutex_exit(&state->id_rcq_poll_lock); 7668 7669 } while (redo); 7670 } 7671 7672 /* 7673 * Unmap the memory area associated with a given swqe. 
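 * This releases the ibt_map_mem_iov() mapping recorded in w_mi_hdl for
 * a dynamically mapped (IBD_WQE_MAPPED) send and clears wr_nds.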
7674 */ 7675 void 7676 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 7677 { 7678 ibt_status_t stat; 7679 7680 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 7681 7682 if (swqe->w_mi_hdl) { 7683 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 7684 swqe->w_mi_hdl)) != IBT_SUCCESS) { 7685 DPRINT(10, 7686 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 7687 } 7688 swqe->w_mi_hdl = NULL; 7689 } 7690 swqe->w_swr.wr_nds = 0; 7691 } 7692 7693 void 7694 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 7695 { 7696 /* 7697 * The recycling logic can be eliminated from here 7698 * and put into the async thread if we create another 7699 * list to hold ACE's for unjoined mcg's. 7700 */ 7701 if (DEC_REF_DO_CYCLE(ace)) { 7702 ibd_mce_t *mce; 7703 7704 /* 7705 * Check with the lock taken: we decremented 7706 * reference count without the lock, and some 7707 * transmitter might already have bumped the 7708 * reference count (possible in case of multicast 7709 * disable when we leave the AH on the active 7710 * list). If not still 0, get out, leaving the 7711 * recycle bit intact. 7712 * 7713 * Atomically transition the AH from active 7714 * to free list, and queue a work request to 7715 * leave the group and destroy the mce. No 7716 * transmitter can be looking at the AH or 7717 * the MCE in between, since we have the 7718 * ac_mutex lock. In the SendOnly reap case, 7719 * it is not necessary to hold the ac_mutex 7720 * and recheck the ref count (since the AH was 7721 * taken off the active list), we just do it 7722 * to have uniform processing with the Full 7723 * reap case. 7724 */ 7725 mutex_enter(&state->id_ac_mutex); 7726 mce = ace->ac_mce; 7727 if (GET_REF_CYCLE(ace) == 0) { 7728 CLEAR_REFCYCLE(ace); 7729 /* 7730 * Identify the case of fullmember reap as 7731 * opposed to mcg trap reap. Also, port up 7732 * might set ac_mce to NULL to indicate Tx 7733 * cleanup should do no more than put the 7734 * AH in the free list (see ibd_async_link). 7735 */ 7736 if (mce != NULL) { 7737 ace->ac_mce = NULL; 7738 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 7739 /* 7740 * mc_req was initialized at mce 7741 * creation time. 7742 */ 7743 ibd_queue_work_slot(state, 7744 &mce->mc_req, IBD_ASYNC_REAP); 7745 } 7746 IBD_ACACHE_INSERT_FREE(state, ace); 7747 } 7748 mutex_exit(&state->id_ac_mutex); 7749 } 7750 } 7751 7752 /* 7753 * Common code that deals with clean ups after a successful or 7754 * erroneous transmission attempt. 7755 */ 7756 static void 7757 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 7758 { 7759 ibd_ace_t *ace = swqe->w_ahandle; 7760 7761 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 7762 7763 /* 7764 * If this was a dynamic mapping in ibd_send(), we need to 7765 * unmap here. If this was an lso buffer we'd used for sending, 7766 * we need to release the lso buf to the pool, since the resource 7767 * is scarce. However, if this was simply a normal send using 7768 * the copybuf (present in each swqe), we don't need to release it. 7769 */ 7770 if (swqe->swqe_im_mblk != NULL) { 7771 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7772 ibd_unmap_mem(state, swqe); 7773 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7774 ibd_release_lsobufs(state, 7775 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7776 } 7777 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7778 freemsg(swqe->swqe_im_mblk); 7779 swqe->swqe_im_mblk = NULL; 7780 } 7781 7782 /* 7783 * Drop the reference count on the AH; it can be reused 7784 * now for a different destination if there are no more 7785 * posted sends that will use it. 
This can be eliminated 7786 * if we can always associate each Tx buffer with an AH. 7787 * The ace can be null if we are cleaning up from the 7788 * ibd_send() error path. 7789 */ 7790 if (ace != NULL) { 7791 ibd_dec_ref_ace(state, ace); 7792 } 7793 7794 /* 7795 * Release the send wqe for reuse. 7796 */ 7797 swqe->swqe_next = NULL; 7798 ibd_release_swqe(state, swqe, swqe, 1); 7799 } 7800 7801 static void 7802 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 7803 { 7804 ibd_ace_t *ace; 7805 ibd_swqe_t *swqe; 7806 int n = 0; 7807 7808 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 7809 7810 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 7811 7812 /* 7813 * If this was a dynamic mapping in ibd_send(), we need to 7814 * unmap here. If this was an lso buffer we'd used for sending, 7815 * we need to release the lso buf to the pool, since the 7816 * resource is scarce. However, if this was simply a normal 7817 * send using the copybuf (present in each swqe), we don't need 7818 * to release it. 7819 */ 7820 if (swqe->swqe_im_mblk != NULL) { 7821 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7822 ibd_unmap_mem(state, swqe); 7823 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7824 ibd_release_lsobufs(state, 7825 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7826 } 7827 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7828 freemsg(swqe->swqe_im_mblk); 7829 swqe->swqe_im_mblk = NULL; 7830 } 7831 7832 /* 7833 * Drop the reference count on the AH; it can be reused 7834 * now for a different destination if there are no more 7835 * posted sends that will use it. This can be eliminated 7836 * if we can always associate each Tx buffer with an AH. 7837 * The ace can be null if we are cleaning up from the 7838 * ibd_send() error path. 7839 */ 7840 ace = swqe->w_ahandle; 7841 if (ace != NULL) { 7842 ibd_dec_ref_ace(state, ace); 7843 } 7844 n++; 7845 } 7846 7847 /* 7848 * Release the send wqes for reuse. 7849 */ 7850 ibd_release_swqe(state, head, tail, n); 7851 } 7852 7853 /* 7854 * Processing to be done after receipt of a packet; hand off to GLD 7855 * in the format expected by GLD. The received packet has this 7856 * format: 2b sap :: 00 :: data. 7857 */ 7858 static mblk_t * 7859 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 7860 { 7861 ib_header_info_t *phdr; 7862 mblk_t *mp; 7863 ipoib_hdr_t *ipibp; 7864 ipha_t *iphap; 7865 ip6_t *ip6h; 7866 int len; 7867 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 7868 uint32_t bufs; 7869 7870 /* 7871 * Track number handed to upper layer that need to be returned. 7872 */ 7873 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 7874 7875 /* Never run out of rwqes, use allocb when running low */ 7876 if (bufs >= state->id_rx_bufs_outstanding_limit) { 7877 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 7878 atomic_inc_32(&state->id_rx_allocb); 7879 mp = allocb(pkt_len, BPRI_HI); 7880 if (mp) { 7881 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 7882 ibd_post_recv(state, rwqe); 7883 } else { /* no memory */ 7884 atomic_inc_32(&state->id_rx_allocb_failed); 7885 ibd_post_recv(state, rwqe); 7886 return (NULL); 7887 } 7888 } else { 7889 mp = rwqe->rwqe_im_mblk; 7890 } 7891 7892 7893 /* 7894 * Adjust write pointer depending on how much data came in. 7895 */ 7896 mp->b_wptr = mp->b_rptr + pkt_len; 7897 7898 /* 7899 * Make sure this is NULL or we're in trouble. 
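	 * ("this" being mp->b_next: the mblk for this rwqe must not still
	 * be linked into a chain when it is handed up.)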
7900 */ 7901 if (mp->b_next != NULL) { 7902 ibd_print_warn(state, 7903 "ibd_process_rx: got duplicate mp from rcq?"); 7904 mp->b_next = NULL; 7905 } 7906 7907 /* 7908 * the IB link will deliver one of the IB link layer 7909 * headers called, the Global Routing Header (GRH). 7910 * ibd driver uses the information in GRH to build the 7911 * Header_info structure and pass it with the datagram up 7912 * to GLDv3. 7913 * If the GRH is not valid, indicate to GLDv3 by setting 7914 * the VerTcFlow field to 0. 7915 */ 7916 phdr = (ib_header_info_t *)mp->b_rptr; 7917 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 7918 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 7919 7920 /* if it is loop back packet, just drop it. */ 7921 if (state->id_enable_rc) { 7922 if (bcmp(&phdr->ib_grh.ipoib_sqpn, 7923 &state->rc_macaddr_loopback, 7924 IPOIB_ADDRL) == 0) { 7925 freemsg(mp); 7926 return (NULL); 7927 } 7928 } else { 7929 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 7930 IPOIB_ADDRL) == 0) { 7931 freemsg(mp); 7932 return (NULL); 7933 } 7934 } 7935 7936 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 7937 sizeof (ipoib_mac_t)); 7938 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 7939 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 7940 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 7941 } else { 7942 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 7943 } 7944 } else { 7945 /* 7946 * It can not be a IBA multicast packet. Must have been 7947 * unicast for us. Just copy the interface address to dst. 7948 */ 7949 phdr->ib_grh.ipoib_vertcflow = 0; 7950 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 7951 sizeof (ipoib_mac_t)); 7952 } 7953 7954 /* 7955 * For ND6 packets, padding is at the front of the source/target 7956 * lladdr. However the inet6 layer is not aware of it, hence remove 7957 * the padding from such packets. 7958 */ 7959 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 7960 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 7961 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 7962 len = ntohs(ip6h->ip6_plen); 7963 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 7964 /* LINTED: E_CONSTANT_CONDITION */ 7965 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 7966 } 7967 } 7968 7969 /* 7970 * Update statistics 7971 */ 7972 atomic_add_64(&state->id_rcv_bytes, pkt_len); 7973 atomic_inc_64(&state->id_rcv_pkt); 7974 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 7975 atomic_inc_64(&state->id_brd_rcv); 7976 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 7977 atomic_inc_64(&state->id_multi_rcv); 7978 7979 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 7980 /* 7981 * Set receive checksum status in mp 7982 * Hardware checksumming can be considered valid only if: 7983 * 1. CQE.IP_OK bit is set 7984 * 2. CQE.CKSUM = 0xffff 7985 * 3. IPv6 routing header is not present in the packet 7986 * 4. If there are no IP_OPTIONS in the IP HEADER 7987 */ 7988 7989 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 7990 (wc->wc_cksum == 0xFFFF) && 7991 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 7992 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 7993 } 7994 7995 return (mp); 7996 } 7997 7998 /* 7999 * Callback code invoked from STREAMs when the receive data buffer is 8000 * free for recycling. 
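 * The rwqe is re-armed with a fresh desballoc()'ed mblk and posted back
 * to the receive queue, or simply freed if the driver is no longer
 * running or the desballoc() fails.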
8001 */ 8002 static void 8003 ibd_freemsg_cb(char *arg) 8004 { 8005 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 8006 ibd_state_t *state = rwqe->w_state; 8007 8008 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 8009 8010 /* 8011 * If the driver is stopped, just free the rwqe. 8012 */ 8013 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 8014 DPRINT(6, "ibd_freemsg: wqe being freed"); 8015 rwqe->rwqe_im_mblk = NULL; 8016 ibd_free_rwqe(state, rwqe); 8017 return; 8018 } 8019 8020 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 8021 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 8022 if (rwqe->rwqe_im_mblk == NULL) { 8023 ibd_free_rwqe(state, rwqe); 8024 DPRINT(6, "ibd_freemsg: desballoc failed"); 8025 return; 8026 } 8027 8028 ibd_post_recv(state, rwqe); 8029 } 8030 8031 static uint_t 8032 ibd_tx_recycle(caddr_t arg) 8033 { 8034 ibd_state_t *state = (ibd_state_t *)arg; 8035 8036 /* 8037 * Poll for completed entries 8038 */ 8039 ibd_poll_scq(state, state->id_scq_hdl); 8040 8041 return (DDI_INTR_CLAIMED); 8042 } 8043 8044 #ifdef IBD_LOGGING 8045 static void 8046 ibd_log_init(void) 8047 { 8048 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 8049 ibd_lbuf_ndx = 0; 8050 8051 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 8052 } 8053 8054 static void 8055 ibd_log_fini(void) 8056 { 8057 if (ibd_lbuf) 8058 kmem_free(ibd_lbuf, IBD_LOG_SZ); 8059 ibd_lbuf_ndx = 0; 8060 ibd_lbuf = NULL; 8061 8062 mutex_destroy(&ibd_lbuf_lock); 8063 } 8064 8065 static void 8066 ibd_log(const char *fmt, ...) 8067 { 8068 va_list ap; 8069 uint32_t off; 8070 uint32_t msglen; 8071 char tmpbuf[IBD_DMAX_LINE]; 8072 8073 if (ibd_lbuf == NULL) 8074 return; 8075 8076 va_start(ap, fmt); 8077 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 8078 va_end(ap); 8079 8080 if (msglen >= IBD_DMAX_LINE) 8081 msglen = IBD_DMAX_LINE - 1; 8082 8083 mutex_enter(&ibd_lbuf_lock); 8084 8085 off = ibd_lbuf_ndx; /* current msg should go here */ 8086 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 8087 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 8088 8089 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 8090 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 8091 8092 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 8093 ibd_lbuf_ndx = 0; 8094 8095 mutex_exit(&ibd_lbuf_lock); 8096 8097 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 8098 } 8099 #endif 8100 8101 /* ARGSUSED */ 8102 static int 8103 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8104 int *rvalp) 8105 { 8106 ibd_create_ioctl_t *cmd = karg; 8107 ibd_state_t *state, *port_state, *p; 8108 int i, err, rval = 0; 8109 mac_register_t *macp; 8110 ibt_hca_portinfo_t *pinfop = NULL; 8111 ibt_status_t ibt_status; 8112 uint_t psize, pinfosz; 8113 boolean_t force_create = B_FALSE; 8114 8115 cmd->ibdioc.ioc_status = 0; 8116 8117 if (cmd->ibdioc.ioc_port_inst < 0) { 8118 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8119 return (EINVAL); 8120 } 8121 port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst); 8122 if (port_state == NULL) { 8123 DPRINT(10, "ibd_create_partition: failed to get state %d", 8124 cmd->ibdioc.ioc_port_inst); 8125 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST; 8126 return (EINVAL); 8127 } 8128 8129 /* Limited PKeys not supported */ 8130 if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) { 8131 rval = EINVAL; 8132 goto part_create_return; 8133 } 8134 8135 if (cmd->ioc_force_create == 0) { 8136 /* 8137 * Check if the port pkey table contains the pkey for which 8138 * 
this partition is being created. 8139 */ 8140 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8141 port_state->id_port, &pinfop, &psize, &pinfosz); 8142 8143 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8144 rval = EINVAL; 8145 goto part_create_return; 8146 } 8147 8148 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) { 8149 rval = ENETDOWN; 8150 cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN; 8151 goto part_create_return; 8152 } 8153 8154 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) { 8155 if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) { 8156 break; 8157 } 8158 } 8159 if (i == pinfop->p_pkey_tbl_sz) { 8160 rval = EINVAL; 8161 cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT; 8162 goto part_create_return; 8163 } 8164 } else { 8165 force_create = B_TRUE; 8166 } 8167 8168 mutex_enter(&ibd_objlist_lock); 8169 for (p = ibd_objlist_head; p; p = p->id_next) { 8170 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) && 8171 (p->id_pkey == cmd->ioc_pkey)) { 8172 mutex_exit(&ibd_objlist_lock); 8173 rval = EEXIST; 8174 cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS; 8175 goto part_create_return; 8176 } 8177 } 8178 mutex_exit(&ibd_objlist_lock); 8179 8180 state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP); 8181 8182 state->id_type = IBD_PARTITION_OBJ; 8183 8184 state->id_plinkid = cmd->ioc_partid; 8185 state->id_dlinkid = cmd->ibdioc.ioc_linkid; 8186 state->id_port_inst = cmd->ibdioc.ioc_port_inst; 8187 8188 state->id_dip = port_state->id_dip; 8189 state->id_port = port_state->id_port; 8190 state->id_pkey = cmd->ioc_pkey; 8191 state->id_hca_guid = port_state->id_hca_guid; 8192 state->id_port_guid = port_state->id_port_guid; 8193 state->id_force_create = force_create; 8194 8195 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 8196 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 8197 8198 if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) { 8199 rval = EIO; 8200 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE; 8201 goto fail; 8202 } 8203 8204 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 8205 rval = EAGAIN; 8206 goto fail; 8207 } 8208 8209 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 8210 macp->m_dip = port_state->id_dip; 8211 macp->m_instance = (uint_t)-1; 8212 macp->m_driver = state; 8213 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 8214 macp->m_callbacks = &ibd_m_callbacks; 8215 macp->m_min_sdu = 0; 8216 if (state->id_enable_rc) { 8217 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 8218 } else { 8219 macp->m_max_sdu = IBD_DEF_MAX_SDU; 8220 } 8221 macp->m_priv_props = ibd_priv_props; 8222 8223 err = mac_register(macp, &state->id_mh); 8224 mac_free(macp); 8225 8226 if (err != 0) { 8227 DPRINT(10, "ibd_create_partition: mac_register() failed %d", 8228 err); 8229 rval = err; 8230 goto fail; 8231 } 8232 8233 err = dls_devnet_create(state->id_mh, 8234 cmd->ioc_partid, crgetzoneid(credp)); 8235 if (err != 0) { 8236 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed " 8237 "%d", err); 8238 rval = err; 8239 (void) mac_unregister(state->id_mh); 8240 goto fail; 8241 } 8242 8243 /* 8244 * Add the new partition state structure to the list 8245 */ 8246 mutex_enter(&ibd_objlist_lock); 8247 if (ibd_objlist_head) 8248 state->id_next = ibd_objlist_head; 8249 8250 ibd_objlist_head = state; 8251 mutex_exit(&ibd_objlist_lock); 8252 8253 part_create_return: 8254 if (pinfop) { 8255 ibt_free_portinfo(pinfop, pinfosz); 8256 } 8257 return (rval); 8258 8259 fail: 8260 if (pinfop) { 8261 ibt_free_portinfo(pinfop, pinfosz); 8262 } 8263 ibd_part_unattach(state); 8264 kmem_free(state, sizeof (ibd_state_t)); 
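	/*
	 * rval is the errno handed back to the dld ioctl path;
	 * ibdioc.ioc_status, where it was set above, presumably carries
	 * the more specific reason back to dladm.
	 */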
8265 return (rval); 8266 } 8267 8268 /* ARGSUSED */ 8269 static int 8270 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp, 8271 int *rvalp) 8272 { 8273 int err; 8274 datalink_id_t tmpid; 8275 ibd_state_t *node, *prev; 8276 ibd_delete_ioctl_t *cmd = karg; 8277 8278 prev = NULL; 8279 8280 mutex_enter(&ibd_objlist_lock); 8281 node = ibd_objlist_head; 8282 8283 /* Find the ibd state structure corresponding the partion */ 8284 while (node != NULL) { 8285 if (node->id_plinkid == cmd->ioc_partid) 8286 break; 8287 prev = node; 8288 node = node->id_next; 8289 } 8290 8291 if (node == NULL) { 8292 mutex_exit(&ibd_objlist_lock); 8293 return (ENOENT); 8294 } 8295 8296 if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) { 8297 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed " 8298 "%d", err); 8299 mutex_exit(&ibd_objlist_lock); 8300 return (err); 8301 } 8302 8303 /* 8304 * Call ibd_part_unattach() only after making sure that the instance has 8305 * not been started yet and is also not in late hca init mode. 8306 */ 8307 ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8308 8309 err = 0; 8310 if ((node->id_mac_state & IBD_DRV_STARTED) || 8311 (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) || 8312 (ibd_part_busy(node) != DDI_SUCCESS) || 8313 ((err = mac_disable(node->id_mh)) != 0)) { 8314 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid, 8315 crgetzoneid(credp)); 8316 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8317 mutex_exit(&ibd_objlist_lock); 8318 return (err != 0 ? err : EBUSY); 8319 } 8320 8321 node->id_mac_state |= IBD_DRV_IN_DELETION; 8322 8323 ibd_part_unattach(node); 8324 8325 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS); 8326 8327 /* Remove the partition state structure from the linked list */ 8328 if (prev == NULL) 8329 ibd_objlist_head = node->id_next; 8330 else 8331 prev->id_next = node->id_next; 8332 mutex_exit(&ibd_objlist_lock); 8333 8334 if ((err = mac_unregister(node->id_mh)) != 0) { 8335 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d", 8336 err); 8337 } 8338 8339 cv_destroy(&node->id_macst_cv); 8340 mutex_destroy(&node->id_macst_lock); 8341 8342 kmem_free(node, sizeof (ibd_state_t)); 8343 8344 return (0); 8345 } 8346 8347 /* ARGSUSED */ 8348 static int 8349 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred, 8350 int *rvalp) 8351 { 8352 ibd_ioctl_t cmd; 8353 ibpart_ioctl_t partioc; 8354 ibport_ioctl_t portioc; 8355 #ifdef _MULTI_DATAMODEL 8356 ibport_ioctl32_t portioc32; 8357 #endif 8358 ibd_state_t *state, *port_state; 8359 int size; 8360 ibt_hca_portinfo_t *pinfop = NULL; 8361 ibt_status_t ibt_status; 8362 uint_t psize, pinfosz; 8363 int rval = 0; 8364 8365 size = sizeof (ibd_ioctl_t); 8366 if (ddi_copyin((void *)arg, &cmd, size, mode)) { 8367 return (EFAULT); 8368 } 8369 cmd.ioc_status = 0; 8370 switch (cmd.ioc_info_cmd) { 8371 case IBD_INFO_CMD_IBPART: 8372 size = sizeof (ibpart_ioctl_t); 8373 if (ddi_copyin((void *)arg, &partioc, size, mode)) { 8374 return (EFAULT); 8375 } 8376 8377 mutex_enter(&ibd_objlist_lock); 8378 /* Find the ibd state structure corresponding the partition */ 8379 for (state = ibd_objlist_head; state; state = state->id_next) { 8380 if (state->id_plinkid == cmd.ioc_linkid) { 8381 break; 8382 } 8383 } 8384 8385 if (state == NULL) { 8386 mutex_exit(&ibd_objlist_lock); 8387 return (ENOENT); 8388 } 8389 8390 partioc.ibdioc.ioc_linkid = state->id_dlinkid; 8391 partioc.ibdioc.ioc_port_inst = state->id_port_inst; 8392 partioc.ibdioc.ioc_portnum = 
state->id_port; 8393 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid; 8394 partioc.ibdioc.ioc_portguid = state->id_port_guid; 8395 partioc.ibdioc.ioc_status = 0; 8396 partioc.ioc_partid = state->id_plinkid; 8397 partioc.ioc_pkey = state->id_pkey; 8398 partioc.ioc_force_create = state->id_force_create; 8399 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) { 8400 mutex_exit(&ibd_objlist_lock); 8401 return (EFAULT); 8402 } 8403 mutex_exit(&ibd_objlist_lock); 8404 8405 break; 8406 8407 case IBD_INFO_CMD_IBPORT: 8408 if ((cmd.ioc_port_inst < 0) || ((port_state = 8409 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8410 DPRINT(10, "ibd_create_partition: failed to get" 8411 " state %d", cmd.ioc_port_inst); 8412 size = sizeof (ibd_ioctl_t); 8413 cmd.ioc_status = IBD_INVALID_PORT_INST; 8414 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8415 mode)) { 8416 return (EFAULT); 8417 } 8418 return (EINVAL); 8419 } 8420 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8421 port_state->id_port, &pinfop, &psize, &pinfosz); 8422 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8423 return (EINVAL); 8424 } 8425 #ifdef _MULTI_DATAMODEL 8426 switch (ddi_model_convert_from(mode & FMODELS)) { 8427 case DDI_MODEL_ILP32: { 8428 size = sizeof (ibport_ioctl32_t); 8429 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8430 rval = EFAULT; 8431 goto fail; 8432 } 8433 portioc32.ibdioc.ioc_status = 0; 8434 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8435 portioc32.ibdioc.ioc_hcaguid = 8436 port_state->id_hca_guid; 8437 portioc32.ibdioc.ioc_portguid = 8438 port_state->id_port_guid; 8439 if (portioc32.ioc_pkey_tbl_sz != 8440 pinfop->p_pkey_tbl_sz) { 8441 rval = EINVAL; 8442 size = sizeof (ibd_ioctl_t); 8443 portioc32.ibdioc.ioc_status = 8444 IBD_INVALID_PKEY_TBL_SIZE; 8445 if (ddi_copyout((void *)&portioc32.ibdioc, 8446 (void *)arg, size, mode)) { 8447 rval = EFAULT; 8448 goto fail; 8449 } 8450 goto fail; 8451 } 8452 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8453 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8454 (void *)(uintptr_t)portioc32.ioc_pkeys, size, 8455 mode)) { 8456 rval = EFAULT; 8457 goto fail; 8458 } 8459 size = sizeof (ibport_ioctl32_t); 8460 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8461 mode)) { 8462 rval = EFAULT; 8463 goto fail; 8464 } 8465 break; 8466 } 8467 case DDI_MODEL_NONE: 8468 size = sizeof (ibport_ioctl_t); 8469 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8470 rval = EFAULT; 8471 goto fail; 8472 } 8473 portioc.ibdioc.ioc_status = 0; 8474 portioc.ibdioc.ioc_portnum = port_state->id_port; 8475 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8476 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8477 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8478 rval = EINVAL; 8479 size = sizeof (ibd_ioctl_t); 8480 portioc.ibdioc.ioc_status = 8481 IBD_INVALID_PKEY_TBL_SIZE; 8482 if (ddi_copyout((void *)&portioc.ibdioc, 8483 (void *)arg, size, mode)) { 8484 rval = EFAULT; 8485 goto fail; 8486 } 8487 goto fail; 8488 } 8489 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8490 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8491 (void *)(portioc.ioc_pkeys), size, mode)) { 8492 rval = EFAULT; 8493 goto fail; 8494 } 8495 size = sizeof (ibport_ioctl_t); 8496 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8497 mode)) { 8498 rval = EFAULT; 8499 goto fail; 8500 } 8501 break; 8502 } 8503 #else /* ! 
_MULTI_DATAMODEL */ 8504 size = sizeof (ibport_ioctl_t); 8505 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8506 rval = EFAULT; 8507 goto fail; 8508 } 8509 portioc.ibdioc.ioc_status = 0; 8510 portioc.ibdioc.ioc_portnum = port_state->id_port; 8511 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8512 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8513 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) { 8514 rval = EINVAL; 8515 size = sizeof (ibd_ioctl_t); 8516 portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE; 8517 if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg, 8518 size, mode)) { 8519 rval = EFAULT; 8520 goto fail; 8521 } 8522 goto fail; 8523 } 8524 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t); 8525 if (ddi_copyout((void *)pinfop->p_pkey_tbl, 8526 (void *)(portioc.ioc_pkeys), size, mode)) { 8527 rval = EFAULT; 8528 goto fail; 8529 } 8530 size = sizeof (ibport_ioctl_t); 8531 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8532 mode)) { 8533 rval = EFAULT; 8534 goto fail; 8535 } 8536 #endif /* _MULTI_DATAMODEL */ 8537 8538 break; 8539 8540 case IBD_INFO_CMD_PKEYTBLSZ: 8541 if ((cmd.ioc_port_inst < 0) || ((port_state = 8542 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) { 8543 DPRINT(10, "ibd_create_partition: failed to get" 8544 " state %d", cmd.ioc_port_inst); 8545 size = sizeof (ibd_ioctl_t); 8546 cmd.ioc_status = IBD_INVALID_PORT_INST; 8547 if (ddi_copyout((void *)&cmd, (void *)arg, size, 8548 mode)) { 8549 return (EFAULT); 8550 } 8551 return (EINVAL); 8552 } 8553 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl, 8554 port_state->id_port, &pinfop, &psize, &pinfosz); 8555 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 8556 return (EINVAL); 8557 } 8558 #ifdef _MULTI_DATAMODEL 8559 switch (ddi_model_convert_from(mode & FMODELS)) { 8560 case DDI_MODEL_ILP32: { 8561 size = sizeof (ibport_ioctl32_t); 8562 if (ddi_copyin((void *)arg, &portioc32, size, mode)) { 8563 rval = EFAULT; 8564 goto fail; 8565 } 8566 portioc32.ibdioc.ioc_status = 0; 8567 portioc32.ibdioc.ioc_portnum = port_state->id_port; 8568 portioc32.ibdioc.ioc_hcaguid = 8569 port_state->id_hca_guid; 8570 portioc32.ibdioc.ioc_portguid = 8571 port_state->id_port_guid; 8572 portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8573 if (ddi_copyout((void *)&portioc32, (void *)arg, size, 8574 mode)) { 8575 rval = EFAULT; 8576 goto fail; 8577 } 8578 break; 8579 } 8580 case DDI_MODEL_NONE: 8581 size = sizeof (ibport_ioctl_t); 8582 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8583 rval = EFAULT; 8584 goto fail; 8585 } 8586 portioc.ibdioc.ioc_status = 0; 8587 portioc.ibdioc.ioc_portnum = port_state->id_port; 8588 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8589 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8590 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8591 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8592 mode)) { 8593 rval = EFAULT; 8594 goto fail; 8595 } 8596 break; 8597 } 8598 #else /* ! 
_MULTI_DATAMODEL */ 8599 size = sizeof (ibport_ioctl_t); 8600 if (ddi_copyin((void *)arg, &portioc, size, mode)) { 8601 rval = EFAULT; 8602 goto fail; 8603 } 8604 portioc.ibdioc.ioc_status = 0; 8605 portioc.ibdioc.ioc_portnum = port_state->id_port; 8606 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid; 8607 portioc.ibdioc.ioc_portguid = port_state->id_port_guid; 8608 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz; 8609 if (ddi_copyout((void *)&portioc, (void *)arg, size, 8610 mode)) { 8611 rval = EFAULT; 8612 goto fail; 8613 } 8614 #endif /* _MULTI_DATAMODEL */ 8615 break; 8616 8617 default: 8618 return (EINVAL); 8619 8620 } /* switch (cmd.ioc_info_cmd) */ 8621 fail: 8622 if (pinfop) { 8623 ibt_free_portinfo(pinfop, pinfosz); 8624 } 8625 return (rval); 8626 } 8627 8628 /* ARGSUSED */ 8629 static void 8630 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl, 8631 ibt_async_code_t code, ibt_async_event_t *event) 8632 { 8633 ibd_state_t *state = (ibd_state_t *)arg; 8634 link_state_t lstate; 8635 8636 switch (code) { 8637 case IBT_EVENT_PORT_UP: 8638 case IBT_ERROR_PORT_DOWN: 8639 if (ibd_get_port_state(state, &lstate) != 0) 8640 break; 8641 8642 if (state->id_link_state != lstate) { 8643 state->id_link_state = lstate; 8644 mac_link_update(state->id_mh, lstate); 8645 } 8646 break; 8647 default: 8648 break; 8649 } 8650 } 8651 8652 static int 8653 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate) 8654 { 8655 ibt_hca_portinfo_t *port_infop; 8656 uint_t psize, port_infosz; 8657 ibt_status_t ret; 8658 8659 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 8660 &port_infop, &psize, &port_infosz); 8661 if ((ret != IBT_SUCCESS) || (psize != 1)) 8662 return (-1); 8663 8664 state->id_sgid = *port_infop->p_sgid_tbl; 8665 state->id_link_speed = ibd_get_portspeed(state); 8666 8667 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) 8668 *lstate = LINK_STATE_UP; 8669 else 8670 *lstate = LINK_STATE_DOWN; 8671 8672 ibt_free_portinfo(port_infop, port_infosz); 8673 return (0); 8674 } 8675 8676 static int 8677 ibd_port_attach(dev_info_t *dip) 8678 { 8679 ibd_state_t *state; 8680 link_state_t lstate; 8681 int instance; 8682 ibt_status_t ret; 8683 8684 /* 8685 * Allocate softstate structure 8686 */ 8687 instance = ddi_get_instance(dip); 8688 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) { 8689 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed"); 8690 return (DDI_FAILURE); 8691 } 8692 8693 state = ddi_get_soft_state(ibd_list, instance); 8694 8695 state->id_dip = dip; 8696 state->id_type = IBD_PORT_DRIVER; 8697 8698 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 8699 "port-number", 0)) == 0) { 8700 DPRINT(10, "ibd_port_attach: invalid port number (%d)", 8701 state->id_port); 8702 return (DDI_FAILURE); 8703 } 8704 if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8705 "hca-guid", 0)) == 0) { 8706 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)", 8707 state->id_hca_guid); 8708 return (DDI_FAILURE); 8709 } 8710 if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 8711 "port-guid", 0)) == 0) { 8712 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)", 8713 state->id_port_guid); 8714 return (DDI_FAILURE); 8715 } 8716 8717 /* 8718 * Attach to IBTL 8719 */ 8720 if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state, 8721 &state->id_ibt_hdl)) != IBT_SUCCESS) { 8722 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d", 8723 ret); 8724 goto done; 8725 } 8726 8727 state->id_mac_state |= 
IBD_DRV_IBTL_ATTACH_DONE;
8728 
8729 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8730 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
8731 		DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8732 		    ret);
8733 		goto done;
8734 	}
8735 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
8736 
8737 	/* Update link status */
8738 
8739 	if (ibd_get_port_state(state, &lstate) != 0) {
8740 		DPRINT(10, "ibd_port_attach: "
8741 		    "ibd_get_port_state() failed");
8742 		goto done;
8743 	}
8744 	state->id_link_state = lstate;
8745 	/*
8746 	 * Register ibd interfaces with the Nemo framework
8747 	 */
8748 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8749 		DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8750 		goto done;
8751 	}
8752 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8753 
8754 	mac_link_update(state->id_mh, lstate);
8755 
8756 	return (DDI_SUCCESS);
8757 done:
8758 	(void) ibd_port_unattach(state, dip);
8759 	return (DDI_FAILURE);
8760 }
8761 
8762 static int
8763 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8764 {
8765 	int instance;
8766 	uint32_t progress = state->id_mac_state;
8767 	ibt_status_t ret;
8768 
8769 	if (progress & IBD_DRV_MAC_REGISTERED) {
8770 		(void) mac_unregister(state->id_mh);
8771 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8772 	}
8773 
8774 	if (progress & IBD_DRV_HCA_OPENED) {
8775 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8776 		    IBT_SUCCESS) {
8777 			ibd_print_warn(state, "failed to close "
8778 			    "HCA device, ret=%d", ret);
8779 		}
8780 		state->id_hca_hdl = NULL;
8781 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8782 	}
8783 
8784 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8785 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8786 			ibd_print_warn(state,
8787 			    "ibt_detach() failed, ret=%d", ret);
8788 		}
8789 		state->id_ibt_hdl = NULL;
8790 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8791 	}
8792 	instance = ddi_get_instance(dip);
8793 	ddi_soft_state_free(ibd_list, instance);
8794 
8795 	return (DDI_SUCCESS);
8796 }
8797 
8798 ibt_status_t
8799 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8800 {
8801 	ibd_state_t *state;
8802 
8803 	mutex_enter(&ibd_objlist_lock);
8804 
8805 	/* Find the ibd state structure corresponding to the partition */
8806 	for (state = ibd_objlist_head; state; state = state->id_next) {
8807 		if (state->id_plinkid == linkid) {
8808 			break;
8809 		}
8810 	}
8811 
8812 	if (state == NULL) {
8813 		mutex_exit(&ibd_objlist_lock);
8814 		return (IBT_NO_SUCH_OBJECT);
8815 	}
8816 
8817 	attr->pa_dlinkid = state->id_dlinkid;
8818 	attr->pa_plinkid = state->id_plinkid;
8819 	attr->pa_port = state->id_port;
8820 	attr->pa_hca_guid = state->id_hca_guid;
8821 	attr->pa_port_guid = state->id_port_guid;
8822 	attr->pa_pkey = state->id_pkey;
8823 
8824 	mutex_exit(&ibd_objlist_lock);
8825 
8826 	return (IBT_SUCCESS);
8827 }
8828 
8829 ibt_status_t
8830 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8831 {
8832 	ibd_state_t *state;
8833 	int n = 0;
8834 	ibt_part_attr_t *attr;
8835 
8836 	mutex_enter(&ibd_objlist_lock);
8837 
8838 	for (state = ibd_objlist_head; state; state = state->id_next)
8839 		n++;
8840 
8841 	*nparts = n;
8842 	if (n == 0) {
8843 		*attr_list = NULL;
8844 		mutex_exit(&ibd_objlist_lock);
8845 		return (IBT_SUCCESS);
8846 	}
8847 
8848 	*attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8849 	attr = *attr_list;
8850 	for (state = ibd_objlist_head; state; state = state->id_next) {
8851 #ifdef DEBUG
8852 		ASSERT(n > 0);
8853 		n--;
8854 #endif
8855 		attr->pa_dlinkid = state->id_dlinkid;
8856 		attr->pa_plinkid = state->id_plinkid;
8857 attr->pa_port = state->id_port; 8858 attr->pa_hca_guid = state->id_hca_guid; 8859 attr->pa_port_guid = state->id_port_guid; 8860 attr->pa_pkey = state->id_pkey; 8861 attr++; 8862 } 8863 8864 mutex_exit(&ibd_objlist_lock); 8865 return (IBT_SUCCESS); 8866 } 8867