/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IP */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip_if.h> /* for IP6_DL_SAP */ 54 #include <inet/ip6.h> /* for ip6_t */ 55 #include <inet/tcp.h> /* for tcph_t */ 56 #include <netinet/icmp6.h> /* for icmp6_t */ 57 #include <sys/callb.h> 58 #include <sys/modhash.h> 59 60 #include <sys/ib/clients/ibd/ibd.h> 61 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 62 #include <sys/note.h> 63 #include <sys/multidata.h> 64 65 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 66 67 /* 68 * Per-interface tunables 69 * 70 * ibd_tx_copy_thresh 71 * This sets the threshold at which ibd will attempt to do a bcopy of the 72 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 73 * is restricted by various parameters, so setting of this value must be 74 * made after careful considerations only. For instance, IB HCAs currently 75 * impose a relatively small limit (when compared to ethernet NICs) on the 76 * length of the SGL for transmit. On the other hand, the ip stack could 77 * send down mp chains that are quite long when LSO is enabled. 78 * 79 * ibd_num_swqe 80 * Number of "send WQE" elements that will be allocated and used by ibd. 
 *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
 *     buffer in each of these send wqes must be taken into account. This
 *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
 *     currently set to the same value of ibd_tx_copy_thresh, but may be
 *     changed independently if needed).
 *
 * ibd_num_rwqe
 *     Number of "receive WQE" elements that will be allocated and used by
 *     ibd. This parameter is limited by the maximum channel size of the HCA.
 *     Each buffer in the receive wqe will be of MTU size.
 *
 * ibd_num_lso_bufs
 *     Number of "larger-than-MTU" copy buffers to use for cases when the
 *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
 *     and too large to be used with regular MTU-sized copy buffers. It is
 *     not recommended to tune this variable without understanding the
 *     application environment and/or memory resources. The size of each of
 *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
 *
 * ibd_num_ah
 *     Number of AH cache entries to allocate
 *
 * ibd_hash_size
 *     Hash table size for the active AH list
 *
 * ibd_separate_cqs
 * ibd_txcomp_poll
 *     These boolean variables (1 or 0) may be used to tune the behavior of
 *     ibd in managing the send and receive completion queues and in deciding
 *     whether or not transmit completions should be polled or interrupt
 *     driven (when the completion queues are separate). If both the completion
 *     queues are interrupt driven, it may not be possible for the handlers to
 *     be invoked concurrently, depending on how the interrupts are tied on
 *     the PCI intr line.  Note that some combination of these two parameters
 *     may not be meaningful (and therefore not allowed).
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *     The softintr mechanism allows ibd to avoid event queue overflows if
 *     the receive/completion handlers are to be expensive.
These are enabled 121 * by default. 122 * 123 * ibd_log_sz 124 * This specifies the size of the ibd log buffer in bytes. The buffer is 125 * allocated and logging is enabled only when IBD_LOGGING is defined. 126 * 127 */ 128 uint_t ibd_tx_copy_thresh = 0x1000; 129 uint_t ibd_num_swqe = 4000; 130 uint_t ibd_num_rwqe = 4000; 131 uint_t ibd_num_lso_bufs = 0x400; 132 uint_t ibd_num_ah = 64; 133 uint_t ibd_hash_size = 32; 134 uint_t ibd_separate_cqs = 1; 135 uint_t ibd_txcomp_poll = 0; 136 uint_t ibd_rx_softintr = 1; 137 uint_t ibd_tx_softintr = 1; 138 uint_t ibd_create_broadcast_group = 1; 139 uint_t ibd_force_lso_disable = 1; 140 #ifdef IBD_LOGGING 141 uint_t ibd_log_sz = 0x20000; 142 #endif 143 144 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 145 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 146 #define IBD_NUM_SWQE ibd_num_swqe 147 #define IBD_NUM_RWQE ibd_num_rwqe 148 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 149 #define IBD_NUM_AH ibd_num_ah 150 #define IBD_HASH_SIZE ibd_hash_size 151 #ifdef IBD_LOGGING 152 #define IBD_LOG_SZ ibd_log_sz 153 #endif 154 155 /* 156 * Receive CQ moderation parameters: NOT tunables 157 */ 158 static uint_t ibd_rxcomp_count = 4; 159 static uint_t ibd_rxcomp_usec = 10; 160 161 /* 162 * Send CQ moderation parameters: NOT tunables 163 */ 164 #define IBD_TXCOMP_COUNT 10 165 #define IBD_TXCOMP_USEC 300 166 167 /* 168 * Thresholds 169 * 170 * When waiting for resources (swqes or lso buffers) to become available, 171 * the first two thresholds below determine how long to wait before informing 172 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 173 * determines how low the available swqes should go before we start polling 174 * the completion queue. 175 */ 176 #define IBD_FREE_LSOS_THRESH 8 177 #define IBD_FREE_SWQES_THRESH 20 178 #define IBD_TX_POLL_THRESH 80 179 180 /* 181 * When doing multiple-send-wr or multiple-recv-wr posts, this value 182 * determines how many to do at a time (in a single ibt_post_send/recv). 
183 */ 184 #define IBD_MAX_POST_MULTIPLE 4 185 186 /* 187 * Maximum length for returning chained mps back to crossbow 188 */ 189 #define IBD_MAX_RX_MP_LEN 16 190 191 /* 192 * LSO parameters 193 */ 194 #define IBD_LSO_MAXLEN 65536 195 #define IBD_LSO_BUFSZ 8192 196 #define IBD_PROP_LSO_POLICY "lso-policy" 197 198 /* 199 * Completion queue polling control 200 */ 201 #define IBD_RX_CQ_POLLING 0x1 202 #define IBD_TX_CQ_POLLING 0x2 203 #define IBD_REDO_RX_CQ_POLLING 0x4 204 #define IBD_REDO_TX_CQ_POLLING 0x8 205 206 /* 207 * Flag bits for resources to reap 208 */ 209 #define IBD_RSRC_SWQE 0x1 210 #define IBD_RSRC_LSOBUF 0x2 211 212 /* 213 * Async operation types 214 */ 215 #define IBD_ASYNC_GETAH 1 216 #define IBD_ASYNC_JOIN 2 217 #define IBD_ASYNC_LEAVE 3 218 #define IBD_ASYNC_PROMON 4 219 #define IBD_ASYNC_PROMOFF 5 220 #define IBD_ASYNC_REAP 6 221 #define IBD_ASYNC_TRAP 7 222 #define IBD_ASYNC_SCHED 8 223 #define IBD_ASYNC_LINK 9 224 #define IBD_ASYNC_EXIT 10 225 226 /* 227 * Async operation states 228 */ 229 #define IBD_OP_NOTSTARTED 0 230 #define IBD_OP_ONGOING 1 231 #define IBD_OP_COMPLETED 2 232 #define IBD_OP_ERRORED 3 233 #define IBD_OP_ROUTERED 4 234 235 /* 236 * State of IBD driver initialization during attach/m_start 237 */ 238 #define IBD_DRV_STATE_INITIALIZED 0x00001 239 #define IBD_DRV_RXINTR_ADDED 0x00002 240 #define IBD_DRV_TXINTR_ADDED 0x00004 241 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 242 #define IBD_DRV_HCA_OPENED 0x00010 243 #define IBD_DRV_PD_ALLOCD 0x00020 244 #define IBD_DRV_MAC_REGISTERED 0x00040 245 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 246 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 247 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 248 #define IBD_DRV_CQS_ALLOCD 0x00400 249 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 250 #define IBD_DRV_TXLIST_ALLOCD 0x01000 251 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 252 #define IBD_DRV_RXLIST_ALLOCD 0x04000 253 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 254 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 255 
#define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 256 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 257 #define IBD_DRV_STARTED 0x80000 258 259 /* 260 * Start/stop in-progress flags; note that restart must always remain 261 * the OR of start and stop flag values. 262 */ 263 #define IBD_DRV_START_IN_PROGRESS 0x10000000 264 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 265 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 266 267 /* 268 * Miscellaneous constants 269 */ 270 #define IBD_SEND 0 271 #define IBD_RECV 1 272 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 273 #define IBD_DEF_MAX_SDU 2044 274 #define IBD_DEFAULT_QKEY 0xB1B 275 #ifdef IBD_LOGGING 276 #define IBD_DMAX_LINE 100 277 #endif 278 279 /* 280 * Enumerations for link states 281 */ 282 typedef enum { 283 IBD_LINK_DOWN, 284 IBD_LINK_UP, 285 IBD_LINK_UP_ABSENT 286 } ibd_link_op_t; 287 288 /* 289 * Driver State Pointer 290 */ 291 void *ibd_list; 292 293 /* 294 * Logging 295 */ 296 #ifdef IBD_LOGGING 297 kmutex_t ibd_lbuf_lock; 298 uint8_t *ibd_lbuf; 299 uint32_t ibd_lbuf_ndx; 300 #endif 301 302 /* 303 * Required system entry points 304 */ 305 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 306 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 307 308 /* 309 * Required driver entry points for GLDv3 310 */ 311 static int ibd_m_stat(void *, uint_t, uint64_t *); 312 static int ibd_m_start(void *); 313 static void ibd_m_stop(void *); 314 static int ibd_m_promisc(void *, boolean_t); 315 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 316 static int ibd_m_unicst(void *, const uint8_t *); 317 static mblk_t *ibd_m_tx(void *, mblk_t *); 318 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 319 320 /* 321 * Private driver entry points for GLDv3 322 */ 323 324 /* 325 * Initialization 326 */ 327 static int ibd_state_init(ibd_state_t *, dev_info_t *); 328 static int ibd_init_txlist(ibd_state_t *); 329 static int ibd_init_rxlist(ibd_state_t *); 330 static int 
ibd_acache_init(ibd_state_t *); 331 #ifdef IBD_LOGGING 332 static void ibd_log_init(void); 333 #endif 334 335 /* 336 * Termination/cleanup 337 */ 338 static void ibd_state_fini(ibd_state_t *); 339 static void ibd_fini_txlist(ibd_state_t *); 340 static void ibd_fini_rxlist(ibd_state_t *); 341 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 342 static void ibd_acache_fini(ibd_state_t *); 343 #ifdef IBD_LOGGING 344 static void ibd_log_fini(void); 345 #endif 346 347 /* 348 * Allocation/acquire/map routines 349 */ 350 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); 351 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 352 static int ibd_alloc_tx_copybufs(ibd_state_t *); 353 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 354 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); 355 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 356 uint32_t *); 357 358 /* 359 * Free/release/unmap routines 360 */ 361 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 362 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 363 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); 364 static void ibd_free_tx_copybufs(ibd_state_t *); 365 static void ibd_free_tx_lsobufs(ibd_state_t *); 366 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); 367 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 368 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 369 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 370 371 /* 372 * Handlers/callback routines 373 */ 374 static uint_t ibd_intr(char *); 375 static uint_t ibd_tx_recycle(char *); 376 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 377 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 378 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 379 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); 380 static void ibd_freemsg_cb(char *); 381 static void ibd_async_handler(void *, ibt_hca_hdl_t, 
ibt_async_code_t, 382 ibt_async_event_t *); 383 static void ibd_snet_notices_handler(void *, ib_gid_t, 384 ibt_subnet_event_code_t, ibt_subnet_event_t *); 385 386 /* 387 * Send/receive routines 388 */ 389 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 390 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 391 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); 392 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 393 static void ibd_flush_rx(ibd_state_t *, mblk_t *); 394 395 /* 396 * Threads 397 */ 398 static void ibd_async_work(ibd_state_t *); 399 400 /* 401 * Async tasks 402 */ 403 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 404 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 405 static void ibd_async_setprom(ibd_state_t *); 406 static void ibd_async_unsetprom(ibd_state_t *); 407 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 408 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 409 static void ibd_async_txsched(ibd_state_t *); 410 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 411 412 /* 413 * Async task helpers 414 */ 415 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 416 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 417 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 418 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 419 ipoib_mac_t *, ipoib_mac_t *); 420 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 421 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 422 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 423 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 424 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 425 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 426 static uint64_t ibd_get_portspeed(ibd_state_t *); 427 static boolean_t ibd_async_safe(ibd_state_t *); 428 static void 
ibd_async_done(ibd_state_t *); 429 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 430 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 431 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 432 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 433 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 434 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 435 436 /* 437 * Helpers for attach/start routines 438 */ 439 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 440 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 441 static int ibd_unattach(ibd_state_t *, dev_info_t *); 442 static int ibd_get_port_details(ibd_state_t *); 443 static int ibd_alloc_cqs(ibd_state_t *); 444 static int ibd_setup_ud_channel(ibd_state_t *); 445 static int ibd_start(ibd_state_t *); 446 static int ibd_undo_start(ibd_state_t *, link_state_t); 447 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 448 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 449 450 451 /* 452 * Miscellaneous helpers 453 */ 454 static int ibd_sched_poll(ibd_state_t *, int, int); 455 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 456 static int ibd_resume_transmission(ibd_state_t *); 457 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 458 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 459 static void *list_get_head(list_t *); 460 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 461 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 462 static void ibd_print_warn(ibd_state_t *, char *, ...); 463 #ifdef IBD_LOGGING 464 static void ibd_log(const char *, ...); 465 #endif 466 467 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 468 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 469 470 /* Module Driver Info */ 471 static struct modldrv ibd_modldrv 
= { 472 &mod_driverops, /* This one is a driver */ 473 "InfiniBand GLDv3 Driver", /* short description */ 474 &ibd_dev_ops /* driver specific ops */ 475 }; 476 477 /* Module Linkage */ 478 static struct modlinkage ibd_modlinkage = { 479 MODREV_1, (void *)&ibd_modldrv, NULL 480 }; 481 482 /* 483 * Module (static) info passed to IBTL during ibt_attach 484 */ 485 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 486 IBTI_V_CURR, 487 IBT_NETWORK, 488 ibd_async_handler, 489 NULL, 490 "IPIB" 491 }; 492 493 /* 494 * GLDv3 entry points 495 */ 496 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 497 static mac_callbacks_t ibd_m_callbacks = { 498 IBD_M_CALLBACK_FLAGS, 499 ibd_m_stat, 500 ibd_m_start, 501 ibd_m_stop, 502 ibd_m_promisc, 503 ibd_m_multicst, 504 ibd_m_unicst, 505 ibd_m_tx, 506 NULL, 507 ibd_m_getcapab 508 }; 509 510 /* 511 * Fill/clear <scope> and <p_key> in multicast/broadcast address 512 */ 513 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 514 { \ 515 *(uint32_t *)((char *)(maddr) + 4) |= \ 516 htonl((uint32_t)(scope) << 16); \ 517 *(uint32_t *)((char *)(maddr) + 8) |= \ 518 htonl((uint32_t)(pkey) << 16); \ 519 } 520 521 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 522 { \ 523 *(uint32_t *)((char *)(maddr) + 4) &= \ 524 htonl(~((uint32_t)0xF << 16)); \ 525 *(uint32_t *)((char *)(maddr) + 8) &= \ 526 htonl(~((uint32_t)0xFFFF << 16)); \ 527 } 528 529 /* 530 * Rudimentary debugging support 531 */ 532 #ifdef DEBUG 533 int ibd_debuglevel = 100; 534 static void 535 debug_print(int l, char *fmt, ...) 536 { 537 va_list ap; 538 539 if (l < ibd_debuglevel) 540 return; 541 va_start(ap, fmt); 542 vcmn_err(CE_CONT, fmt, ap); 543 va_end(ap); 544 } 545 #define DPRINT debug_print 546 #else 547 #define DPRINT 548 #endif 549 550 /* 551 * Common routine to print warning messages; adds in hca guid, port number 552 * and pkey to be able to identify the IBA interface. 553 */ 554 static void 555 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 
556 { 557 ib_guid_t hca_guid; 558 char ibd_print_buf[256]; 559 int len; 560 va_list ap; 561 562 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 563 0, "hca-guid", 0); 564 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 565 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 566 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 567 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 568 va_start(ap, fmt); 569 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 570 fmt, ap); 571 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 572 va_end(ap); 573 } 574 575 /* 576 * Warlock directives 577 */ 578 579 /* 580 * id_lso_lock 581 * 582 * state->id_lso->bkt_nfree may be accessed without a lock to 583 * determine the threshold at which we have to ask the nw layer 584 * to resume transmission (see ibd_resume_transmission()). 585 */ 586 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 587 ibd_state_t::id_lso)) 588 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 589 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 590 591 /* 592 * id_cq_poll_lock 593 */ 594 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, 595 ibd_state_t::id_cq_poll_busy)) 596 597 /* 598 * id_txpost_lock 599 */ 600 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 601 ibd_state_t::id_tx_head)) 602 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 603 ibd_state_t::id_tx_busy)) 604 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 605 ibd_state_t::id_tx_tailp)) 606 607 /* 608 * id_rxpost_lock 609 */ 610 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 611 ibd_state_t::id_rx_head)) 612 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 613 ibd_state_t::id_rx_busy)) 614 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 615 ibd_state_t::id_rx_tailp)) 616 617 /* 618 * id_acache_req_lock 619 */ 620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 621 ibd_state_t::id_acache_req_cv)) 622 
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 623 ibd_state_t::id_req_list)) 624 625 /* 626 * id_ac_mutex 627 * 628 * This mutex is actually supposed to protect id_ah_op as well, 629 * but this path of the code isn't clean (see update of id_ah_op 630 * in ibd_async_acache(), immediately after the call to 631 * ibd_async_mcache()). For now, we'll skip this check by 632 * declaring that id_ah_op is protected by some internal scheme 633 * that warlock isn't aware of. 634 */ 635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 636 ibd_state_t::id_ah_active)) 637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 638 ibd_state_t::id_ah_free)) 639 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 640 ibd_state_t::id_ah_addr)) 641 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 642 ibd_state_t::id_ah_op)) 643 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 644 ibd_state_t::id_ah_error)) 645 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 646 647 /* 648 * id_mc_mutex 649 */ 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 651 ibd_state_t::id_mc_full)) 652 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 653 ibd_state_t::id_mc_non)) 654 655 /* 656 * id_trap_lock 657 */ 658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 659 ibd_state_t::id_trap_cv)) 660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 661 ibd_state_t::id_trap_stop)) 662 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 663 ibd_state_t::id_trap_inprog)) 664 665 /* 666 * id_prom_op 667 */ 668 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 669 ibd_state_t::id_prom_op)) 670 671 /* 672 * id_sched_lock 673 */ 674 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 675 ibd_state_t::id_sched_needed)) 676 677 /* 678 * id_link_mutex 679 */ 680 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 681 ibd_state_t::id_link_state)) 682 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 683 _NOTE(SCHEME_PROTECTS_DATA("only async thr and 
ibd_m_start", 684 ibd_state_t::id_link_speed)) 685 686 /* 687 * id_tx_list.dl_mutex 688 */ 689 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 690 ibd_state_t::id_tx_list.dl_head)) 691 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 692 ibd_state_t::id_tx_list.dl_tail)) 693 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 694 ibd_state_t::id_tx_list.dl_pending_sends)) 695 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 696 ibd_state_t::id_tx_list.dl_cnt)) 697 698 /* 699 * id_rx_list.dl_mutex 700 */ 701 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 702 ibd_state_t::id_rx_list.dl_head)) 703 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 704 ibd_state_t::id_rx_list.dl_tail)) 705 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 706 ibd_state_t::id_rx_list.dl_bufs_outstanding)) 707 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 708 ibd_state_t::id_rx_list.dl_cnt)) 709 710 711 /* 712 * Items protected by atomic updates 713 */ 714 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 715 ibd_state_s::id_brd_rcv 716 ibd_state_s::id_brd_xmt 717 ibd_state_s::id_multi_rcv 718 ibd_state_s::id_multi_xmt 719 ibd_state_s::id_num_intrs 720 ibd_state_s::id_rcv_bytes 721 ibd_state_s::id_rcv_pkt 722 ibd_state_s::id_tx_short 723 ibd_state_s::id_xmt_bytes 724 ibd_state_s::id_xmt_pkt)) 725 726 /* 727 * Non-mutex protection schemes for data elements. Almost all of 728 * these are non-shared items. 
729 */ 730 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 731 callb_cpr 732 ib_gid_s 733 ib_header_info 734 ibd_acache_rq 735 ibd_acache_s::ac_mce 736 ibd_mcache::mc_fullreap 737 ibd_mcache::mc_jstate 738 ibd_mcache::mc_req 739 ibd_rwqe_s 740 ibd_swqe_s 741 ibd_wqe_s 742 ibt_wr_ds_s::ds_va 743 ibt_wr_lso_s 744 ipoib_mac::ipoib_qpn 745 mac_capab_lso_s 746 msgb::b_next 747 msgb::b_rptr 748 msgb::b_wptr)) 749 750 int 751 _init() 752 { 753 int status; 754 755 /* 756 * Sanity check some parameter settings. Tx completion polling 757 * only makes sense with separate CQs for Tx and Rx. 758 */ 759 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 760 cmn_err(CE_NOTE, "!ibd: %s", 761 "Setting ibd_txcomp_poll = 0 for combined CQ"); 762 ibd_txcomp_poll = 0; 763 } 764 765 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 766 if (status != 0) { 767 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 768 return (status); 769 } 770 771 mac_init_ops(&ibd_dev_ops, "ibd"); 772 status = mod_install(&ibd_modlinkage); 773 if (status != 0) { 774 DPRINT(10, "_init:failed in mod_install()"); 775 ddi_soft_state_fini(&ibd_list); 776 mac_fini_ops(&ibd_dev_ops); 777 return (status); 778 } 779 780 #ifdef IBD_LOGGING 781 ibd_log_init(); 782 #endif 783 return (0); 784 } 785 786 int 787 _info(struct modinfo *modinfop) 788 { 789 return (mod_info(&ibd_modlinkage, modinfop)); 790 } 791 792 int 793 _fini() 794 { 795 int status; 796 797 status = mod_remove(&ibd_modlinkage); 798 if (status != 0) 799 return (status); 800 801 mac_fini_ops(&ibd_dev_ops); 802 ddi_soft_state_fini(&ibd_list); 803 #ifdef IBD_LOGGING 804 ibd_log_fini(); 805 #endif 806 return (0); 807 } 808 809 /* 810 * Convert the GID part of the mac address from network byte order 811 * to host order. 
812 */ 813 static void 814 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 815 { 816 ib_sn_prefix_t nbopref; 817 ib_guid_t nboguid; 818 819 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 820 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 821 dgid->gid_prefix = b2h64(nbopref); 822 dgid->gid_guid = b2h64(nboguid); 823 } 824 825 /* 826 * Create the IPoIB address in network byte order from host order inputs. 827 */ 828 static void 829 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 830 ib_guid_t guid) 831 { 832 ib_sn_prefix_t nbopref; 833 ib_guid_t nboguid; 834 835 mac->ipoib_qpn = htonl(qpn); 836 nbopref = h2b64(prefix); 837 nboguid = h2b64(guid); 838 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 839 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 840 } 841 842 /* 843 * Send to the appropriate all-routers group when the IBA multicast group 844 * does not exist, based on whether the target group is v4 or v6. 845 */ 846 static boolean_t 847 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 848 ipoib_mac_t *rmac) 849 { 850 boolean_t retval = B_TRUE; 851 uint32_t adjscope = state->id_scope << 16; 852 uint32_t topword; 853 854 /* 855 * Copy the first 4 bytes in without assuming any alignment of 856 * input mac address; this will have IPoIB signature, flags and 857 * scope bits. 858 */ 859 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 860 topword = ntohl(topword); 861 862 /* 863 * Generate proper address for IPv4/v6, adding in the Pkey properly. 864 */ 865 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 866 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 867 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 868 ((uint32_t)(state->id_pkey << 16))), 869 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 870 else 871 /* 872 * Does not have proper bits in the mgid address. 
873 */ 874 retval = B_FALSE; 875 876 return (retval); 877 } 878 879 /* 880 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 881 * front of optional src/tgt link layer address. Right now Solaris inserts 882 * padding by default at the end. The routine which is doing is nce_xmit() 883 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when 884 * the packet comes down from IP layer to the IBD driver, it is in the 885 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 886 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 887 * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 888 * 889 * The send routine at IBD driver changes this packet as follows: 890 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 891 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 892 * aligned. 893 * 894 * At the receiving side again ibd_process_rx takes the above packet and 895 * removes the two bytes of front padding and inserts it at the end. This 896 * is since the IP layer does not understand padding at the front. 
897 */ 898 #define IBD_PAD_NSNA(ip6h, len, type) { \ 899 uchar_t *nd_lla_ptr; \ 900 icmp6_t *icmp6; \ 901 nd_opt_hdr_t *opt; \ 902 int i; \ 903 \ 904 icmp6 = (icmp6_t *)&ip6h[1]; \ 905 len -= sizeof (nd_neighbor_advert_t); \ 906 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 907 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 908 (len != 0)) { \ 909 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 910 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 911 ASSERT(opt != NULL); \ 912 nd_lla_ptr = (uchar_t *)&opt[1]; \ 913 if (type == IBD_SEND) { \ 914 for (i = IPOIB_ADDRL; i > 0; i--) \ 915 *(nd_lla_ptr + i + 1) = \ 916 *(nd_lla_ptr + i - 1); \ 917 } else { \ 918 for (i = 0; i < IPOIB_ADDRL; i++) \ 919 *(nd_lla_ptr + i) = \ 920 *(nd_lla_ptr + i + 2); \ 921 } \ 922 *(nd_lla_ptr + i) = 0; \ 923 *(nd_lla_ptr + i + 1) = 0; \ 924 } \ 925 } 926 927 /* 928 * Address handle entries maintained by the driver are kept in the 929 * free and active lists. Each entry starts out in the free list; 930 * it migrates to the active list when primed using ibt_get_paths() 931 * and ibt_modify_ud_dest() for transmission to a specific destination. 932 * In the active list, the entry has a reference count indicating the 933 * number of ongoing/uncompleted transmits that reference it. The 934 * entry is left in the active list even after the reference count 935 * goes to 0, since successive transmits can find it there and do 936 * not need to set up another entry (ie the path information is 937 * cached using the active list). Entries on the active list are 938 * also hashed using the destination link address as a key for faster 939 * lookups during transmits. 940 * 941 * For any destination address (unicast or multicast, whatever the 942 * join states), there will be at most one entry in the active list. 943 * Entries with a 0 reference count on the active list can be reused 944 * for a transmit to a new destination, if the free list is empty. 
945 * 946 * The AH free list insertion/deletion is protected with the id_ac_mutex, 947 * since the async thread and Tx callback handlers insert/delete. The 948 * active list does not need a lock (all operations are done by the 949 * async thread) but updates to the reference count are atomically 950 * done (increments done by Tx path, decrements by the Tx callback handler). 951 */ 952 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 953 list_insert_head(&state->id_ah_free, ce) 954 #define IBD_ACACHE_GET_FREE(state) \ 955 list_get_head(&state->id_ah_free) 956 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 957 int _ret_; \ 958 list_insert_head(&state->id_ah_active, ce); \ 959 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 960 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 961 ASSERT(_ret_ == 0); \ 962 } 963 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 964 list_remove(&state->id_ah_active, ce); \ 965 (void) mod_hash_remove(state->id_ah_active_hash, \ 966 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 967 } 968 #define IBD_ACACHE_GET_ACTIVE(state) \ 969 list_get_head(&state->id_ah_active) 970 971 /* 972 * Membership states for different mcg's are tracked by two lists: 973 * the "non" list is used for promiscuous mode, when all mcg traffic 974 * needs to be inspected. This type of membership is never used for 975 * transmission, so there can not be an AH in the active list 976 * corresponding to a member in this list. This list does not need 977 * any protection, since all operations are performed by the async 978 * thread. 979 * 980 * "Full" and "SendOnly" membership is tracked using a single list, 981 * the "full" list. This is because this single list can then be 982 * searched during transmit to a multicast group (if an AH for the 983 * mcg is not found in the active list), since at least one type 984 * of membership must be present before initiating the transmit. 
985 * This list is also emptied during driver detach, since sendonly 986 * membership acquired during transmit is dropped at detach time 987 * alongwith ipv4 broadcast full membership. Insert/deletes to 988 * this list are done only by the async thread, but it is also 989 * searched in program context (see multicast disable case), thus 990 * the id_mc_mutex protects the list. The driver detach path also 991 * deconstructs the "full" list, but it ensures that the async 992 * thread will not be accessing the list (by blocking out mcg 993 * trap handling and making sure no more Tx reaping will happen). 994 * 995 * Currently, an IBA attach is done in the SendOnly case too, 996 * although this is not required. 997 */ 998 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 999 list_insert_head(&state->id_mc_full, mce) 1000 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1001 list_insert_head(&state->id_mc_non, mce) 1002 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1003 ibd_mcache_find(mgid, &state->id_mc_full) 1004 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1005 ibd_mcache_find(mgid, &state->id_mc_non) 1006 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1007 list_remove(&state->id_mc_full, mce) 1008 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1009 list_remove(&state->id_mc_non, mce) 1010 1011 /* 1012 * AH and MCE active list manipulation: 1013 * 1014 * Multicast disable requests and MCG delete traps are two cases 1015 * where the active AH entry for the mcg (if any unreferenced one exists) 1016 * will be moved to the free list (to force the next Tx to the mcg to 1017 * join the MCG in SendOnly mode). Port up handling will also move AHs 1018 * from active to free list. 1019 * 1020 * In the case when some transmits are still pending on an entry 1021 * for an mcg, but a multicast disable has already been issued on the 1022 * mcg, there are some options to consider to preserve the join state 1023 * to ensure the emitted packet is properly routed on the IBA fabric. 
1024 * For the AH, we can 1025 * 1. take out of active list at multicast disable time. 1026 * 2. take out of active list only when last pending Tx completes. 1027 * For the MCE, we can 1028 * 3. take out of active list at multicast disable time. 1029 * 4. take out of active list only when last pending Tx completes. 1030 * 5. move from active list to stale list at multicast disable time. 1031 * We choose to use 2,4. We use option 4 so that if a multicast enable 1032 * is tried before the pending Tx completes, the enable code finds the 1033 * mce in the active list and just has to make sure it will not be reaped 1034 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1035 * a stale list (#5) that would be checked in the enable code would need 1036 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1037 * after the multicast disable would try to put an AH in the active list, 1038 * and associate the mce it finds in the active list to this new AH, 1039 * whereas the mce is already associated with the previous AH (taken off 1040 * the active list), and will be removed once the pending Tx's complete 1041 * (unless a reference count on mce's is implemented). One implication of 1042 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1043 * grab new references on the AH, further delaying the leave. 1044 * 1045 * In the case of mcg delete (or create) trap when the port is sendonly 1046 * joined, the AH and MCE handling is different: the AH and MCE has to be 1047 * immediately taken off the active lists (forcing a join and path lookup 1048 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1049 * to an mcg as it is repeatedly created and deleted and goes thru 1050 * reincarnations). 
1051 * 1052 * When a port is already sendonly joined, and a multicast enable is 1053 * attempted, the same mce structure is promoted; this ensures only a 1054 * single mce on the active list tracks the most powerful join state. 1055 * 1056 * In the case of port up event handling, the MCE for sendonly membership 1057 * is freed up, and the ACE is put into the free list as soon as possible 1058 * (depending on whether posted Tx's have completed). For fullmembership 1059 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1060 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1061 * done; else the mce is deconstructed (mc_fullreap case). 1062 * 1063 * MCG creation and deletion trap handling: 1064 * 1065 * These traps are unreliable (meaning sometimes the trap might never 1066 * be delivered to the subscribed nodes) and may arrive out-of-order 1067 * since they use UD transport. An alternative to relying on these 1068 * unreliable traps is to poll for mcg presence every so often, but 1069 * instead of doing that, we try to be as conservative as possible 1070 * while handling the traps, and hope that the traps do arrive at 1071 * the subscribed nodes soon. Note that if a node is fullmember 1072 * joined to an mcg, it can not possibly receive a mcg create/delete 1073 * trap for that mcg (by fullmember definition); if it does, it is 1074 * an old trap from a previous incarnation of the mcg. 1075 * 1076 * Whenever a trap is received, the driver cleans up its sendonly 1077 * membership to the group; we choose to do a sendonly leave even 1078 * on a creation trap to handle the case of a prior deletion of the mcg 1079 * having gone unnoticed. Consider an example scenario: 1080 * T1: MCG M is deleted, and fires off deletion trap D1. 1081 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1082 * T3: Node N tries to transmit to M, joining in sendonly mode. 1083 * T4: MCG M is deleted, and fires off deletion trap D2. 
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best. Thus, the driver just prints a warning
 * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */

/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that max reference
 * count on any handle is limited by number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
1113 */ 1114 #define CYCLEVAL 0x80000 1115 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1116 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1117 #define GET_REF(ace) ((ace)->ac_ref) 1118 #define GET_REF_CYCLE(ace) ( \ 1119 /* \ 1120 * Make sure "cycle" bit is set. \ 1121 */ \ 1122 ASSERT(CYCLE_SET(ace)), \ 1123 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1124 ) 1125 #define INC_REF(ace, num) { \ 1126 atomic_add_32(&(ace)->ac_ref, num); \ 1127 } 1128 #define SET_CYCLE_IF_REF(ace) ( \ 1129 CYCLE_SET(ace) ? B_TRUE : \ 1130 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1131 CYCLEVAL ? \ 1132 /* \ 1133 * Clear the "cycle" bit we just set; \ 1134 * ref count known to be 0 from above. \ 1135 */ \ 1136 CLEAR_REFCYCLE(ace), B_FALSE : \ 1137 /* \ 1138 * We set "cycle" bit; let caller know. \ 1139 */ \ 1140 B_TRUE \ 1141 ) 1142 #define DEC_REF_DO_CYCLE(ace) ( \ 1143 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1144 CYCLEVAL ? \ 1145 /* \ 1146 * Ref count known to be 0 from above. \ 1147 */ \ 1148 B_TRUE : \ 1149 B_FALSE \ 1150 ) 1151 1152 static void * 1153 list_get_head(list_t *list) 1154 { 1155 list_node_t *lhead = list_head(list); 1156 1157 if (lhead != NULL) 1158 list_remove(list, lhead); 1159 return (lhead); 1160 } 1161 1162 /* 1163 * This is always guaranteed to be able to queue the work. 1164 */ 1165 static void 1166 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1167 { 1168 /* Initialize request */ 1169 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1170 ptr->rq_op = op; 1171 1172 /* 1173 * Queue provided slot onto request pool. 1174 */ 1175 mutex_enter(&state->id_acache_req_lock); 1176 list_insert_tail(&state->id_req_list, ptr); 1177 1178 /* Go, fetch, async thread */ 1179 cv_signal(&state->id_acache_req_cv); 1180 mutex_exit(&state->id_acache_req_lock); 1181 } 1182 1183 /* 1184 * Main body of the per interface async thread. 
1185 */ 1186 static void 1187 ibd_async_work(ibd_state_t *state) 1188 { 1189 ibd_req_t *ptr; 1190 callb_cpr_t cprinfo; 1191 1192 mutex_enter(&state->id_acache_req_lock); 1193 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1194 callb_generic_cpr, "ibd_async_work"); 1195 1196 for (;;) { 1197 ptr = list_get_head(&state->id_req_list); 1198 if (ptr != NULL) { 1199 mutex_exit(&state->id_acache_req_lock); 1200 1201 /* 1202 * Once we have done the operation, there is no 1203 * guarantee the request slot is going to be valid, 1204 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1205 * TRAP). 1206 * 1207 * Perform the request. 1208 */ 1209 switch (ptr->rq_op) { 1210 case IBD_ASYNC_GETAH: 1211 ibd_async_acache(state, &ptr->rq_mac); 1212 break; 1213 case IBD_ASYNC_JOIN: 1214 case IBD_ASYNC_LEAVE: 1215 ibd_async_multicast(state, 1216 ptr->rq_gid, ptr->rq_op); 1217 break; 1218 case IBD_ASYNC_PROMON: 1219 ibd_async_setprom(state); 1220 break; 1221 case IBD_ASYNC_PROMOFF: 1222 ibd_async_unsetprom(state); 1223 break; 1224 case IBD_ASYNC_REAP: 1225 ibd_async_reap_group(state, 1226 ptr->rq_ptr, ptr->rq_gid, 1227 IB_MC_JSTATE_FULL); 1228 /* 1229 * the req buf contains in mce 1230 * structure, so we do not need 1231 * to free it here. 1232 */ 1233 ptr = NULL; 1234 break; 1235 case IBD_ASYNC_TRAP: 1236 ibd_async_trap(state, ptr); 1237 break; 1238 case IBD_ASYNC_SCHED: 1239 ibd_async_txsched(state); 1240 break; 1241 case IBD_ASYNC_LINK: 1242 ibd_async_link(state, ptr); 1243 break; 1244 case IBD_ASYNC_EXIT: 1245 mutex_enter(&state->id_acache_req_lock); 1246 #ifndef __lock_lint 1247 CALLB_CPR_EXIT(&cprinfo); 1248 #else 1249 mutex_exit(&state->id_acache_req_lock); 1250 #endif 1251 return; 1252 } 1253 if (ptr != NULL) 1254 kmem_cache_free(state->id_req_kmc, ptr); 1255 1256 mutex_enter(&state->id_acache_req_lock); 1257 } else { 1258 #ifndef __lock_lint 1259 /* 1260 * Nothing to do: wait till new request arrives. 
1261 */ 1262 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1263 cv_wait(&state->id_acache_req_cv, 1264 &state->id_acache_req_lock); 1265 CALLB_CPR_SAFE_END(&cprinfo, 1266 &state->id_acache_req_lock); 1267 #endif 1268 } 1269 } 1270 1271 /*NOTREACHED*/ 1272 _NOTE(NOT_REACHED) 1273 } 1274 1275 /* 1276 * Return when it is safe to queue requests to the async daemon; primarily 1277 * for subnet trap and async event handling. Disallow requests before the 1278 * daemon is created, and when interface deinitilization starts. 1279 */ 1280 static boolean_t 1281 ibd_async_safe(ibd_state_t *state) 1282 { 1283 mutex_enter(&state->id_trap_lock); 1284 if (state->id_trap_stop) { 1285 mutex_exit(&state->id_trap_lock); 1286 return (B_FALSE); 1287 } 1288 state->id_trap_inprog++; 1289 mutex_exit(&state->id_trap_lock); 1290 return (B_TRUE); 1291 } 1292 1293 /* 1294 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 1295 * trap or event handling to complete to kill the async thread and deconstruct 1296 * the mcg/ace list. 1297 */ 1298 static void 1299 ibd_async_done(ibd_state_t *state) 1300 { 1301 mutex_enter(&state->id_trap_lock); 1302 if (--state->id_trap_inprog == 0) 1303 cv_signal(&state->id_trap_cv); 1304 mutex_exit(&state->id_trap_lock); 1305 } 1306 1307 /* 1308 * Hash functions: 1309 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1310 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1311 * These operate on mac addresses input into ibd_send, but there is no 1312 * guarantee on the alignment of the ipoib_mac_t structure. 1313 */ 1314 /*ARGSUSED*/ 1315 static uint_t 1316 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1317 { 1318 ulong_t ptraddr = (ulong_t)key; 1319 uint_t hval; 1320 1321 /* 1322 * If the input address is 4 byte aligned, we can just dereference 1323 * it. This is most common, since IP will send in a 4 byte aligned 1324 * IP header, which implies the 24 byte IPoIB psuedo header will be 1325 * 4 byte aligned too. 
1326 */ 1327 if ((ptraddr & 3) == 0) 1328 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1329 1330 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1331 return (hval); 1332 } 1333 1334 static int 1335 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1336 { 1337 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1338 return (0); 1339 else 1340 return (1); 1341 } 1342 1343 /* 1344 * Initialize all the per interface caches and lists; AH cache, 1345 * MCG list etc. 1346 */ 1347 static int 1348 ibd_acache_init(ibd_state_t *state) 1349 { 1350 ibd_ace_t *ce; 1351 int i; 1352 1353 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1354 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1355 1356 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1357 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1358 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1359 offsetof(ibd_ace_t, ac_list)); 1360 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1361 offsetof(ibd_ace_t, ac_list)); 1362 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1363 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1364 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1365 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1366 offsetof(ibd_mce_t, mc_list)); 1367 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1368 offsetof(ibd_mce_t, mc_list)); 1369 list_create(&state->id_req_list, sizeof (ibd_req_t), 1370 offsetof(ibd_req_t, rq_list)); 1371 1372 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1373 IBD_NUM_AH, KM_SLEEP); 1374 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1375 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1376 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1377 ibd_acache_fini(state); 1378 return (DDI_FAILURE); 1379 } else { 1380 CLEAR_REFCYCLE(ce); 1381 ce->ac_mce = NULL; 1382 IBD_ACACHE_INSERT_FREE(state, ce); 1383 } 1384 } 1385 
return (DDI_SUCCESS); 1386 } 1387 1388 static void 1389 ibd_acache_fini(ibd_state_t *state) 1390 { 1391 ibd_ace_t *ptr; 1392 1393 mutex_enter(&state->id_ac_mutex); 1394 1395 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1396 ASSERT(GET_REF(ptr) == 0); 1397 (void) ibt_free_ud_dest(ptr->ac_dest); 1398 } 1399 1400 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1401 ASSERT(GET_REF(ptr) == 0); 1402 (void) ibt_free_ud_dest(ptr->ac_dest); 1403 } 1404 1405 list_destroy(&state->id_ah_free); 1406 list_destroy(&state->id_ah_active); 1407 list_destroy(&state->id_mc_full); 1408 list_destroy(&state->id_mc_non); 1409 list_destroy(&state->id_req_list); 1410 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1411 mutex_exit(&state->id_ac_mutex); 1412 mutex_destroy(&state->id_ac_mutex); 1413 mutex_destroy(&state->id_mc_mutex); 1414 mutex_destroy(&state->id_acache_req_lock); 1415 cv_destroy(&state->id_acache_req_cv); 1416 } 1417 1418 /* 1419 * Search AH active hash list for a cached path to input destination. 1420 * If we are "just looking", hold == F. When we are in the Tx path, 1421 * we set hold == T to grab a reference on the AH so that it can not 1422 * be recycled to a new destination while the Tx request is posted. 1423 */ 1424 static ibd_ace_t * 1425 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1426 { 1427 ibd_ace_t *ptr; 1428 1429 ASSERT(mutex_owned(&state->id_ac_mutex)); 1430 1431 /* 1432 * Do hash search. 1433 */ 1434 if (mod_hash_find(state->id_ah_active_hash, 1435 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1436 if (hold) 1437 INC_REF(ptr, num); 1438 return (ptr); 1439 } 1440 return (NULL); 1441 } 1442 1443 /* 1444 * This is called by the tx side; if an initialized AH is found in 1445 * the active list, it is locked down and can be used; if no entry 1446 * is found, an async request is queued to do path resolution. 
1447 */ 1448 static ibd_ace_t * 1449 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1450 { 1451 ibd_ace_t *ptr; 1452 ibd_req_t *req; 1453 1454 /* 1455 * Only attempt to print when we can; in the mdt pattr case, the 1456 * address is not aligned properly. 1457 */ 1458 if (((ulong_t)mac & 3) == 0) { 1459 DPRINT(4, 1460 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1461 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1462 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1463 htonl(mac->ipoib_gidsuff[1])); 1464 } 1465 1466 mutex_enter(&state->id_ac_mutex); 1467 1468 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1469 mutex_exit(&state->id_ac_mutex); 1470 return (ptr); 1471 } 1472 1473 /* 1474 * Implementation of a single outstanding async request; if 1475 * the operation is not started yet, queue a request and move 1476 * to ongoing state. Remember in id_ah_addr for which address 1477 * we are queueing the request, in case we need to flag an error; 1478 * Any further requests, for the same or different address, until 1479 * the operation completes, is sent back to GLDv3 to be retried. 1480 * The async thread will update id_ah_op with an error indication 1481 * or will set it to indicate the next look up can start; either 1482 * way, it will mac_tx_update() so that all blocked requests come 1483 * back here. 1484 */ 1485 *err = EAGAIN; 1486 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1487 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1488 if (req != NULL) { 1489 /* 1490 * We did not even find the entry; queue a request 1491 * for it. 
1492 */ 1493 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1494 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1495 state->id_ah_op = IBD_OP_ONGOING; 1496 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1497 } 1498 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1499 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1500 /* 1501 * Check the status of the pathrecord lookup request 1502 * we had queued before. 1503 */ 1504 if (state->id_ah_op == IBD_OP_ERRORED) { 1505 *err = EFAULT; 1506 state->id_ah_error++; 1507 } else { 1508 /* 1509 * IBD_OP_ROUTERED case: We need to send to the 1510 * all-router MCG. If we can find the AH for 1511 * the mcg, the Tx will be attempted. If we 1512 * do not find the AH, we return NORESOURCES 1513 * to retry. 1514 */ 1515 ipoib_mac_t routermac; 1516 1517 (void) ibd_get_allroutergroup(state, mac, &routermac); 1518 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1519 numwqe); 1520 } 1521 state->id_ah_op = IBD_OP_NOTSTARTED; 1522 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1523 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1524 /* 1525 * This case can happen when we get a higher band 1526 * packet. The easiest way is to reset the state machine 1527 * to accommodate the higher priority packet. 1528 */ 1529 state->id_ah_op = IBD_OP_NOTSTARTED; 1530 } 1531 mutex_exit(&state->id_ac_mutex); 1532 1533 return (ptr); 1534 } 1535 1536 /* 1537 * Grab a not-currently-in-use AH/PathRecord from the active 1538 * list to recycle to a new destination. Only the async thread 1539 * executes this code. 1540 */ 1541 static ibd_ace_t * 1542 ibd_acache_get_unref(ibd_state_t *state) 1543 { 1544 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1545 1546 ASSERT(mutex_owned(&state->id_ac_mutex)); 1547 1548 /* 1549 * Do plain linear search. 1550 */ 1551 while (ptr != NULL) { 1552 /* 1553 * Note that it is possible that the "cycle" bit 1554 * is set on the AH w/o any reference count. 
The 1555 * mcg must have been deleted, and the tx cleanup 1556 * just decremented the reference count to 0, but 1557 * hasn't gotten around to grabbing the id_ac_mutex 1558 * to move the AH into the free list. 1559 */ 1560 if (GET_REF(ptr) == 0) { 1561 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1562 break; 1563 } 1564 ptr = list_next(&state->id_ah_active, ptr); 1565 } 1566 return (ptr); 1567 } 1568 1569 /* 1570 * Invoked to clean up AH from active list in case of multicast 1571 * disable and to handle sendonly memberships during mcg traps. 1572 * And for port up processing for multicast and unicast AHs. 1573 * Normally, the AH is taken off the active list, and put into 1574 * the free list to be recycled for a new destination. In case 1575 * Tx requests on the AH have not completed yet, the AH is marked 1576 * for reaping (which will put the AH on the free list) once the Tx's 1577 * complete; in this case, depending on the "force" input, we take 1578 * out the AH from the active list right now, or leave it also for 1579 * the reap operation. Returns TRUE if the AH is taken off the active 1580 * list (and either put into the free list right now, or arranged for 1581 * later), FALSE otherwise. 1582 */ 1583 static boolean_t 1584 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1585 { 1586 ibd_ace_t *acactive; 1587 boolean_t ret = B_TRUE; 1588 1589 ASSERT(mutex_owned(&state->id_ac_mutex)); 1590 1591 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1592 1593 /* 1594 * Note that the AH might already have the cycle bit set 1595 * on it; this might happen if sequences of multicast 1596 * enables and disables are coming so fast, that posted 1597 * Tx's to the mcg have not completed yet, and the cycle 1598 * bit is set successively by each multicast disable. 
1599 */ 1600 if (SET_CYCLE_IF_REF(acactive)) { 1601 if (!force) { 1602 /* 1603 * The ace is kept on the active list, further 1604 * Tx's can still grab a reference on it; the 1605 * ace is reaped when all pending Tx's 1606 * referencing the AH complete. 1607 */ 1608 ret = B_FALSE; 1609 } else { 1610 /* 1611 * In the mcg trap case, we always pull the 1612 * AH from the active list. And also the port 1613 * up multi/unicast case. 1614 */ 1615 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1616 acactive->ac_mce = NULL; 1617 } 1618 } else { 1619 /* 1620 * Determined the ref count is 0, thus reclaim 1621 * immediately after pulling out the ace from 1622 * the active list. 1623 */ 1624 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1625 acactive->ac_mce = NULL; 1626 IBD_ACACHE_INSERT_FREE(state, acactive); 1627 } 1628 1629 } 1630 return (ret); 1631 } 1632 1633 /* 1634 * Helper function for async path record lookup. If we are trying to 1635 * Tx to a MCG, check our membership, possibly trying to join the 1636 * group if required. If that fails, try to send the packet to the 1637 * all router group (indicated by the redirect output), pointing 1638 * the input mac address to the router mcg address. 1639 */ 1640 static ibd_mce_t * 1641 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1642 { 1643 ib_gid_t mgid; 1644 ibd_mce_t *mce; 1645 ipoib_mac_t routermac; 1646 1647 *redirect = B_FALSE; 1648 ibd_n2h_gid(mac, &mgid); 1649 1650 /* 1651 * Check the FullMember+SendOnlyNonMember list. 1652 * Since we are the only one who manipulates the 1653 * id_mc_full list, no locks are needed. 1654 */ 1655 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1656 if (mce != NULL) { 1657 DPRINT(4, "ibd_async_mcache : already joined to group"); 1658 return (mce); 1659 } 1660 1661 /* 1662 * Not found; try to join(SendOnlyNonMember) and attach. 
1663 */ 1664 DPRINT(4, "ibd_async_mcache : not joined to group"); 1665 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1666 NULL) { 1667 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1668 return (mce); 1669 } 1670 1671 /* 1672 * MCGroup not present; try to join the all-router group. If 1673 * any of the following steps succeed, we will be redirecting 1674 * to the all router group. 1675 */ 1676 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1677 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1678 return (NULL); 1679 *redirect = B_TRUE; 1680 ibd_n2h_gid(&routermac, &mgid); 1681 bcopy(&routermac, mac, IPOIB_ADDRL); 1682 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1683 mgid.gid_prefix, mgid.gid_guid); 1684 1685 /* 1686 * Are we already joined to the router group? 1687 */ 1688 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1689 DPRINT(4, "ibd_async_mcache : using already joined router" 1690 "group\n"); 1691 return (mce); 1692 } 1693 1694 /* 1695 * Can we join(SendOnlyNonMember) the router group? 1696 */ 1697 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1698 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1699 NULL) { 1700 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1701 return (mce); 1702 } 1703 1704 return (NULL); 1705 } 1706 1707 /* 1708 * Async path record lookup code. 1709 */ 1710 static void 1711 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1712 { 1713 ibd_ace_t *ce; 1714 ibd_mce_t *mce = NULL; 1715 ibt_path_attr_t path_attr; 1716 ibt_path_info_t path_info; 1717 ib_gid_t destgid; 1718 char ret = IBD_OP_NOTSTARTED; 1719 1720 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1721 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1722 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1723 htonl(mac->ipoib_gidsuff[1])); 1724 1725 /* 1726 * Check whether we are trying to transmit to a MCG. 
1727 * In that case, we need to make sure we are a member of 1728 * the MCG. 1729 */ 1730 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1731 boolean_t redirected; 1732 1733 /* 1734 * If we can not find or join the group or even 1735 * redirect, error out. 1736 */ 1737 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1738 NULL) { 1739 state->id_ah_op = IBD_OP_ERRORED; 1740 return; 1741 } 1742 1743 /* 1744 * If we got redirected, we need to determine whether 1745 * the AH for the new mcg is in the cache already, and 1746 * not pull it in then; otherwise proceed to get the 1747 * path for the new mcg. There is no guarantee that 1748 * if the AH is currently in the cache, it will still be 1749 * there when we look in ibd_acache_lookup(), but that's 1750 * okay, we will come back here. 1751 */ 1752 if (redirected) { 1753 ret = IBD_OP_ROUTERED; 1754 DPRINT(4, "ibd_async_acache : redirected to " 1755 "%08X:%08X:%08X:%08X:%08X", 1756 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1757 htonl(mac->ipoib_gidpref[1]), 1758 htonl(mac->ipoib_gidsuff[0]), 1759 htonl(mac->ipoib_gidsuff[1])); 1760 1761 mutex_enter(&state->id_ac_mutex); 1762 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1763 state->id_ah_op = IBD_OP_ROUTERED; 1764 mutex_exit(&state->id_ac_mutex); 1765 DPRINT(4, "ibd_async_acache : router AH found"); 1766 return; 1767 } 1768 mutex_exit(&state->id_ac_mutex); 1769 } 1770 } 1771 1772 /* 1773 * Get an AH from the free list. 1774 */ 1775 mutex_enter(&state->id_ac_mutex); 1776 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1777 /* 1778 * No free ones; try to grab an unreferenced active 1779 * one. Maybe we need to make the active list LRU, 1780 * but that will create more work for Tx callbacks. 1781 * Is there a way of not having to pull out the 1782 * entry from the active list, but just indicate it 1783 * is being recycled? Yes, but that creates one more 1784 * check in the fast lookup path. 
1785 */ 1786 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1787 /* 1788 * Pretty serious shortage now. 1789 */ 1790 state->id_ah_op = IBD_OP_NOTSTARTED; 1791 mutex_exit(&state->id_ac_mutex); 1792 DPRINT(10, "ibd_async_acache : failed to find AH " 1793 "slot\n"); 1794 return; 1795 } 1796 /* 1797 * We could check whether ac_mce points to a SendOnly 1798 * member and drop that membership now. Or do it lazily 1799 * at detach time. 1800 */ 1801 ce->ac_mce = NULL; 1802 } 1803 mutex_exit(&state->id_ac_mutex); 1804 ASSERT(ce->ac_mce == NULL); 1805 1806 /* 1807 * Update the entry. 1808 */ 1809 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1810 1811 bzero(&path_info, sizeof (path_info)); 1812 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1813 path_attr.pa_sgid = state->id_sgid; 1814 path_attr.pa_num_dgids = 1; 1815 ibd_n2h_gid(&ce->ac_mac, &destgid); 1816 path_attr.pa_dgids = &destgid; 1817 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1818 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1819 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1820 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1821 goto error; 1822 } 1823 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1824 ntohl(ce->ac_mac.ipoib_qpn), 1825 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1826 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1827 goto error; 1828 } 1829 1830 /* 1831 * mce is set whenever an AH is being associated with a 1832 * MCG; this will come in handy when we leave the MCG. The 1833 * lock protects Tx fastpath from scanning the active list. 1834 */ 1835 if (mce != NULL) 1836 ce->ac_mce = mce; 1837 mutex_enter(&state->id_ac_mutex); 1838 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1839 state->id_ah_op = ret; 1840 mutex_exit(&state->id_ac_mutex); 1841 return; 1842 error: 1843 /* 1844 * We might want to drop SendOnly membership here if we 1845 * joined above. 
The lock protects Tx callbacks inserting 1846 * into the free list. 1847 */ 1848 mutex_enter(&state->id_ac_mutex); 1849 state->id_ah_op = IBD_OP_ERRORED; 1850 IBD_ACACHE_INSERT_FREE(state, ce); 1851 mutex_exit(&state->id_ac_mutex); 1852 } 1853 1854 /* 1855 * While restoring port's presence on the subnet on a port up, it is possible 1856 * that the port goes down again. 1857 */ 1858 static void 1859 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1860 { 1861 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1862 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1863 LINK_STATE_UP; 1864 ibd_mce_t *mce, *pmce; 1865 ibd_ace_t *ace, *pace; 1866 1867 DPRINT(10, "ibd_async_link(): %d", opcode); 1868 1869 /* 1870 * On a link up, revalidate the link speed/width. No point doing 1871 * this on a link down, since we will be unable to do SA operations, 1872 * defaulting to the lowest speed. Also notice that we update our 1873 * notion of speed before calling mac_link_update(), which will do 1874 * neccesary higher level notifications for speed changes. 1875 */ 1876 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1877 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1878 state->id_link_speed = ibd_get_portspeed(state); 1879 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1880 } 1881 1882 /* 1883 * Do all the work required to establish our presence on 1884 * the subnet. 1885 */ 1886 if (opcode == IBD_LINK_UP_ABSENT) { 1887 /* 1888 * If in promiscuous mode ... 1889 */ 1890 if (state->id_prom_op == IBD_OP_COMPLETED) { 1891 /* 1892 * Drop all nonmembership. 1893 */ 1894 ibd_async_unsetprom(state); 1895 1896 /* 1897 * Then, try to regain nonmembership to all mcg's. 1898 */ 1899 ibd_async_setprom(state); 1900 1901 } 1902 1903 /* 1904 * Drop all sendonly membership (which also gets rid of the 1905 * AHs); try to reacquire all full membership. 
1906 */ 1907 mce = list_head(&state->id_mc_full); 1908 while ((pmce = mce) != NULL) { 1909 mce = list_next(&state->id_mc_full, mce); 1910 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1911 ibd_leave_group(state, 1912 pmce->mc_info.mc_adds_vect.av_dgid, 1913 IB_MC_JSTATE_SEND_ONLY_NON); 1914 else 1915 ibd_reacquire_group(state, pmce); 1916 } 1917 1918 /* 1919 * Recycle all active AHs to free list (and if there are 1920 * pending posts, make sure they will go into the free list 1921 * once the Tx's complete). Grab the lock to prevent 1922 * concurrent Tx's as well as Tx cleanups. 1923 */ 1924 mutex_enter(&state->id_ac_mutex); 1925 ace = list_head(&state->id_ah_active); 1926 while ((pace = ace) != NULL) { 1927 boolean_t cycled; 1928 1929 ace = list_next(&state->id_ah_active, ace); 1930 mce = pace->ac_mce; 1931 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1932 B_TRUE); 1933 /* 1934 * If this is for an mcg, it must be for a fullmember, 1935 * since we got rid of send-only members above when 1936 * processing the mce list. 1937 */ 1938 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1939 IB_MC_JSTATE_FULL))); 1940 1941 /* 1942 * Check if the fullmember mce needs to be torn down, 1943 * ie whether the DLPI disable has already been done. 1944 * If so, do some of the work of tx_cleanup, namely 1945 * causing leave (which will fail), detach and 1946 * mce-freeing. tx_cleanup will put the AH into free 1947 * list. The reason to duplicate some of this 1948 * tx_cleanup work is because we want to delete the 1949 * AH right now instead of waiting for tx_cleanup, to 1950 * force subsequent Tx's to reacquire an AH. 
1951 */ 1952 if ((mce != NULL) && (mce->mc_fullreap)) 1953 ibd_async_reap_group(state, mce, 1954 mce->mc_info.mc_adds_vect.av_dgid, 1955 mce->mc_jstate); 1956 } 1957 mutex_exit(&state->id_ac_mutex); 1958 } 1959 1960 /* 1961 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1962 * (which stops further events from being delivered) before 1963 * mac_unregister(). At this point, it is guaranteed that mac_register 1964 * has already been done. 1965 */ 1966 mutex_enter(&state->id_link_mutex); 1967 state->id_link_state = lstate; 1968 mac_link_update(state->id_mh, lstate); 1969 mutex_exit(&state->id_link_mutex); 1970 1971 ibd_async_done(state); 1972 } 1973 1974 /* 1975 * Check the pkey table to see if we can find the pkey we're looking for. 1976 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1977 * failure. 1978 */ 1979 static int 1980 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1981 uint16_t *pkix) 1982 { 1983 uint16_t ndx; 1984 1985 ASSERT(pkix != NULL); 1986 1987 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1988 if (pkey_tbl[ndx] == pkey) { 1989 *pkix = ndx; 1990 return (0); 1991 } 1992 } 1993 return (-1); 1994 } 1995 1996 /* 1997 * When the link is notified up, we need to do a few things, based 1998 * on the port's current p_init_type_reply claiming a reinit has been 1999 * done or not. The reinit steps are: 2000 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2001 * the old Pkey and GID0 are correct. 2002 * 2. Register for mcg traps (already done by ibmf). 2003 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2004 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2005 * 4. Give up all sendonly memberships. 2006 * 5. Acquire all full memberships. 2007 * 6. In promiscuous mode, acquire all non memberships. 2008 * 7. Recycle all AHs to free list. 
2009 */ 2010 static void 2011 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2012 { 2013 ibt_hca_portinfo_t *port_infop = NULL; 2014 ibt_status_t ibt_status; 2015 uint_t psize, port_infosz; 2016 ibd_link_op_t opcode; 2017 ibd_req_t *req; 2018 link_state_t new_link_state = LINK_STATE_UP; 2019 uint8_t itreply; 2020 uint16_t pkix; 2021 int ret; 2022 2023 /* 2024 * Let's not race with a plumb or an unplumb; if we detect a 2025 * pkey relocation event later on here, we may have to restart. 2026 */ 2027 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2028 2029 mutex_enter(&state->id_link_mutex); 2030 2031 /* 2032 * If the init code in ibd_m_start hasn't yet set up the 2033 * pkey/gid, nothing to do; that code will set the link state. 2034 */ 2035 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2036 mutex_exit(&state->id_link_mutex); 2037 goto link_mod_return; 2038 } 2039 2040 /* 2041 * If this routine was called in response to a port down event, 2042 * we just need to see if this should be informed. 2043 */ 2044 if (code == IBT_ERROR_PORT_DOWN) { 2045 new_link_state = LINK_STATE_DOWN; 2046 goto update_link_state; 2047 } 2048 2049 /* 2050 * If it's not a port down event we've received, try to get the port 2051 * attributes first. If we fail here, the port is as good as down. 2052 * Otherwise, if the link went down by the time the handler gets 2053 * here, give up - we cannot even validate the pkey/gid since those 2054 * are not valid and this is as bad as a port down anyway. 2055 */ 2056 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2057 &port_infop, &psize, &port_infosz); 2058 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2059 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2060 new_link_state = LINK_STATE_DOWN; 2061 goto update_link_state; 2062 } 2063 2064 /* 2065 * Check the SM InitTypeReply flags. 
If both NoLoadReply and 2066 * PreserveContentReply are 0, we don't know anything about the 2067 * data loaded into the port attributes, so we need to verify 2068 * if gid0 and pkey are still valid. 2069 */ 2070 itreply = port_infop->p_init_type_reply; 2071 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2072 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2073 /* 2074 * Check to see if the subnet part of GID0 has changed. If 2075 * not, check the simple case first to see if the pkey 2076 * index is the same as before; finally check to see if the 2077 * pkey has been relocated to a different index in the table. 2078 */ 2079 if (bcmp(port_infop->p_sgid_tbl, 2080 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2081 2082 new_link_state = LINK_STATE_DOWN; 2083 2084 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2085 state->id_pkey) { 2086 2087 new_link_state = LINK_STATE_UP; 2088 2089 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2090 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2091 2092 ibt_free_portinfo(port_infop, port_infosz); 2093 mutex_exit(&state->id_link_mutex); 2094 2095 /* 2096 * Currently a restart is required if our pkey has moved 2097 * in the pkey table. If we get the ibt_recycle_ud() to 2098 * work as documented (expected), we may be able to 2099 * avoid a complete restart. Note that we've already 2100 * marked both the start and stop 'in-progress' flags, 2101 * so it is ok to go ahead and do this restart. 
2102 */ 2103 ibd_undo_start(state, LINK_STATE_DOWN); 2104 if ((ret = ibd_start(state)) != 0) { 2105 DPRINT(10, "ibd_restart: cannot restart, " 2106 "ret=%d", ret); 2107 } 2108 2109 goto link_mod_return; 2110 } else { 2111 new_link_state = LINK_STATE_DOWN; 2112 } 2113 } 2114 2115 update_link_state: 2116 if (port_infop) { 2117 ibt_free_portinfo(port_infop, port_infosz); 2118 } 2119 2120 /* 2121 * If the old state is the same as the new state, nothing to do 2122 */ 2123 if (state->id_link_state == new_link_state) { 2124 mutex_exit(&state->id_link_mutex); 2125 goto link_mod_return; 2126 } 2127 2128 /* 2129 * Ok, so there was a link state change; see if it's safe to ask 2130 * the async thread to do the work 2131 */ 2132 if (!ibd_async_safe(state)) { 2133 state->id_link_state = new_link_state; 2134 mutex_exit(&state->id_link_mutex); 2135 goto link_mod_return; 2136 } 2137 2138 mutex_exit(&state->id_link_mutex); 2139 2140 /* 2141 * If we're reporting a link up, check InitTypeReply to see if 2142 * the SM has ensured that the port's presence in mcg, traps, 2143 * etc. is intact. 2144 */ 2145 if (new_link_state == LINK_STATE_DOWN) { 2146 opcode = IBD_LINK_DOWN; 2147 } else { 2148 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2149 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2150 opcode = IBD_LINK_UP; 2151 } else { 2152 opcode = IBD_LINK_UP_ABSENT; 2153 } 2154 } 2155 2156 /* 2157 * Queue up a request for ibd_async_link() to handle this link 2158 * state change event 2159 */ 2160 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2161 req->rq_ptr = (void *)opcode; 2162 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2163 2164 link_mod_return: 2165 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2166 } 2167 2168 /* 2169 * For the port up/down events, IBTL guarantees there will not be concurrent 2170 * invocations of the handler. 
IBTL might coalesce link transition events, 2171 * and not invoke the handler for _each_ up/down transition, but it will 2172 * invoke the handler with last known state 2173 */ 2174 static void 2175 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2176 ibt_async_code_t code, ibt_async_event_t *event) 2177 { 2178 ibd_state_t *state = (ibd_state_t *)clnt_private; 2179 2180 switch (code) { 2181 case IBT_ERROR_CATASTROPHIC_CHAN: 2182 ibd_print_warn(state, "catastrophic channel error"); 2183 break; 2184 case IBT_ERROR_CQ: 2185 ibd_print_warn(state, "completion queue error"); 2186 break; 2187 case IBT_PORT_CHANGE_EVENT: 2188 /* 2189 * Events will be delivered to all instances that have 2190 * done ibt_open_hca() but not yet done ibt_close_hca(). 2191 * Only need to do work for our port; IBTF will deliver 2192 * events for other ports on the hca we have ibt_open_hca'ed 2193 * too. Note that id_port is initialized in ibd_attach() 2194 * before we do an ibt_open_hca() in ibd_attach(). 2195 */ 2196 ASSERT(state->id_hca_hdl == hca_hdl); 2197 if (state->id_port != event->ev_port) 2198 break; 2199 2200 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2201 IBT_PORT_CHANGE_PKEY) { 2202 ibd_link_mod(state, code); 2203 } 2204 break; 2205 case IBT_ERROR_PORT_DOWN: 2206 case IBT_CLNT_REREG_EVENT: 2207 case IBT_EVENT_PORT_UP: 2208 /* 2209 * Events will be delivered to all instances that have 2210 * done ibt_open_hca() but not yet done ibt_close_hca(). 2211 * Only need to do work for our port; IBTF will deliver 2212 * events for other ports on the hca we have ibt_open_hca'ed 2213 * too. Note that id_port is initialized in ibd_attach() 2214 * before we do an ibt_open_hca() in ibd_attach(). 
2215 */ 2216 ASSERT(state->id_hca_hdl == hca_hdl); 2217 if (state->id_port != event->ev_port) 2218 break; 2219 2220 ibd_link_mod(state, code); 2221 break; 2222 2223 case IBT_HCA_ATTACH_EVENT: 2224 case IBT_HCA_DETACH_EVENT: 2225 /* 2226 * When a new card is plugged to the system, attach_event is 2227 * invoked. Additionally, a cfgadm needs to be run to make the 2228 * card known to the system, and an ifconfig needs to be run to 2229 * plumb up any ibd interfaces on the card. In the case of card 2230 * unplug, a cfgadm is run that will trigger any RCM scripts to 2231 * unplumb the ibd interfaces on the card; when the card is 2232 * actually unplugged, the detach_event is invoked; 2233 * additionally, if any ibd instances are still active on the 2234 * card (eg there were no associated RCM scripts), driver's 2235 * detach routine is invoked. 2236 */ 2237 break; 2238 default: 2239 break; 2240 } 2241 } 2242 2243 static int 2244 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2245 { 2246 mac_register_t *macp; 2247 int ret; 2248 2249 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2250 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2251 return (DDI_FAILURE); 2252 } 2253 2254 /* 2255 * Note that when we register with mac during attach, we don't 2256 * have the id_macaddr yet, so we'll simply be registering a 2257 * zero macaddr that we'll overwrite later during plumb (in 2258 * ibd_m_start()). Similar is the case with id_mtu - we'll 2259 * update the mac layer with the correct mtu during plumb. 
2260 */ 2261 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2262 macp->m_driver = state; 2263 macp->m_dip = dip; 2264 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2265 macp->m_callbacks = &ibd_m_callbacks; 2266 macp->m_min_sdu = 0; 2267 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2268 2269 /* 2270 * Register ourselves with the GLDv3 interface 2271 */ 2272 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2273 mac_free(macp); 2274 DPRINT(10, 2275 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2276 return (DDI_FAILURE); 2277 } 2278 2279 mac_free(macp); 2280 return (DDI_SUCCESS); 2281 } 2282 2283 static int 2284 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2285 { 2286 ibt_hca_attr_t hca_attrs; 2287 ibt_status_t ibt_status; 2288 2289 /* 2290 * Query the HCA and fetch its attributes 2291 */ 2292 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2293 ASSERT(ibt_status == IBT_SUCCESS); 2294 2295 /* 2296 * 1. Set the Hardware Checksum capability. Currently we only consider 2297 * full checksum offload. 2298 */ 2299 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2300 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2301 } 2302 2303 /* 2304 * 2. Set LSO policy, capability and maximum length 2305 */ 2306 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2307 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2308 state->id_lso_policy = B_TRUE; 2309 } else { 2310 state->id_lso_policy = B_FALSE; 2311 } 2312 2313 /* 2314 * Work-around for Bug 6866957. Ignore policy from ibd.conf. 2315 * Turn off LSO forcibly. Remove it when the work-around is no longer 2316 * needed. 
2317 */ 2318 if (ibd_force_lso_disable) { 2319 state->id_lso_policy = B_FALSE; 2320 } 2321 /* End of Workaround */ 2322 2323 if (hca_attrs.hca_max_lso_size > 0) { 2324 state->id_lso_capable = B_TRUE; 2325 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2326 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2327 else 2328 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2329 } else { 2330 state->id_lso_capable = B_FALSE; 2331 state->id_lso_maxlen = 0; 2332 } 2333 2334 /* 2335 * 3. Set Reserved L_Key capability 2336 */ 2337 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2338 state->id_hca_res_lkey_capab = 1; 2339 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2340 } 2341 2342 /* 2343 * 4. Set maximum sqseg value after checking to see if extended sgl 2344 * size information is provided by the hca 2345 */ 2346 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2347 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2348 } else { 2349 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2350 } 2351 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2352 state->id_max_sqseg = IBD_MAX_SQSEG; 2353 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2354 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2355 state->id_max_sqseg, IBD_MAX_SQSEG); 2356 } 2357 2358 /* 2359 * 5. 
Set number of recv and send wqes after checking hca maximum 2360 * channel size 2361 */ 2362 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2363 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2364 } else { 2365 state->id_num_rwqe = IBD_NUM_RWQE; 2366 } 2367 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2368 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2369 } else { 2370 state->id_num_swqe = IBD_NUM_SWQE; 2371 } 2372 2373 return (DDI_SUCCESS); 2374 } 2375 2376 static int 2377 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2378 { 2379 int instance; 2380 uint32_t progress = state->id_mac_state; 2381 ibt_status_t ret; 2382 2383 if (progress & IBD_DRV_MAC_REGISTERED) { 2384 (void) mac_unregister(state->id_mh); 2385 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2386 } 2387 2388 if (progress & IBD_DRV_PD_ALLOCD) { 2389 if ((ret = ibt_free_pd(state->id_hca_hdl, 2390 state->id_pd_hdl)) != IBT_SUCCESS) { 2391 ibd_print_warn(state, "failed to free " 2392 "protection domain, ret=%d", ret); 2393 } 2394 state->id_pd_hdl = NULL; 2395 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2396 } 2397 2398 if (progress & IBD_DRV_HCA_OPENED) { 2399 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2400 IBT_SUCCESS) { 2401 ibd_print_warn(state, "failed to close " 2402 "HCA device, ret=%d", ret); 2403 } 2404 state->id_hca_hdl = NULL; 2405 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2406 } 2407 2408 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2409 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2410 ibd_print_warn(state, 2411 "ibt_detach() failed, ret=%d", ret); 2412 } 2413 state->id_ibt_hdl = NULL; 2414 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2415 } 2416 2417 if (progress & IBD_DRV_TXINTR_ADDED) { 2418 ddi_remove_softintr(state->id_tx); 2419 state->id_tx = NULL; 2420 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2421 } 2422 2423 if (progress & IBD_DRV_RXINTR_ADDED) { 2424 ddi_remove_softintr(state->id_rx); 2425 state->id_rx = NULL; 2426 state->id_mac_state 
&= (~IBD_DRV_RXINTR_ADDED); 2427 } 2428 2429 if (progress & IBD_DRV_STATE_INITIALIZED) { 2430 ibd_state_fini(state); 2431 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2432 } 2433 2434 instance = ddi_get_instance(dip); 2435 ddi_soft_state_free(ibd_list, instance); 2436 2437 return (DDI_SUCCESS); 2438 } 2439 2440 /* 2441 * Attach device to the IO framework. 2442 */ 2443 static int 2444 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2445 { 2446 ibd_state_t *state = NULL; 2447 ib_guid_t hca_guid; 2448 int instance; 2449 ibt_status_t ret; 2450 int rv; 2451 2452 /* 2453 * IBD doesn't support suspend/resume 2454 */ 2455 if (cmd != DDI_ATTACH) 2456 return (DDI_FAILURE); 2457 2458 /* 2459 * Allocate softstate structure 2460 */ 2461 instance = ddi_get_instance(dip); 2462 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2463 return (DDI_FAILURE); 2464 state = ddi_get_soft_state(ibd_list, instance); 2465 2466 /* 2467 * Initialize mutexes and condition variables 2468 */ 2469 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2470 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2471 goto attach_fail; 2472 } 2473 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2474 2475 /* 2476 * Allocate rx,tx softintr 2477 */ 2478 if (ibd_rx_softintr == 1) { 2479 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2480 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2481 DPRINT(10, "ibd_attach: failed in " 2482 "ddi_add_softintr(id_rx), ret=%d", rv); 2483 goto attach_fail; 2484 } 2485 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2486 } 2487 if (ibd_tx_softintr == 1) { 2488 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2489 NULL, NULL, ibd_tx_recycle, 2490 (caddr_t)state)) != DDI_SUCCESS) { 2491 DPRINT(10, "ibd_attach: failed in " 2492 "ddi_add_softintr(id_tx), ret=%d", rv); 2493 goto attach_fail; 2494 } 2495 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2496 } 2497 2498 /* 2499 * Obtain IBA P_Key, port number and HCA guid and 
validate 2500 * them (for P_Key, only full members are allowed as per 2501 * IPoIB specification; neither port number nor HCA guid 2502 * can be zero) 2503 */ 2504 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2505 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2506 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2507 state->id_pkey); 2508 goto attach_fail; 2509 } 2510 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2511 "port-number", 0)) == 0) { 2512 DPRINT(10, "ibd_attach: invalid port number (%d)", 2513 state->id_port); 2514 goto attach_fail; 2515 } 2516 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2517 "hca-guid", 0)) == 0) { 2518 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2519 hca_guid); 2520 goto attach_fail; 2521 } 2522 2523 /* 2524 * Attach to IBTL 2525 */ 2526 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2527 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2528 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2529 goto attach_fail; 2530 } 2531 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2532 2533 /* 2534 * Open the HCA 2535 */ 2536 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2537 &state->id_hca_hdl)) != IBT_SUCCESS) { 2538 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2539 goto attach_fail; 2540 } 2541 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2542 2543 /* 2544 * Record capabilities 2545 */ 2546 (void) ibd_record_capab(state, dip); 2547 2548 /* 2549 * Allocate a protection domain on the HCA 2550 */ 2551 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2552 &state->id_pd_hdl)) != IBT_SUCCESS) { 2553 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2554 goto attach_fail; 2555 } 2556 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2557 2558 2559 /* 2560 * Register ibd interfaces with the Nemo framework 2561 */ 2562 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 2563 DPRINT(10, "ibd_attach: failed 
in ibd_register_mac()"); 2564 goto attach_fail; 2565 } 2566 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2567 2568 /* 2569 * We're done with everything we could to make the attach 2570 * succeed. All the buffer allocations and IPoIB broadcast 2571 * group joins are deferred to when the interface instance 2572 * is actually plumbed to avoid wasting memory. 2573 */ 2574 return (DDI_SUCCESS); 2575 2576 attach_fail: 2577 ibd_unattach(state, dip); 2578 return (DDI_FAILURE); 2579 } 2580 2581 /* 2582 * Detach device from the IO framework. 2583 */ 2584 static int 2585 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2586 { 2587 ibd_state_t *state; 2588 int instance; 2589 2590 /* 2591 * IBD doesn't support suspend/resume 2592 */ 2593 if (cmd != DDI_DETACH) 2594 return (DDI_FAILURE); 2595 2596 /* 2597 * Get the instance softstate 2598 */ 2599 instance = ddi_get_instance(dip); 2600 state = ddi_get_soft_state(ibd_list, instance); 2601 2602 /* 2603 * Release all resources we're holding still. Note that if we'd 2604 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2605 * so far, we should find all the flags we need in id_mac_state. 
2606 */ 2607 (void) ibd_unattach(state, dip); 2608 2609 return (DDI_SUCCESS); 2610 } 2611 2612 /* 2613 * Pre ibt_attach() driver initialization 2614 */ 2615 static int 2616 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2617 { 2618 char buf[64]; 2619 2620 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2621 state->id_link_state = LINK_STATE_UNKNOWN; 2622 2623 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2624 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2625 state->id_trap_stop = B_TRUE; 2626 state->id_trap_inprog = 0; 2627 2628 mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2629 state->id_dip = dip; 2630 2631 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2632 2633 state->id_tx_list.dl_head = NULL; 2634 state->id_tx_list.dl_tail = NULL; 2635 state->id_tx_list.dl_pending_sends = B_FALSE; 2636 state->id_tx_list.dl_cnt = 0; 2637 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2638 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2639 state->id_tx_busy = 0; 2640 2641 state->id_rx_list.dl_head = NULL; 2642 state->id_rx_list.dl_tail = NULL; 2643 state->id_rx_list.dl_bufs_outstanding = 0; 2644 state->id_rx_list.dl_cnt = 0; 2645 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2646 mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); 2647 2648 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2649 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2650 0, NULL, NULL, NULL, NULL, NULL, 0); 2651 2652 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2653 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2654 2655 return (DDI_SUCCESS); 2656 } 2657 2658 /* 2659 * Post ibt_detach() driver deconstruction 2660 */ 2661 static void 2662 ibd_state_fini(ibd_state_t *state) 2663 { 2664 cv_destroy(&state->id_macst_cv); 2665 mutex_destroy(&state->id_macst_lock); 2666 2667 kmem_cache_destroy(state->id_req_kmc); 2668 2669 
mutex_destroy(&state->id_rxpost_lock); 2670 mutex_destroy(&state->id_rx_list.dl_mutex); 2671 2672 mutex_destroy(&state->id_txpost_lock); 2673 mutex_destroy(&state->id_tx_list.dl_mutex); 2674 2675 mutex_destroy(&state->id_sched_lock); 2676 mutex_destroy(&state->id_cq_poll_lock); 2677 2678 cv_destroy(&state->id_trap_cv); 2679 mutex_destroy(&state->id_trap_lock); 2680 mutex_destroy(&state->id_link_mutex); 2681 } 2682 2683 /* 2684 * Fetch link speed from SA for snmp ifspeed reporting. 2685 */ 2686 static uint64_t 2687 ibd_get_portspeed(ibd_state_t *state) 2688 { 2689 int ret; 2690 ibt_path_info_t path; 2691 ibt_path_attr_t path_attr; 2692 uint8_t num_paths; 2693 uint64_t ifspeed; 2694 2695 /* 2696 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2697 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2698 * 2000000000. Start with that as default. 2699 */ 2700 ifspeed = 2000000000; 2701 2702 bzero(&path_attr, sizeof (path_attr)); 2703 2704 /* 2705 * Get the port speed from Loopback path information. 2706 */ 2707 path_attr.pa_dgids = &state->id_sgid; 2708 path_attr.pa_num_dgids = 1; 2709 path_attr.pa_sgid = state->id_sgid; 2710 2711 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2712 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2713 goto earlydone; 2714 2715 if (num_paths < 1) 2716 goto earlydone; 2717 2718 /* 2719 * In case SA does not return an expected value, report the default 2720 * speed as 1X. 
2721 */ 2722 ret = 1; 2723 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2724 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2725 ret = 1; 2726 break; 2727 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2728 ret = 4; 2729 break; 2730 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2731 ret = 12; 2732 break; 2733 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2734 ret = 2; 2735 break; 2736 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2737 ret = 8; 2738 break; 2739 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2740 ret = 16; 2741 break; 2742 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2743 ret = 24; 2744 break; 2745 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2746 ret = 32; 2747 break; 2748 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2749 ret = 48; 2750 break; 2751 } 2752 2753 ifspeed *= ret; 2754 2755 earlydone: 2756 return (ifspeed); 2757 } 2758 2759 /* 2760 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2761 * representing the input mcg mgid. 2762 */ 2763 static ibd_mce_t * 2764 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2765 { 2766 ibd_mce_t *ptr = list_head(mlist); 2767 2768 /* 2769 * Do plain linear search. 2770 */ 2771 while (ptr != NULL) { 2772 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2773 sizeof (ib_gid_t)) == 0) 2774 return (ptr); 2775 ptr = list_next(mlist, ptr); 2776 } 2777 return (NULL); 2778 } 2779 2780 /* 2781 * Execute IBA JOIN. 
2782 */ 2783 static ibt_status_t 2784 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2785 { 2786 ibt_mcg_attr_t mcg_attr; 2787 2788 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2789 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2790 mcg_attr.mc_mgid = mgid; 2791 mcg_attr.mc_join_state = mce->mc_jstate; 2792 mcg_attr.mc_scope = state->id_scope; 2793 mcg_attr.mc_pkey = state->id_pkey; 2794 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2795 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2796 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2797 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2798 NULL, NULL)); 2799 } 2800 2801 /* 2802 * This code JOINs the port in the proper way (depending on the join 2803 * state) so that IBA fabric will forward mcg packets to/from the port. 2804 * It also attaches the QPN to the mcg so it can receive those mcg 2805 * packets. This code makes sure not to attach the mcg to the QP if 2806 * that has been previously done due to the mcg being joined with a 2807 * different join state, even though this is not required by SWG_0216, 2808 * refid 3610. 2809 */ 2810 static ibd_mce_t * 2811 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2812 { 2813 ibt_status_t ibt_status; 2814 ibd_mce_t *mce, *tmce, *omce = NULL; 2815 boolean_t do_attach = B_TRUE; 2816 2817 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2818 jstate, mgid.gid_prefix, mgid.gid_guid); 2819 2820 /* 2821 * For enable_multicast Full member joins, we need to do some 2822 * extra work. If there is already an mce on the list that 2823 * indicates full membership, that means the membership has 2824 * not yet been dropped (since the disable_multicast was issued) 2825 * because there are pending Tx's to the mcg; in that case, just 2826 * mark the mce not to be reaped when the Tx completion queues 2827 * an async reap operation. 
2828 * 2829 * If there is already an mce on the list indicating sendonly 2830 * membership, try to promote to full membership. Be careful 2831 * not to deallocate the old mce, since there might be an AH 2832 * pointing to it; instead, update the old mce with new data 2833 * that tracks the full membership. 2834 */ 2835 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2836 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2837 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2838 ASSERT(omce->mc_fullreap); 2839 omce->mc_fullreap = B_FALSE; 2840 return (omce); 2841 } else { 2842 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2843 } 2844 } 2845 2846 /* 2847 * Allocate the ibd_mce_t to track this JOIN. 2848 */ 2849 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2850 mce->mc_fullreap = B_FALSE; 2851 mce->mc_jstate = jstate; 2852 2853 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2854 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2855 ibt_status); 2856 kmem_free(mce, sizeof (ibd_mce_t)); 2857 return (NULL); 2858 } 2859 2860 /* 2861 * Is an IBA attach required? Not if the interface is already joined 2862 * to the mcg in a different appropriate join state. 2863 */ 2864 if (jstate == IB_MC_JSTATE_NON) { 2865 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2866 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2867 do_attach = B_FALSE; 2868 } else if (jstate == IB_MC_JSTATE_FULL) { 2869 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2870 do_attach = B_FALSE; 2871 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2872 do_attach = B_FALSE; 2873 } 2874 2875 if (do_attach) { 2876 /* 2877 * Do the IBA attach. 
2878 */ 2879 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2880 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2881 &mce->mc_info)) != IBT_SUCCESS) { 2882 DPRINT(10, "ibd_join_group : failed qp attachment " 2883 "%d\n", ibt_status); 2884 /* 2885 * NOTE that we should probably preserve the join info 2886 * in the list and later try to leave again at detach 2887 * time. 2888 */ 2889 (void) ibt_leave_mcg(state->id_sgid, mgid, 2890 state->id_sgid, jstate); 2891 kmem_free(mce, sizeof (ibd_mce_t)); 2892 return (NULL); 2893 } 2894 } 2895 2896 /* 2897 * Insert the ibd_mce_t in the proper list. 2898 */ 2899 if (jstate == IB_MC_JSTATE_NON) { 2900 IBD_MCACHE_INSERT_NON(state, mce); 2901 } else { 2902 /* 2903 * Set up the mc_req fields used for reaping the 2904 * mcg in case of delayed tx completion (see 2905 * ibd_tx_cleanup()). Also done for sendonly join in 2906 * case we are promoted to fullmembership later and 2907 * keep using the same mce. 2908 */ 2909 mce->mc_req.rq_gid = mgid; 2910 mce->mc_req.rq_ptr = mce; 2911 /* 2912 * Check whether this is the case of trying to join 2913 * full member, and we were already joined send only. 2914 * We try to drop our SendOnly membership, but it is 2915 * possible that the mcg does not exist anymore (and 2916 * the subnet trap never reached us), so the leave 2917 * operation might fail. 2918 */ 2919 if (omce != NULL) { 2920 (void) ibt_leave_mcg(state->id_sgid, mgid, 2921 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2922 omce->mc_jstate = IB_MC_JSTATE_FULL; 2923 bcopy(&mce->mc_info, &omce->mc_info, 2924 sizeof (ibt_mcg_info_t)); 2925 kmem_free(mce, sizeof (ibd_mce_t)); 2926 return (omce); 2927 } 2928 mutex_enter(&state->id_mc_mutex); 2929 IBD_MCACHE_INSERT_FULL(state, mce); 2930 mutex_exit(&state->id_mc_mutex); 2931 } 2932 2933 return (mce); 2934 } 2935 2936 /* 2937 * Called during port up event handling to attempt to reacquire full 2938 * membership to an mcg. Stripped down version of ibd_join_group(). 
2939 * Note that it is possible that the mcg might have gone away, and 2940 * gets recreated at this point. 2941 */ 2942 static void 2943 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2944 { 2945 ib_gid_t mgid; 2946 2947 /* 2948 * If the mc_fullreap flag is set, or this join fails, a subsequent 2949 * reap/leave is going to try to leave the group. We could prevent 2950 * that by adding a boolean flag into ibd_mce_t, if required. 2951 */ 2952 if (mce->mc_fullreap) 2953 return; 2954 2955 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2956 2957 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2958 mgid.gid_guid); 2959 2960 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2961 ibd_print_warn(state, "Failure on port up to rejoin " 2962 "multicast gid %016llx:%016llx", 2963 (u_longlong_t)mgid.gid_prefix, 2964 (u_longlong_t)mgid.gid_guid); 2965 } 2966 2967 /* 2968 * This code handles delayed Tx completion cleanups for mcg's to which 2969 * disable_multicast has been issued, regular mcg related cleanups during 2970 * disable_multicast, disable_promiscous and mcg traps, as well as 2971 * cleanups during driver detach time. Depending on the join state, 2972 * it deletes the mce from the appropriate list and issues the IBA 2973 * leave/detach; except in the disable_multicast case when the mce 2974 * is left on the active list for a subsequent Tx completion cleanup. 2975 */ 2976 static void 2977 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2978 uint8_t jstate) 2979 { 2980 ibd_mce_t *tmce; 2981 boolean_t do_detach = B_TRUE; 2982 2983 /* 2984 * Before detaching, we must check whether the other list 2985 * contains the mcg; if we detach blindly, the consumer 2986 * who set up the other list will also stop receiving 2987 * traffic. 2988 */ 2989 if (jstate == IB_MC_JSTATE_FULL) { 2990 /* 2991 * The following check is only relevant while coming 2992 * from the Tx completion path in the reap case. 
2993 */ 2994 if (!mce->mc_fullreap) 2995 return; 2996 mutex_enter(&state->id_mc_mutex); 2997 IBD_MCACHE_PULLOUT_FULL(state, mce); 2998 mutex_exit(&state->id_mc_mutex); 2999 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3000 do_detach = B_FALSE; 3001 } else if (jstate == IB_MC_JSTATE_NON) { 3002 IBD_MCACHE_PULLOUT_NON(state, mce); 3003 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3004 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3005 do_detach = B_FALSE; 3006 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3007 mutex_enter(&state->id_mc_mutex); 3008 IBD_MCACHE_PULLOUT_FULL(state, mce); 3009 mutex_exit(&state->id_mc_mutex); 3010 do_detach = B_FALSE; 3011 } 3012 3013 /* 3014 * If we are reacting to a mcg trap and leaving our sendonly or 3015 * non membership, the mcg is possibly already gone, so attempting 3016 * to leave might fail. On the other hand, we must try to leave 3017 * anyway, since this might be a trap from long ago, and we could 3018 * have potentially sendonly joined to a recent incarnation of 3019 * the mcg and are about to loose track of this information. 3020 */ 3021 if (do_detach) { 3022 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3023 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3024 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3025 } 3026 3027 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3028 kmem_free(mce, sizeof (ibd_mce_t)); 3029 } 3030 3031 /* 3032 * Async code executed due to multicast and promiscuous disable requests 3033 * and mcg trap handling; also executed during driver detach. Mostly, a 3034 * leave and detach is done; except for the fullmember case when Tx 3035 * requests are pending, whence arrangements are made for subsequent 3036 * cleanup on Tx completion. 
3037 */ 3038 static void 3039 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3040 { 3041 ipoib_mac_t mcmac; 3042 boolean_t recycled; 3043 ibd_mce_t *mce; 3044 3045 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3046 jstate, mgid.gid_prefix, mgid.gid_guid); 3047 3048 if (jstate == IB_MC_JSTATE_NON) { 3049 recycled = B_TRUE; 3050 mce = IBD_MCACHE_FIND_NON(state, mgid); 3051 /* 3052 * In case we are handling a mcg trap, we might not find 3053 * the mcg in the non list. 3054 */ 3055 if (mce == NULL) { 3056 return; 3057 } 3058 } else { 3059 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3060 3061 /* 3062 * In case we are handling a mcg trap, make sure the trap 3063 * is not arriving late; if we have an mce that indicates 3064 * that we are already a fullmember, that would be a clear 3065 * indication that the trap arrived late (ie, is for a 3066 * previous incarnation of the mcg). 3067 */ 3068 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3069 if ((mce == NULL) || (mce->mc_jstate == 3070 IB_MC_JSTATE_FULL)) { 3071 return; 3072 } 3073 } else { 3074 ASSERT(jstate == IB_MC_JSTATE_FULL); 3075 3076 /* 3077 * If join group failed, mce will be NULL here. 3078 * This is because in GLDv3 driver, set multicast 3079 * will always return success. 3080 */ 3081 if (mce == NULL) { 3082 return; 3083 } 3084 3085 mce->mc_fullreap = B_TRUE; 3086 } 3087 3088 /* 3089 * If no pending Tx's remain that reference the AH 3090 * for the mcg, recycle it from active to free list. 3091 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3092 * so the last completing Tx will cause an async reap 3093 * operation to be invoked, at which time we will drop our 3094 * membership to the mcg so that the pending Tx's complete 3095 * successfully. Refer to comments on "AH and MCE active 3096 * list manipulation" at top of this file. The lock protects 3097 * against Tx fast path and Tx cleanup code. 
3098 */ 3099 mutex_enter(&state->id_ac_mutex); 3100 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3101 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3102 IB_MC_JSTATE_SEND_ONLY_NON)); 3103 mutex_exit(&state->id_ac_mutex); 3104 } 3105 3106 if (recycled) { 3107 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3108 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3109 ibd_async_reap_group(state, mce, mgid, jstate); 3110 } 3111 } 3112 3113 /* 3114 * Find the broadcast address as defined by IPoIB; implicitly 3115 * determines the IBA scope, mtu, tclass etc of the link the 3116 * interface is going to be a member of. 3117 */ 3118 static ibt_status_t 3119 ibd_find_bgroup(ibd_state_t *state) 3120 { 3121 ibt_mcg_attr_t mcg_attr; 3122 uint_t numg; 3123 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3124 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3125 IB_MC_SCOPE_GLOBAL }; 3126 int i, mcgmtu; 3127 boolean_t found = B_FALSE; 3128 int ret; 3129 ibt_mcg_info_t mcg_info; 3130 3131 state->id_bgroup_created = B_FALSE; 3132 3133 query_bcast_grp: 3134 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3135 mcg_attr.mc_pkey = state->id_pkey; 3136 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3137 3138 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3139 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3140 3141 /* 3142 * Look for the IPoIB broadcast group. 3143 */ 3144 state->id_mgid.gid_prefix = 3145 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3146 ((uint64_t)state->id_scope << 48) | 3147 ((uint32_t)(state->id_pkey << 16))); 3148 mcg_attr.mc_mgid = state->id_mgid; 3149 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3150 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3151 found = B_TRUE; 3152 break; 3153 } 3154 } 3155 3156 if (!found) { 3157 if (ibd_create_broadcast_group) { 3158 /* 3159 * If we created the broadcast group, but failed to 3160 * find it, we can't do anything except leave the 3161 * one we created and return failure. 
3162 */ 3163 if (state->id_bgroup_created) { 3164 ibd_print_warn(state, "IPoIB broadcast group " 3165 "absent. Unable to query after create."); 3166 goto find_bgroup_fail; 3167 } 3168 3169 /* 3170 * Create the ipoib broadcast group if it didn't exist 3171 */ 3172 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3173 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3174 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3175 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3176 mcg_attr.mc_pkey = state->id_pkey; 3177 mcg_attr.mc_flow = 0; 3178 mcg_attr.mc_sl = 0; 3179 mcg_attr.mc_tclass = 0; 3180 state->id_mgid.gid_prefix = 3181 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3182 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3183 ((uint32_t)(state->id_pkey << 16))); 3184 mcg_attr.mc_mgid = state->id_mgid; 3185 3186 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3187 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3188 ibd_print_warn(state, "IPoIB broadcast group " 3189 "absent, create failed: ret = %d\n", ret); 3190 state->id_bgroup_created = B_FALSE; 3191 return (IBT_FAILURE); 3192 } 3193 state->id_bgroup_created = B_TRUE; 3194 goto query_bcast_grp; 3195 } else { 3196 ibd_print_warn(state, "IPoIB broadcast group absent"); 3197 return (IBT_FAILURE); 3198 } 3199 } 3200 3201 /* 3202 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 
3203 */ 3204 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3205 if (state->id_mtu < mcgmtu) { 3206 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3207 "greater than port's maximum MTU %d", mcgmtu, 3208 state->id_mtu); 3209 ibt_free_mcg_info(state->id_mcinfo, 1); 3210 goto find_bgroup_fail; 3211 } 3212 state->id_mtu = mcgmtu; 3213 3214 return (IBT_SUCCESS); 3215 3216 find_bgroup_fail: 3217 if (state->id_bgroup_created) { 3218 (void) ibt_leave_mcg(state->id_sgid, 3219 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3220 IB_MC_JSTATE_FULL); 3221 } 3222 3223 return (IBT_FAILURE); 3224 } 3225 3226 static int 3227 ibd_alloc_tx_copybufs(ibd_state_t *state) 3228 { 3229 ibt_mr_attr_t mem_attr; 3230 3231 /* 3232 * Allocate one big chunk for all regular tx copy bufs 3233 */ 3234 state->id_tx_buf_sz = state->id_mtu; 3235 if (state->id_lso_policy && state->id_lso_capable && 3236 (IBD_TX_BUF_SZ > state->id_mtu)) { 3237 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3238 } 3239 3240 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3241 state->id_tx_buf_sz, KM_SLEEP); 3242 3243 /* 3244 * Do one memory registration on the entire txbuf area 3245 */ 3246 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3247 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3248 mem_attr.mr_as = NULL; 3249 mem_attr.mr_flags = IBT_MR_SLEEP; 3250 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3251 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3252 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3253 kmem_free(state->id_tx_bufs, 3254 state->id_num_swqe * state->id_tx_buf_sz); 3255 state->id_tx_bufs = NULL; 3256 return (DDI_FAILURE); 3257 } 3258 3259 return (DDI_SUCCESS); 3260 } 3261 3262 static int 3263 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3264 { 3265 ibt_mr_attr_t mem_attr; 3266 ibd_lsobuf_t *buflist; 3267 ibd_lsobuf_t *lbufp; 3268 ibd_lsobuf_t *tail; 3269 ibd_lsobkt_t *bktp; 3270 uint8_t *membase; 3271 uint8_t *memp; 3272 uint_t 
memsz; 3273 int i; 3274 3275 /* 3276 * Allocate the lso bucket 3277 */ 3278 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3279 3280 /* 3281 * Allocate the entire lso memory and register it 3282 */ 3283 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3284 membase = kmem_zalloc(memsz, KM_SLEEP); 3285 3286 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3287 mem_attr.mr_len = memsz; 3288 mem_attr.mr_as = NULL; 3289 mem_attr.mr_flags = IBT_MR_SLEEP; 3290 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3291 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3292 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3293 kmem_free(membase, memsz); 3294 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3295 return (DDI_FAILURE); 3296 } 3297 3298 /* 3299 * Now allocate the buflist. Note that the elements in the buflist and 3300 * the buffers in the lso memory have a permanent 1-1 relation, so we 3301 * can always derive the address of a buflist entry from the address of 3302 * an lso buffer. 3303 */ 3304 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3305 KM_SLEEP); 3306 3307 /* 3308 * Set up the lso buf chain 3309 */ 3310 memp = membase; 3311 lbufp = buflist; 3312 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3313 lbufp->lb_isfree = 1; 3314 lbufp->lb_buf = memp; 3315 lbufp->lb_next = lbufp + 1; 3316 3317 tail = lbufp; 3318 3319 memp += IBD_LSO_BUFSZ; 3320 lbufp++; 3321 } 3322 tail->lb_next = NULL; 3323 3324 /* 3325 * Set up the LSO buffer information in ibd state 3326 */ 3327 bktp->bkt_bufl = buflist; 3328 bktp->bkt_free_head = buflist; 3329 bktp->bkt_mem = membase; 3330 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3331 bktp->bkt_nfree = bktp->bkt_nelem; 3332 3333 state->id_lso = bktp; 3334 3335 return (DDI_SUCCESS); 3336 } 3337 3338 /* 3339 * Statically allocate Tx buffer list(s). 
3340 */ 3341 static int 3342 ibd_init_txlist(ibd_state_t *state) 3343 { 3344 ibd_swqe_t *swqe; 3345 ibt_lkey_t lkey; 3346 int i; 3347 3348 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3349 return (DDI_FAILURE); 3350 3351 if (state->id_lso_policy && state->id_lso_capable) { 3352 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3353 state->id_lso_policy = B_FALSE; 3354 } 3355 3356 /* 3357 * Allocate and setup the swqe list 3358 */ 3359 lkey = state->id_tx_mr_desc.md_lkey; 3360 for (i = 0; i < state->id_num_swqe; i++) { 3361 if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { 3362 DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); 3363 ibd_fini_txlist(state); 3364 return (DDI_FAILURE); 3365 } 3366 3367 /* add to list */ 3368 state->id_tx_list.dl_cnt++; 3369 if (state->id_tx_list.dl_head == NULL) { 3370 swqe->swqe_prev = NULL; 3371 swqe->swqe_next = NULL; 3372 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3373 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3374 } else { 3375 swqe->swqe_prev = state->id_tx_list.dl_tail; 3376 swqe->swqe_next = NULL; 3377 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3378 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3379 } 3380 } 3381 3382 return (DDI_SUCCESS); 3383 } 3384 3385 static int 3386 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3387 uint32_t *nds_p) 3388 { 3389 ibd_lsobkt_t *bktp; 3390 ibd_lsobuf_t *lbufp; 3391 ibd_lsobuf_t *nextp; 3392 ibt_lkey_t lso_lkey; 3393 uint_t frag_sz; 3394 uint_t num_needed; 3395 int i; 3396 3397 ASSERT(sgl_p != NULL); 3398 ASSERT(nds_p != NULL); 3399 ASSERT(req_sz != 0); 3400 3401 /* 3402 * Determine how many bufs we'd need for the size requested 3403 */ 3404 num_needed = req_sz / IBD_LSO_BUFSZ; 3405 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3406 num_needed++; 3407 3408 mutex_enter(&state->id_lso_lock); 3409 3410 /* 3411 * If we don't have enough lso bufs, return failure 3412 */ 3413 ASSERT(state->id_lso != NULL); 3414 bktp = state->id_lso; 
3415 if (bktp->bkt_nfree < num_needed) { 3416 mutex_exit(&state->id_lso_lock); 3417 return (-1); 3418 } 3419 3420 /* 3421 * Pick the first 'num_needed' bufs from the free list 3422 */ 3423 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3424 lbufp = bktp->bkt_free_head; 3425 for (i = 0; i < num_needed; i++) { 3426 ASSERT(lbufp->lb_isfree != 0); 3427 ASSERT(lbufp->lb_buf != NULL); 3428 3429 nextp = lbufp->lb_next; 3430 3431 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3432 sgl_p[i].ds_key = lso_lkey; 3433 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3434 3435 lbufp->lb_isfree = 0; 3436 lbufp->lb_next = NULL; 3437 3438 lbufp = nextp; 3439 } 3440 bktp->bkt_free_head = lbufp; 3441 3442 /* 3443 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3444 * to adjust the last sgl entry's length. Since we know we need atleast 3445 * one, the i-1 use below is ok. 3446 */ 3447 if (frag_sz) { 3448 sgl_p[i-1].ds_len = frag_sz; 3449 } 3450 3451 /* 3452 * Update nfree count and return 3453 */ 3454 bktp->bkt_nfree -= num_needed; 3455 3456 mutex_exit(&state->id_lso_lock); 3457 3458 *nds_p = num_needed; 3459 3460 return (0); 3461 } 3462 3463 static void 3464 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3465 { 3466 ibd_lsobkt_t *bktp; 3467 ibd_lsobuf_t *lbufp; 3468 uint8_t *lso_mem_end; 3469 uint_t ndx; 3470 int i; 3471 3472 mutex_enter(&state->id_lso_lock); 3473 3474 bktp = state->id_lso; 3475 ASSERT(bktp != NULL); 3476 3477 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3478 for (i = 0; i < nds; i++) { 3479 uint8_t *va; 3480 3481 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3482 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3483 3484 /* 3485 * Figure out the buflist element this sgl buffer corresponds 3486 * to and put it back at the head 3487 */ 3488 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3489 lbufp = bktp->bkt_bufl + ndx; 3490 3491 ASSERT(lbufp->lb_isfree == 0); 3492 ASSERT(lbufp->lb_buf == va); 3493 3494 lbufp->lb_isfree = 1; 
3495 lbufp->lb_next = bktp->bkt_free_head; 3496 bktp->bkt_free_head = lbufp; 3497 } 3498 bktp->bkt_nfree += nds; 3499 3500 mutex_exit(&state->id_lso_lock); 3501 } 3502 3503 static void 3504 ibd_free_tx_copybufs(ibd_state_t *state) 3505 { 3506 /* 3507 * Unregister txbuf mr 3508 */ 3509 if (ibt_deregister_mr(state->id_hca_hdl, 3510 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3511 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3512 } 3513 state->id_tx_mr_hdl = NULL; 3514 3515 /* 3516 * Free txbuf memory 3517 */ 3518 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3519 state->id_tx_bufs = NULL; 3520 } 3521 3522 static void 3523 ibd_free_tx_lsobufs(ibd_state_t *state) 3524 { 3525 ibd_lsobkt_t *bktp; 3526 3527 mutex_enter(&state->id_lso_lock); 3528 3529 if ((bktp = state->id_lso) == NULL) { 3530 mutex_exit(&state->id_lso_lock); 3531 return; 3532 } 3533 3534 /* 3535 * First, free the buflist 3536 */ 3537 ASSERT(bktp->bkt_bufl != NULL); 3538 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3539 3540 /* 3541 * Unregister the LSO memory and free it 3542 */ 3543 ASSERT(bktp->bkt_mr_hdl != NULL); 3544 if (ibt_deregister_mr(state->id_hca_hdl, 3545 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3546 DPRINT(10, 3547 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3548 } 3549 ASSERT(bktp->bkt_mem); 3550 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3551 3552 /* 3553 * Finally free the bucket 3554 */ 3555 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3556 state->id_lso = NULL; 3557 3558 mutex_exit(&state->id_lso_lock); 3559 } 3560 3561 /* 3562 * Free the statically allocated Tx buffer list. 
3563 */ 3564 static void 3565 ibd_fini_txlist(ibd_state_t *state) 3566 { 3567 ibd_swqe_t *node; 3568 3569 /* 3570 * Free the allocated swqes 3571 */ 3572 mutex_enter(&state->id_tx_list.dl_mutex); 3573 while (state->id_tx_list.dl_head != NULL) { 3574 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3575 state->id_tx_list.dl_head = node->swqe_next; 3576 ASSERT(state->id_tx_list.dl_cnt > 0); 3577 state->id_tx_list.dl_cnt--; 3578 ibd_free_swqe(state, node); 3579 } 3580 mutex_exit(&state->id_tx_list.dl_mutex); 3581 3582 ibd_free_tx_lsobufs(state); 3583 ibd_free_tx_copybufs(state); 3584 } 3585 3586 /* 3587 * Allocate a single send wqe and register it so it is almost 3588 * ready to be posted to the hardware. 3589 */ 3590 static int 3591 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) 3592 { 3593 ibd_swqe_t *swqe; 3594 3595 swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); 3596 *wqe = swqe; 3597 3598 swqe->swqe_type = IBD_WQE_SEND; 3599 swqe->swqe_next = NULL; 3600 swqe->swqe_prev = NULL; 3601 swqe->swqe_im_mblk = NULL; 3602 3603 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3604 (state->id_tx_bufs + ndx * state->id_tx_buf_sz); 3605 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3606 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3607 3608 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3609 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3610 swqe->w_swr.wr_trans = IBT_UD_SRV; 3611 3612 /* These are set in send */ 3613 swqe->w_swr.wr_nds = 0; 3614 swqe->w_swr.wr_sgl = NULL; 3615 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3616 3617 return (DDI_SUCCESS); 3618 } 3619 3620 /* 3621 * Free an allocated send wqe. 3622 */ 3623 /*ARGSUSED*/ 3624 static void 3625 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3626 { 3627 kmem_free(swqe, sizeof (ibd_swqe_t)); 3628 } 3629 3630 /* 3631 * Post a rwqe to the hardware and add it to the Rx list. 
The 3632 * "recycle" parameter indicates whether an old rwqe is being 3633 * recycled, or this is a new one. 3634 */ 3635 static int 3636 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3637 { 3638 ibt_status_t ibt_status; 3639 3640 if (recycle == B_FALSE) { 3641 mutex_enter(&state->id_rx_list.dl_mutex); 3642 if (state->id_rx_list.dl_head == NULL) { 3643 rwqe->rwqe_prev = NULL; 3644 rwqe->rwqe_next = NULL; 3645 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3646 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3647 } else { 3648 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3649 rwqe->rwqe_next = NULL; 3650 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3651 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3652 } 3653 mutex_exit(&state->id_rx_list.dl_mutex); 3654 } 3655 3656 mutex_enter(&state->id_rxpost_lock); 3657 if (state->id_rx_busy) { 3658 rwqe->w_post_link = NULL; 3659 if (state->id_rx_head) 3660 *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; 3661 else 3662 state->id_rx_head = rwqe; 3663 state->id_rx_tailp = &(rwqe->w_post_link); 3664 } else { 3665 state->id_rx_busy = 1; 3666 do { 3667 mutex_exit(&state->id_rxpost_lock); 3668 3669 /* 3670 * Here we should add dl_cnt before post recv, because 3671 * we would have to make sure dl_cnt is updated before 3672 * the corresponding ibd_process_rx() is called. 
3673 */ 3674 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3675 3676 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3677 &rwqe->w_rwr, 1, NULL); 3678 if (ibt_status != IBT_SUCCESS) { 3679 (void) atomic_add_32_nv( 3680 &state->id_rx_list.dl_cnt, -1); 3681 ibd_print_warn(state, "ibd_post_recv: " 3682 "posting failed, ret=%d", ibt_status); 3683 return (DDI_FAILURE); 3684 } 3685 3686 mutex_enter(&state->id_rxpost_lock); 3687 rwqe = state->id_rx_head; 3688 if (rwqe) { 3689 state->id_rx_head = 3690 (ibd_rwqe_t *)(rwqe->w_post_link); 3691 } 3692 } while (rwqe); 3693 state->id_rx_busy = 0; 3694 } 3695 mutex_exit(&state->id_rxpost_lock); 3696 3697 return (DDI_SUCCESS); 3698 } 3699 3700 /* 3701 * Allocate the statically allocated Rx buffer list. 3702 */ 3703 static int 3704 ibd_init_rxlist(ibd_state_t *state) 3705 { 3706 ibd_rwqe_t *rwqe; 3707 int i; 3708 3709 for (i = 0; i < state->id_num_rwqe; i++) { 3710 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3711 ibd_fini_rxlist(state); 3712 return (DDI_FAILURE); 3713 } 3714 3715 if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { 3716 ibd_free_rwqe(state, rwqe); 3717 ibd_fini_rxlist(state); 3718 return (DDI_FAILURE); 3719 } 3720 } 3721 3722 return (DDI_SUCCESS); 3723 } 3724 3725 /* 3726 * Free the statically allocated Rx buffer list. 3727 * 3728 */ 3729 static void 3730 ibd_fini_rxlist(ibd_state_t *state) 3731 { 3732 ibd_rwqe_t *node; 3733 3734 mutex_enter(&state->id_rx_list.dl_mutex); 3735 while (state->id_rx_list.dl_head != NULL) { 3736 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3737 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3738 ASSERT(state->id_rx_list.dl_cnt > 0); 3739 state->id_rx_list.dl_cnt--; 3740 3741 ibd_free_rwqe(state, node); 3742 } 3743 mutex_exit(&state->id_rx_list.dl_mutex); 3744 } 3745 3746 /* 3747 * Allocate a single recv wqe and register it so it is almost 3748 * ready to be posted to the hardware. 
3749 */ 3750 static int 3751 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3752 { 3753 ibt_mr_attr_t mem_attr; 3754 ibd_rwqe_t *rwqe; 3755 3756 if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3757 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3758 return (DDI_FAILURE); 3759 } 3760 *wqe = rwqe; 3761 rwqe->rwqe_type = IBD_WQE_RECV; 3762 rwqe->w_state = state; 3763 rwqe->rwqe_next = NULL; 3764 rwqe->rwqe_prev = NULL; 3765 rwqe->w_freeing_wqe = B_FALSE; 3766 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3767 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3768 3769 rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3770 IPOIB_GRH_SIZE, KM_NOSLEEP); 3771 if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { 3772 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3773 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3774 return (DDI_FAILURE); 3775 } 3776 3777 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3778 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3779 NULL) { 3780 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3781 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3782 state->id_mtu + IPOIB_GRH_SIZE); 3783 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3784 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3785 return (DDI_FAILURE); 3786 } 3787 3788 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3789 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3790 mem_attr.mr_as = NULL; 3791 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3792 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3793 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3794 IBT_SUCCESS) { 3795 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3796 rwqe->w_freeing_wqe = B_TRUE; 3797 freemsg(rwqe->rwqe_im_mblk); 3798 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3799 state->id_mtu + IPOIB_GRH_SIZE); 3800 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3801 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 
3802 return (DDI_FAILURE); 3803 } 3804 3805 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3806 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3807 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3808 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3809 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3810 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3811 rwqe->w_rwr.wr_nds = 1; 3812 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3813 3814 return (DDI_SUCCESS); 3815 } 3816 3817 /* 3818 * Free an allocated recv wqe. 3819 */ 3820 static void 3821 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3822 { 3823 if (ibt_deregister_mr(state->id_hca_hdl, 3824 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3825 DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); 3826 return; 3827 } 3828 3829 /* 3830 * Indicate to the callback function that this rwqe/mblk 3831 * should not be recycled. The freemsg() will invoke 3832 * ibd_freemsg_cb(). 3833 */ 3834 if (rwqe->rwqe_im_mblk != NULL) { 3835 rwqe->w_freeing_wqe = B_TRUE; 3836 freemsg(rwqe->rwqe_im_mblk); 3837 } 3838 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3839 state->id_mtu + IPOIB_GRH_SIZE); 3840 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3841 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3842 } 3843 3844 /* 3845 * Delete the rwqe being freed from the rx list. 3846 */ 3847 static void 3848 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3849 { 3850 mutex_enter(&state->id_rx_list.dl_mutex); 3851 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3852 state->id_rx_list.dl_head = rwqe->rwqe_next; 3853 else 3854 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3855 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3856 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3857 else 3858 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3859 mutex_exit(&state->id_rx_list.dl_mutex); 3860 } 3861 3862 /* 3863 * IBA Rx/Tx completion queue handler. Guaranteed to be single 3864 * threaded and nonreentrant for this CQ. 
When using combined CQ, 3865 * this handles Tx and Rx completions. With separate CQs, this handles 3866 * only Rx completions. 3867 */ 3868 /* ARGSUSED */ 3869 static void 3870 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3871 { 3872 ibd_state_t *state = (ibd_state_t *)arg; 3873 3874 atomic_add_64(&state->id_num_intrs, 1); 3875 3876 if (ibd_rx_softintr == 1) 3877 ddi_trigger_softintr(state->id_rx); 3878 else 3879 (void) ibd_intr((char *)state); 3880 } 3881 3882 /* 3883 * Separate CQ handler for Tx completions, when the Tx CQ is in 3884 * interrupt driven mode. 3885 */ 3886 /* ARGSUSED */ 3887 static void 3888 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3889 { 3890 ibd_state_t *state = (ibd_state_t *)arg; 3891 3892 atomic_add_64(&state->id_num_intrs, 1); 3893 3894 if (ibd_tx_softintr == 1) 3895 ddi_trigger_softintr(state->id_tx); 3896 else 3897 (void) ibd_tx_recycle((char *)state); 3898 } 3899 3900 /* 3901 * Multicast group create/delete trap handler. These will be delivered 3902 * on a kernel thread (handling can thus block) and can be invoked 3903 * concurrently. The handler can be invoked anytime after it is 3904 * registered and before ibt_detach(). 3905 */ 3906 /* ARGSUSED */ 3907 static void 3908 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 3909 ibt_subnet_event_t *event) 3910 { 3911 ibd_state_t *state = (ibd_state_t *)arg; 3912 ibd_req_t *req; 3913 3914 /* 3915 * The trap handler will get invoked once for every event for 3916 * evert port. The input "gid" is the GID0 of the port the 3917 * trap came in on; we just need to act on traps that came 3918 * to our port, meaning the port on which the ipoib interface 3919 * resides. Since ipoib uses GID0 of the port, we just match 3920 * the gids to check whether we need to handle the trap. 
3921 */ 3922 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 3923 return; 3924 3925 DPRINT(10, "ibd_notices_handler : %d\n", code); 3926 3927 switch (code) { 3928 case IBT_SM_EVENT_UNAVAILABLE: 3929 /* 3930 * If we are in promiscuous mode or have 3931 * sendnonmembers, we need to print a warning 3932 * message right now. Else, just store the 3933 * information, print when we enter promiscuous 3934 * mode or attempt nonmember send. We might 3935 * also want to stop caching sendnonmember. 3936 */ 3937 ibd_print_warn(state, "IBA multicast support " 3938 "degraded due to unavailability of multicast " 3939 "traps"); 3940 break; 3941 case IBT_SM_EVENT_AVAILABLE: 3942 /* 3943 * If we printed a warning message above or 3944 * while trying to nonmember send or get into 3945 * promiscuous mode, print an okay message. 3946 */ 3947 ibd_print_warn(state, "IBA multicast support " 3948 "restored due to availability of multicast " 3949 "traps"); 3950 break; 3951 case IBT_SM_EVENT_MCG_CREATED: 3952 case IBT_SM_EVENT_MCG_DELETED: 3953 /* 3954 * Common processing of creation/deletion traps. 3955 * First check if the instance is being 3956 * [de]initialized; back off then, without doing 3957 * anything more, since we are not sure if the 3958 * async thread is around, or whether we might 3959 * be racing with the detach code in ibd_m_stop() 3960 * that scans the mcg list. 3961 */ 3962 if (!ibd_async_safe(state)) 3963 return; 3964 3965 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 3966 req->rq_gid = event->sm_notice_gid; 3967 req->rq_ptr = (void *)code; 3968 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 3969 break; 3970 } 3971 } 3972 3973 static void 3974 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 3975 { 3976 ib_gid_t mgid = req->rq_gid; 3977 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 3978 3979 DPRINT(10, "ibd_async_trap : %d\n", code); 3980 3981 /* 3982 * Atomically search the nonmember and sendonlymember lists and 3983 * delete. 
3984 */ 3985 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 3986 3987 if (state->id_prom_op == IBD_OP_COMPLETED) { 3988 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 3989 3990 /* 3991 * If in promiscuous mode, try to join/attach to the new 3992 * mcg. Given the unreliable out-of-order mode of trap 3993 * delivery, we can never be sure whether it is a problem 3994 * if the join fails. Thus, we warn the admin of a failure 3995 * if this was a creation trap. Note that the trap might 3996 * actually be reporting a long past event, and the mcg 3997 * might already have been deleted, thus we might be warning 3998 * in vain. 3999 */ 4000 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4001 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4002 ibd_print_warn(state, "IBA promiscuous mode missed " 4003 "new multicast gid %016llx:%016llx", 4004 (u_longlong_t)mgid.gid_prefix, 4005 (u_longlong_t)mgid.gid_guid); 4006 } 4007 4008 /* 4009 * Free the request slot allocated by the subnet event thread. 4010 */ 4011 ibd_async_done(state); 4012 } 4013 4014 /* 4015 * GLDv3 entry point to get capabilities. 4016 */ 4017 static boolean_t 4018 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4019 { 4020 ibd_state_t *state = arg; 4021 4022 switch (cap) { 4023 case MAC_CAPAB_HCKSUM: { 4024 uint32_t *txflags = cap_data; 4025 4026 /* 4027 * We either do full checksum or not do it at all 4028 */ 4029 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4030 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4031 else 4032 return (B_FALSE); 4033 break; 4034 } 4035 4036 case MAC_CAPAB_LSO: { 4037 mac_capab_lso_t *cap_lso = cap_data; 4038 4039 /* 4040 * In addition to the capability and policy, since LSO 4041 * relies on hw checksum, we'll not enable LSO if we 4042 * don't have hw checksum. Of course, if the HCA doesn't 4043 * provide the reserved lkey capability, enabling LSO will 4044 * actually affect performance adversely, so we'll disable 4045 * LSO even for that case. 
4046 */ 4047 if (!state->id_lso_policy || !state->id_lso_capable) 4048 return (B_FALSE); 4049 4050 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4051 return (B_FALSE); 4052 4053 if (state->id_hca_res_lkey_capab == 0) { 4054 ibd_print_warn(state, "no reserved-lkey capability, " 4055 "disabling LSO"); 4056 return (B_FALSE); 4057 } 4058 4059 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4060 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4061 break; 4062 } 4063 4064 default: 4065 return (B_FALSE); 4066 } 4067 4068 return (B_TRUE); 4069 } 4070 4071 static int 4072 ibd_get_port_details(ibd_state_t *state) 4073 { 4074 ibt_hca_portinfo_t *port_infop; 4075 ibt_status_t ret; 4076 uint_t psize, port_infosz; 4077 4078 mutex_enter(&state->id_link_mutex); 4079 4080 /* 4081 * Query for port information 4082 */ 4083 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4084 &port_infop, &psize, &port_infosz); 4085 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4086 mutex_exit(&state->id_link_mutex); 4087 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4088 "failed, ret=%d", ret); 4089 return (ENETDOWN); 4090 } 4091 4092 /* 4093 * If the link already went down by the time we get here, 4094 * give up 4095 */ 4096 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4097 mutex_exit(&state->id_link_mutex); 4098 ibt_free_portinfo(port_infop, port_infosz); 4099 DPRINT(10, "ibd_get_port_details: port is not active"); 4100 return (ENETDOWN); 4101 } 4102 4103 /* 4104 * If the link is active, verify the pkey 4105 */ 4106 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4107 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4108 mutex_exit(&state->id_link_mutex); 4109 ibt_free_portinfo(port_infop, port_infosz); 4110 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4111 "failed, ret=%d", ret); 4112 return (ENONET); 4113 } 4114 4115 state->id_mtu = (128 << port_infop->p_mtu); 4116 state->id_sgid = *port_infop->p_sgid_tbl; 4117 
state->id_link_state = LINK_STATE_UP; 4118 4119 mutex_exit(&state->id_link_mutex); 4120 ibt_free_portinfo(port_infop, port_infosz); 4121 4122 /* 4123 * Now that the port is active, record the port speed 4124 */ 4125 state->id_link_speed = ibd_get_portspeed(state); 4126 4127 return (0); 4128 } 4129 4130 static int 4131 ibd_alloc_cqs(ibd_state_t *state) 4132 { 4133 ibt_hca_attr_t hca_attrs; 4134 ibt_cq_attr_t cq_attr; 4135 ibt_status_t ret; 4136 uint32_t real_size; 4137 4138 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4139 ASSERT(ret == IBT_SUCCESS); 4140 4141 /* 4142 * Allocate Rx/combined CQ: 4143 * Theoretically, there is no point in having more than #rwqe 4144 * plus #swqe cqe's, except that the CQ will be signalled for 4145 * overflow when the last wqe completes, if none of the previous 4146 * cqe's have been polled. Thus, we allocate just a few less wqe's 4147 * to make sure such overflow does not occur. 4148 */ 4149 cq_attr.cq_sched = NULL; 4150 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4151 4152 if (ibd_separate_cqs == 1) { 4153 /* 4154 * Allocate Receive CQ. 4155 */ 4156 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4157 cq_attr.cq_size = state->id_num_rwqe + 1; 4158 } else { 4159 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4160 state->id_num_rwqe = cq_attr.cq_size - 1; 4161 } 4162 4163 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4164 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4165 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4166 "failed, ret=%d\n", ret); 4167 return (DDI_FAILURE); 4168 } 4169 4170 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4171 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4172 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4173 "moderation failed, ret=%d\n", ret); 4174 } 4175 4176 state->id_rxwcs_size = state->id_num_rwqe + 1; 4177 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4178 state->id_rxwcs_size, KM_SLEEP); 4179 4180 /* 4181 * Allocate Send CQ. 
4182 */ 4183 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4184 cq_attr.cq_size = state->id_num_swqe + 1; 4185 } else { 4186 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4187 state->id_num_swqe = cq_attr.cq_size - 1; 4188 } 4189 4190 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4191 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4192 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4193 "failed, ret=%d\n", ret); 4194 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4195 state->id_rxwcs_size); 4196 (void) ibt_free_cq(state->id_rcq_hdl); 4197 return (DDI_FAILURE); 4198 } 4199 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4200 IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { 4201 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4202 "moderation failed, ret=%d\n", ret); 4203 } 4204 4205 state->id_txwcs_size = state->id_num_swqe + 1; 4206 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4207 state->id_txwcs_size, KM_SLEEP); 4208 } else { 4209 /* 4210 * Allocate combined Send/Receive CQ. 
4211 */ 4212 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 4213 state->id_num_swqe + 1)) { 4214 cq_attr.cq_size = state->id_num_rwqe + 4215 state->id_num_swqe + 1; 4216 } else { 4217 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4218 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 4219 state->id_num_rwqe) / (state->id_num_rwqe + 4220 state->id_num_swqe); 4221 state->id_num_swqe = cq_attr.cq_size - 1 - 4222 state->id_num_rwqe; 4223 } 4224 4225 state->id_rxwcs_size = cq_attr.cq_size; 4226 state->id_txwcs_size = state->id_rxwcs_size; 4227 4228 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4229 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4230 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " 4231 "failed, ret=%d\n", ret); 4232 return (DDI_FAILURE); 4233 } 4234 state->id_scq_hdl = state->id_rcq_hdl; 4235 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4236 state->id_rxwcs_size, KM_SLEEP); 4237 state->id_txwcs = state->id_rxwcs; 4238 } 4239 4240 /* 4241 * Print message in case we could not allocate as many wqe's 4242 * as was requested. 
4243 */ 4244 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4245 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4246 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4247 } 4248 if (state->id_num_swqe != IBD_NUM_SWQE) { 4249 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4250 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4251 } 4252 4253 return (DDI_SUCCESS); 4254 } 4255 4256 static int 4257 ibd_setup_ud_channel(ibd_state_t *state) 4258 { 4259 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4260 ibt_ud_chan_query_attr_t ud_chan_attr; 4261 ibt_status_t ret; 4262 4263 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 4264 if (state->id_hca_res_lkey_capab) 4265 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4266 if (state->id_lso_policy && state->id_lso_capable) 4267 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4268 4269 ud_alloc_attr.ud_hca_port_num = state->id_port; 4270 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4271 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4272 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4273 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4274 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4275 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4276 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4277 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4278 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4279 ud_alloc_attr.ud_clone_chan = NULL; 4280 4281 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4282 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4283 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4284 "failed, ret=%d\n", ret); 4285 return (DDI_FAILURE); 4286 } 4287 4288 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4289 &ud_chan_attr)) != IBT_SUCCESS) { 4290 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4291 "failed, ret=%d\n", ret); 4292 (void) ibt_free_channel(state->id_chnl_hdl); 4293 return (DDI_FAILURE); 4294 } 4295 4296 state->id_qpnum = ud_chan_attr.ud_qpn; 4297 
4298 return (DDI_SUCCESS); 4299 } 4300 4301 static int 4302 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4303 { 4304 uint32_t progress = state->id_mac_state; 4305 uint_t attempts; 4306 ibt_status_t ret; 4307 ib_gid_t mgid; 4308 ibd_mce_t *mce; 4309 uint8_t jstate; 4310 4311 /* 4312 * Before we try to stop/undo whatever we did in ibd_start(), 4313 * we need to mark the link state appropriately to prevent the 4314 * ip layer from using this instance for any new transfers. Note 4315 * that if the original state of the link was "up" when we're 4316 * here, we'll set the final link state to "unknown", to behave 4317 * in the same fashion as other ethernet drivers. 4318 */ 4319 mutex_enter(&state->id_link_mutex); 4320 if (cur_link_state == LINK_STATE_DOWN) { 4321 state->id_link_state = cur_link_state; 4322 } else { 4323 state->id_link_state = LINK_STATE_UNKNOWN; 4324 } 4325 mutex_exit(&state->id_link_mutex); 4326 mac_link_update(state->id_mh, state->id_link_state); 4327 4328 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4329 if (progress & IBD_DRV_STARTED) { 4330 state->id_mac_state &= (~IBD_DRV_STARTED); 4331 } 4332 4333 /* 4334 * First, stop receive interrupts; this stops the driver from 4335 * handing up buffers to higher layers. Wait for receive buffers 4336 * to be returned and give up after 5 seconds. 4337 */ 4338 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4339 4340 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4341 4342 attempts = 50; 4343 while (state->id_rx_list.dl_bufs_outstanding > 0) { 4344 delay(drv_usectohz(100000)); 4345 if (--attempts == 0) { 4346 /* 4347 * There are pending bufs with the network 4348 * layer and we have no choice but to wait 4349 * for them to be done with. Reap all the 4350 * Tx/Rx completions that were posted since 4351 * we turned off the notification and 4352 * return failure. 
4353 */ 4354 DPRINT(2, "ibd_undo_start: " 4355 "reclaiming failed"); 4356 ibd_poll_compq(state, state->id_rcq_hdl); 4357 ibt_set_cq_handler(state->id_rcq_hdl, 4358 ibd_rcq_handler, state); 4359 return (DDI_FAILURE); 4360 } 4361 } 4362 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4363 } 4364 4365 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4366 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4367 4368 mutex_enter(&state->id_trap_lock); 4369 state->id_trap_stop = B_TRUE; 4370 while (state->id_trap_inprog > 0) 4371 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4372 mutex_exit(&state->id_trap_lock); 4373 4374 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4375 } 4376 4377 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4378 /* 4379 * Flushing the channel ensures that all pending WQE's 4380 * are marked with flush_error and handed to the CQ. It 4381 * does not guarantee the invocation of the CQ handler. 4382 * This call is guaranteed to return successfully for 4383 * UD QPNs. 4384 */ 4385 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4386 IBT_SUCCESS) { 4387 DPRINT(10, "ibd_undo_start: flush_channel " 4388 "failed, ret=%d", ret); 4389 } 4390 4391 /* 4392 * Turn off Tx interrupts and poll. By the time the polling 4393 * returns an empty indicator, we are sure we have seen all 4394 * pending Tx callbacks. Note that after the call to 4395 * ibt_set_cq_handler() returns, the old handler is 4396 * guaranteed not to be invoked anymore. 4397 */ 4398 if (ibd_separate_cqs == 1) { 4399 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4400 } 4401 ibd_poll_compq(state, state->id_scq_hdl); 4402 4403 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4404 } 4405 4406 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4407 /* 4408 * No new async requests will be posted since the device 4409 * link state has been marked as unknown; completion handlers 4410 * have been turned off, so Tx handler will not cause any 4411 * more IBD_ASYNC_REAP requests. 
4412 * 4413 * Queue a request for the async thread to exit, which will 4414 * be serviced after any pending ones. This can take a while, 4415 * specially if the SM is unreachable, since IBMF will slowly 4416 * timeout each SM request issued by the async thread. Reap 4417 * the thread before continuing on, we do not want it to be 4418 * lingering in modunloaded code (or we could move the reap 4419 * to ibd_detach(), provided we keep track of the current 4420 * id_async_thrid somewhere safe). 4421 */ 4422 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4423 thread_join(state->id_async_thrid); 4424 4425 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4426 } 4427 4428 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4429 /* 4430 * Drop all residual full/non membership. This includes full 4431 * membership to the broadcast group, and any nonmembership 4432 * acquired during transmits. We do this after the Tx completion 4433 * handlers are done, since those might result in some late 4434 * leaves; this also eliminates a potential race with that 4435 * path wrt the mc full list insert/delete. Trap handling 4436 * has also been suppressed at this point. Thus, no locks 4437 * are required while traversing the mc full list. 
4438 */ 4439 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4440 mce = list_head(&state->id_mc_full); 4441 while (mce != NULL) { 4442 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4443 jstate = mce->mc_jstate; 4444 mce = list_next(&state->id_mc_full, mce); 4445 ibd_leave_group(state, mgid, jstate); 4446 } 4447 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4448 } 4449 4450 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4451 ibd_fini_rxlist(state); 4452 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4453 } 4454 4455 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4456 ibd_fini_txlist(state); 4457 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4458 } 4459 4460 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4461 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4462 IBT_SUCCESS) { 4463 DPRINT(10, "ibd_undo_start: free_channel " 4464 "failed, ret=%d", ret); 4465 } 4466 4467 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4468 } 4469 4470 if (progress & IBD_DRV_CQS_ALLOCD) { 4471 if (ibd_separate_cqs == 1) { 4472 kmem_free(state->id_txwcs, 4473 sizeof (ibt_wc_t) * state->id_txwcs_size); 4474 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4475 IBT_SUCCESS) { 4476 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4477 "failed, ret=%d", ret); 4478 } 4479 } 4480 4481 kmem_free(state->id_rxwcs, 4482 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4483 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4484 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4485 "ret=%d", ret); 4486 } 4487 4488 state->id_txwcs = NULL; 4489 state->id_rxwcs = NULL; 4490 state->id_scq_hdl = NULL; 4491 state->id_rcq_hdl = NULL; 4492 4493 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4494 } 4495 4496 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4497 mod_hash_destroy_hash(state->id_ah_active_hash); 4498 ibd_acache_fini(state); 4499 4500 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4501 } 4502 4503 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4504 /* 4505 * If we'd created the ipoib broadcast 
group and had 4506 * successfully joined it, leave it now 4507 */ 4508 if (state->id_bgroup_created) { 4509 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4510 jstate = IB_MC_JSTATE_FULL; 4511 (void) ibt_leave_mcg(state->id_sgid, mgid, 4512 state->id_sgid, jstate); 4513 } 4514 ibt_free_mcg_info(state->id_mcinfo, 1); 4515 4516 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4517 } 4518 4519 return (DDI_SUCCESS); 4520 } 4521 4522 /* 4523 * These pair of routines are used to set/clear the condition that 4524 * the caller is likely to do something to change the id_mac_state. 4525 * If there's already someone doing either a start or a stop (possibly 4526 * due to the async handler detecting a pkey relocation event, a plumb 4527 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4528 * that's done. 4529 */ 4530 static void 4531 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4532 { 4533 mutex_enter(&state->id_macst_lock); 4534 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4535 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4536 4537 state->id_mac_state |= flag; 4538 mutex_exit(&state->id_macst_lock); 4539 } 4540 4541 static void 4542 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4543 { 4544 mutex_enter(&state->id_macst_lock); 4545 state->id_mac_state &= (~flag); 4546 cv_signal(&state->id_macst_cv); 4547 mutex_exit(&state->id_macst_lock); 4548 } 4549 4550 /* 4551 * GLDv3 entry point to start hardware. 
4552 */ 4553 /*ARGSUSED*/ 4554 static int 4555 ibd_m_start(void *arg) 4556 { 4557 ibd_state_t *state = arg; 4558 int ret; 4559 4560 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4561 4562 ret = ibd_start(state); 4563 4564 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4565 4566 return (ret); 4567 } 4568 4569 static int 4570 ibd_start(ibd_state_t *state) 4571 { 4572 kthread_t *kht; 4573 int err; 4574 ibt_status_t ret; 4575 4576 if (state->id_mac_state & IBD_DRV_STARTED) 4577 return (DDI_SUCCESS); 4578 4579 /* 4580 * Get port details; if we fail here, very likely the port 4581 * state is inactive or the pkey can't be verified. 4582 */ 4583 if ((err = ibd_get_port_details(state)) != 0) { 4584 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4585 goto start_fail; 4586 } 4587 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4588 4589 /* 4590 * Find the IPoIB broadcast group 4591 */ 4592 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4593 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4594 err = ENOTACTIVE; 4595 goto start_fail; 4596 } 4597 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4598 4599 /* 4600 * Initialize per-interface caches and lists; if we fail here, 4601 * it is most likely due to a lack of resources 4602 */ 4603 if (ibd_acache_init(state) != DDI_SUCCESS) { 4604 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4605 err = ENOMEM; 4606 goto start_fail; 4607 } 4608 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4609 4610 /* 4611 * Allocate send and receive completion queues 4612 */ 4613 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4614 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4615 err = ENOMEM; 4616 goto start_fail; 4617 } 4618 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4619 4620 /* 4621 * Setup a UD channel 4622 */ 4623 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4624 err = ENOMEM; 4625 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4626 goto start_fail; 4627 } 4628 state->id_mac_state |= 
IBD_DRV_UD_CHANNEL_SETUP; 4629 4630 /* 4631 * Allocate and initialize the tx buffer list 4632 */ 4633 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4634 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4635 err = ENOMEM; 4636 goto start_fail; 4637 } 4638 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4639 4640 /* 4641 * If we have separate cqs, create the send cq handler here 4642 */ 4643 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 4644 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4645 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4646 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4647 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4648 "failed, ret=%d", ret); 4649 err = EINVAL; 4650 goto start_fail; 4651 } 4652 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4653 } 4654 4655 /* 4656 * Allocate and initialize the rx buffer list 4657 */ 4658 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4659 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4660 err = ENOMEM; 4661 goto start_fail; 4662 } 4663 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4664 4665 /* 4666 * Join IPoIB broadcast group 4667 */ 4668 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4669 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4670 err = ENOTACTIVE; 4671 goto start_fail; 4672 } 4673 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4674 4675 /* 4676 * Create the async thread; thread_create never fails. 4677 */ 4678 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4679 TS_RUN, minclsyspri); 4680 state->id_async_thrid = kht->t_did; 4681 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4682 4683 /* 4684 * When we did mac_register() in ibd_attach(), we didn't register 4685 * the real macaddr and we didn't have the true port mtu. Now that 4686 * we're almost ready, set the local mac address and broadcast 4687 * addresses and update gldv3 about the real values of these 4688 * parameters. 
4689 */ 4690 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4691 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4692 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4693 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4694 4695 mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4696 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4697 4698 /* 4699 * Setup the receive cq handler 4700 */ 4701 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4702 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4703 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4704 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4705 "failed, ret=%d", ret); 4706 err = EINVAL; 4707 goto start_fail; 4708 } 4709 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4710 4711 /* 4712 * Setup the subnet notices handler after we've initialized the acache/ 4713 * mcache and started the async thread, both of which are required for 4714 * the trap handler to function properly. 4715 * 4716 * Now that the async thread has been started (and we've already done 4717 * a mac_register() during attach so mac_tx_update() can be called 4718 * if necessary without any problem), we can enable the trap handler 4719 * to queue requests to the async thread. 4720 */ 4721 ibt_register_subnet_notices(state->id_ibt_hdl, 4722 ibd_snet_notices_handler, state); 4723 mutex_enter(&state->id_trap_lock); 4724 state->id_trap_stop = B_FALSE; 4725 mutex_exit(&state->id_trap_lock); 4726 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4727 4728 /* 4729 * Indicate link status to GLDv3 and higher layers. By default, 4730 * we assume we are in up state (which must have been true at 4731 * least at the time the broadcast mcg's were probed); if there 4732 * were any up/down transitions till the time we come here, the 4733 * async handler will have updated last known state, which we 4734 * use to tell GLDv3. 
The async handler will not send any 4735 * notifications to GLDv3 till we reach here in the initialization 4736 * sequence. 4737 */ 4738 state->id_mac_state |= IBD_DRV_STARTED; 4739 mac_link_update(state->id_mh, state->id_link_state); 4740 4741 return (DDI_SUCCESS); 4742 4743 start_fail: 4744 /* 4745 * If we ran into a problem during ibd_start() and ran into 4746 * some other problem during undoing our partial work, we can't 4747 * do anything about it. Ignore any errors we might get from 4748 * ibd_undo_start() and just return the original error we got. 4749 */ 4750 (void) ibd_undo_start(state, LINK_STATE_DOWN); 4751 return (err); 4752 } 4753 4754 /* 4755 * GLDv3 entry point to stop hardware from receiving packets. 4756 */ 4757 /*ARGSUSED*/ 4758 static void 4759 ibd_m_stop(void *arg) 4760 { 4761 ibd_state_t *state = (ibd_state_t *)arg; 4762 4763 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4764 4765 (void) ibd_undo_start(state, state->id_link_state); 4766 4767 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4768 } 4769 4770 /* 4771 * GLDv3 entry point to modify device's mac address. We do not 4772 * allow address modifications. 4773 */ 4774 static int 4775 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4776 { 4777 ibd_state_t *state = arg; 4778 4779 /* 4780 * Don't bother even comparing the macaddr if we haven't 4781 * completed ibd_m_start(). 4782 */ 4783 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4784 return (0); 4785 4786 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4787 return (0); 4788 else 4789 return (EINVAL); 4790 } 4791 4792 /* 4793 * The blocking part of the IBA join/leave operations are done out 4794 * of here on the async thread. 
4795 */ 4796 static void 4797 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4798 { 4799 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4800 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4801 4802 if (op == IBD_ASYNC_JOIN) { 4803 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 4804 ibd_print_warn(state, "Joint multicast group failed :" 4805 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4806 } 4807 } else { 4808 /* 4809 * Here, we must search for the proper mcg_info and 4810 * use that to leave the group. 4811 */ 4812 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4813 } 4814 } 4815 4816 /* 4817 * GLDv3 entry point for multicast enable/disable requests. 4818 * This function queues the operation to the async thread and 4819 * return success for a valid multicast address. 4820 */ 4821 static int 4822 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 4823 { 4824 ibd_state_t *state = (ibd_state_t *)arg; 4825 ipoib_mac_t maddr, *mcast; 4826 ib_gid_t mgid; 4827 ibd_req_t *req; 4828 4829 /* 4830 * If we haven't completed ibd_m_start(), async thread wouldn't 4831 * have been started and id_bcaddr wouldn't be set, so there's 4832 * no point in continuing. 4833 */ 4834 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4835 return (0); 4836 4837 /* 4838 * The incoming multicast address might not be aligned properly 4839 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4840 * it to look like one though, to get the offsets of the mc gid, 4841 * since we know we are not going to dereference any values with 4842 * the ipoib_mac_t pointer. 4843 */ 4844 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 4845 mcast = &maddr; 4846 4847 /* 4848 * Check validity of MCG address. We could additionally check 4849 * that a enable/disable is not being issued on the "broadcast" 4850 * mcg, but since this operation is only invokable by priviledged 4851 * programs anyway, we allow the flexibility to those dlpi apps. 
4852 * Note that we do not validate the "scope" of the IBA mcg. 4853 */ 4854 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 4855 return (EINVAL); 4856 4857 /* 4858 * fill in multicast pkey and scope 4859 */ 4860 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 4861 4862 /* 4863 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4864 * nothing (i.e. we stay JOINed to the broadcast group done in 4865 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 4866 * requires to be joined to broadcast groups at all times. 4867 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4868 * depends on this. 4869 */ 4870 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4871 return (0); 4872 4873 ibd_n2h_gid(mcast, &mgid); 4874 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4875 if (req == NULL) 4876 return (ENOMEM); 4877 4878 req->rq_gid = mgid; 4879 4880 if (add) { 4881 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4882 mgid.gid_prefix, mgid.gid_guid); 4883 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4884 } else { 4885 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4886 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4887 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4888 } 4889 return (0); 4890 } 4891 4892 /* 4893 * The blocking part of the IBA promiscuous operations are done 4894 * out of here on the async thread. The dlpireq parameter indicates 4895 * whether this invocation is due to a dlpi request or due to 4896 * a port up/down event. 
4897 */ 4898 static void 4899 ibd_async_unsetprom(ibd_state_t *state) 4900 { 4901 ibd_mce_t *mce = list_head(&state->id_mc_non); 4902 ib_gid_t mgid; 4903 4904 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4905 4906 while (mce != NULL) { 4907 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4908 mce = list_next(&state->id_mc_non, mce); 4909 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4910 } 4911 state->id_prom_op = IBD_OP_NOTSTARTED; 4912 } 4913 4914 /* 4915 * The blocking part of the IBA promiscuous operations are done 4916 * out of here on the async thread. The dlpireq parameter indicates 4917 * whether this invocation is due to a dlpi request or due to 4918 * a port up/down event. 4919 */ 4920 static void 4921 ibd_async_setprom(ibd_state_t *state) 4922 { 4923 ibt_mcg_attr_t mcg_attr; 4924 ibt_mcg_info_t *mcg_info; 4925 ib_gid_t mgid; 4926 uint_t numg; 4927 int i; 4928 char ret = IBD_OP_COMPLETED; 4929 4930 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4931 4932 /* 4933 * Obtain all active MC groups on the IB fabric with 4934 * specified criteria (scope + Pkey + Qkey + mtu). 4935 */ 4936 bzero(&mcg_attr, sizeof (mcg_attr)); 4937 mcg_attr.mc_pkey = state->id_pkey; 4938 mcg_attr.mc_scope = state->id_scope; 4939 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4940 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4941 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4942 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4943 IBT_SUCCESS) { 4944 ibd_print_warn(state, "Could not get list of IBA multicast " 4945 "groups"); 4946 ret = IBD_OP_ERRORED; 4947 goto done; 4948 } 4949 4950 /* 4951 * Iterate over the returned mcg's and join as NonMember 4952 * to the IP mcg's. 4953 */ 4954 for (i = 0; i < numg; i++) { 4955 /* 4956 * Do a NonMember JOIN on the MC group. 
4957 */ 4958 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4959 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4960 ibd_print_warn(state, "IBA promiscuous mode missed " 4961 "multicast gid %016llx:%016llx", 4962 (u_longlong_t)mgid.gid_prefix, 4963 (u_longlong_t)mgid.gid_guid); 4964 } 4965 4966 ibt_free_mcg_info(mcg_info, numg); 4967 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4968 done: 4969 state->id_prom_op = ret; 4970 } 4971 4972 /* 4973 * GLDv3 entry point for multicast promiscuous enable/disable requests. 4974 * GLDv3 assumes phys state receives more packets than multi state, 4975 * which is not true for IPoIB. Thus, treat the multi and phys 4976 * promiscuous states the same way to work with GLDv3's assumption. 4977 */ 4978 static int 4979 ibd_m_promisc(void *arg, boolean_t on) 4980 { 4981 ibd_state_t *state = (ibd_state_t *)arg; 4982 ibd_req_t *req; 4983 4984 /* 4985 * Async thread wouldn't have been started if we haven't 4986 * passed ibd_m_start() 4987 */ 4988 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4989 return (0); 4990 4991 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4992 if (req == NULL) 4993 return (ENOMEM); 4994 if (on) { 4995 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4996 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 4997 } else { 4998 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4999 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 5000 } 5001 5002 return (0); 5003 } 5004 5005 /* 5006 * GLDv3 entry point for gathering statistics. 
5007 */ 5008 static int 5009 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5010 { 5011 ibd_state_t *state = (ibd_state_t *)arg; 5012 5013 switch (stat) { 5014 case MAC_STAT_IFSPEED: 5015 *val = state->id_link_speed; 5016 break; 5017 case MAC_STAT_MULTIRCV: 5018 *val = state->id_multi_rcv; 5019 break; 5020 case MAC_STAT_BRDCSTRCV: 5021 *val = state->id_brd_rcv; 5022 break; 5023 case MAC_STAT_MULTIXMT: 5024 *val = state->id_multi_xmt; 5025 break; 5026 case MAC_STAT_BRDCSTXMT: 5027 *val = state->id_brd_xmt; 5028 break; 5029 case MAC_STAT_RBYTES: 5030 *val = state->id_rcv_bytes; 5031 break; 5032 case MAC_STAT_IPACKETS: 5033 *val = state->id_rcv_pkt; 5034 break; 5035 case MAC_STAT_OBYTES: 5036 *val = state->id_xmt_bytes; 5037 break; 5038 case MAC_STAT_OPACKETS: 5039 *val = state->id_xmt_pkt; 5040 break; 5041 case MAC_STAT_OERRORS: 5042 *val = state->id_ah_error; /* failed AH translation */ 5043 break; 5044 case MAC_STAT_IERRORS: 5045 *val = 0; 5046 break; 5047 case MAC_STAT_NOXMTBUF: 5048 *val = state->id_tx_short; 5049 break; 5050 case MAC_STAT_NORCVBUF: 5051 default: 5052 return (ENOTSUP); 5053 } 5054 5055 return (0); 5056 } 5057 5058 static void 5059 ibd_async_txsched(ibd_state_t *state) 5060 { 5061 ibd_req_t *req; 5062 int ret; 5063 5064 if (ibd_txcomp_poll) 5065 ibd_poll_compq(state, state->id_scq_hdl); 5066 5067 ret = ibd_resume_transmission(state); 5068 if (ret && ibd_txcomp_poll) { 5069 if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) 5070 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5071 else { 5072 ibd_print_warn(state, "ibd_async_txsched: " 5073 "no memory, can't schedule work slot"); 5074 } 5075 } 5076 } 5077 5078 static int 5079 ibd_resume_transmission(ibd_state_t *state) 5080 { 5081 int flag; 5082 int met_thresh = 0; 5083 int ret = -1; 5084 5085 mutex_enter(&state->id_sched_lock); 5086 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5087 met_thresh = (state->id_tx_list.dl_cnt > 5088 IBD_FREE_SWQES_THRESH); 5089 flag = IBD_RSRC_SWQE; 5090 } 
else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5091 ASSERT(state->id_lso != NULL); 5092 met_thresh = (state->id_lso->bkt_nfree > 5093 IBD_FREE_LSOS_THRESH); 5094 flag = IBD_RSRC_LSOBUF; 5095 } 5096 if (met_thresh) { 5097 state->id_sched_needed &= ~flag; 5098 ret = 0; 5099 } 5100 mutex_exit(&state->id_sched_lock); 5101 5102 if (ret == 0) 5103 mac_tx_update(state->id_mh); 5104 5105 return (ret); 5106 } 5107 5108 /* 5109 * Release the send wqe back into free list. 5110 */ 5111 static void 5112 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 5113 { 5114 /* 5115 * Add back on Tx list for reuse. 5116 */ 5117 swqe->swqe_next = NULL; 5118 mutex_enter(&state->id_tx_list.dl_mutex); 5119 if (state->id_tx_list.dl_pending_sends) { 5120 state->id_tx_list.dl_pending_sends = B_FALSE; 5121 } 5122 if (state->id_tx_list.dl_head == NULL) { 5123 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 5124 } else { 5125 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 5126 } 5127 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 5128 state->id_tx_list.dl_cnt++; 5129 mutex_exit(&state->id_tx_list.dl_mutex); 5130 } 5131 5132 /* 5133 * Acquire a send wqe from free list. 5134 * Returns error number and send wqe pointer. 5135 */ 5136 static int 5137 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) 5138 { 5139 int rc = 0; 5140 ibd_swqe_t *wqe; 5141 5142 /* 5143 * Check and reclaim some of the completed Tx requests. 5144 * If someone else is already in this code and pulling Tx 5145 * completions, no need to poll, since the current lock holder 5146 * will do the work anyway. Normally, we poll for completions 5147 * every few Tx attempts, but if we are short on Tx descriptors, 5148 * we always try to poll. 5149 */ 5150 if ((ibd_txcomp_poll == 1) && 5151 (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { 5152 ibd_poll_compq(state, state->id_scq_hdl); 5153 } 5154 5155 /* 5156 * Grab required transmit wqes. 
5157 */ 5158 mutex_enter(&state->id_tx_list.dl_mutex); 5159 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5160 if (wqe != NULL) { 5161 state->id_tx_list.dl_cnt -= 1; 5162 state->id_tx_list.dl_head = wqe->swqe_next; 5163 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 5164 state->id_tx_list.dl_tail = NULL; 5165 } else { 5166 /* 5167 * If we did not find the number we were looking for, flag 5168 * no resource. Adjust list appropriately in either case. 5169 */ 5170 rc = ENOENT; 5171 state->id_tx_list.dl_pending_sends = B_TRUE; 5172 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5173 atomic_add_64(&state->id_tx_short, 1); 5174 } 5175 mutex_exit(&state->id_tx_list.dl_mutex); 5176 *swqe = wqe; 5177 5178 return (rc); 5179 } 5180 5181 static int 5182 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5183 ibt_ud_dest_hdl_t ud_dest) 5184 { 5185 mblk_t *nmp; 5186 int iph_len, tcph_len; 5187 ibt_wr_lso_t *lso; 5188 uintptr_t ip_start, tcp_start; 5189 uint8_t *dst; 5190 uint_t pending, mblen; 5191 5192 /* 5193 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5194 * we need to adjust it here for lso. 5195 */ 5196 lso = &(node->w_swr.wr.ud_lso); 5197 lso->lso_ud_dest = ud_dest; 5198 lso->lso_mss = mss; 5199 5200 /* 5201 * Calculate the LSO header size and set it in the UD LSO structure. 5202 * Note that the only assumption we make is that each of the IPoIB, 5203 * IP and TCP headers will be contained in a single mblk fragment; 5204 * together, the headers may span multiple mblk fragments. 
5205 */ 5206 nmp = mp; 5207 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5208 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5209 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5210 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5211 nmp = nmp->b_cont; 5212 5213 } 5214 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5215 5216 tcp_start = ip_start + iph_len; 5217 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5218 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5219 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5220 nmp = nmp->b_cont; 5221 } 5222 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5223 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5224 5225 /* 5226 * If the lso header fits entirely within a single mblk fragment, 5227 * we'll avoid an additional copy of the lso header here and just 5228 * pass the b_rptr of the mblk directly. 5229 * 5230 * If this isn't true, we'd have to allocate for it explicitly. 5231 */ 5232 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5233 lso->lso_hdr = mp->b_rptr; 5234 } else { 5235 /* On work completion, remember to free this allocated hdr */ 5236 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5237 if (lso->lso_hdr == NULL) { 5238 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5239 "sz = %d", lso->lso_hdr_sz); 5240 lso->lso_hdr_sz = 0; 5241 lso->lso_mss = 0; 5242 return (-1); 5243 } 5244 } 5245 5246 /* 5247 * Copy in the lso header only if we need to 5248 */ 5249 if (lso->lso_hdr != mp->b_rptr) { 5250 dst = lso->lso_hdr; 5251 pending = lso->lso_hdr_sz; 5252 5253 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5254 mblen = MBLKL(nmp); 5255 if (pending > mblen) { 5256 bcopy(nmp->b_rptr, dst, mblen); 5257 dst += mblen; 5258 pending -= mblen; 5259 } else { 5260 bcopy(nmp->b_rptr, dst, pending); 5261 break; 5262 } 5263 } 5264 } 5265 5266 return (0); 5267 } 5268 5269 static void 5270 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5271 { 5272 ibt_wr_lso_t *lso; 5273 5274 if ((!node) || (!mp)) 5275 return; 5276 5277 /* 5278 * Free any 
header space that we might've allocated if we 5279 * did an LSO 5280 */ 5281 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5282 lso = &(node->w_swr.wr.ud_lso); 5283 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5284 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5285 lso->lso_hdr = NULL; 5286 lso->lso_hdr_sz = 0; 5287 } 5288 } 5289 } 5290 5291 static void 5292 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5293 { 5294 uint_t i; 5295 uint_t num_posted; 5296 uint_t n_wrs; 5297 ibt_status_t ibt_status; 5298 ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; 5299 ibd_swqe_t *elem; 5300 ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; 5301 5302 node->swqe_next = NULL; 5303 5304 mutex_enter(&state->id_txpost_lock); 5305 5306 /* 5307 * Enqueue the new node in chain of wqes to send 5308 */ 5309 if (state->id_tx_head) { 5310 *(state->id_tx_tailp) = (ibd_wqe_t *)node; 5311 } else { 5312 state->id_tx_head = node; 5313 } 5314 state->id_tx_tailp = &(node->swqe_next); 5315 5316 /* 5317 * If someone else is helping out with the sends, 5318 * just go back 5319 */ 5320 if (state->id_tx_busy) { 5321 mutex_exit(&state->id_txpost_lock); 5322 return; 5323 } 5324 5325 /* 5326 * Otherwise, mark the flag to indicate that we'll be 5327 * doing the dispatch of what's there in the wqe chain 5328 */ 5329 state->id_tx_busy = 1; 5330 5331 while (state->id_tx_head) { 5332 /* 5333 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs 5334 * at a time if possible, and keep posting them. 5335 */ 5336 for (n_wrs = 0, elem = state->id_tx_head; 5337 (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); 5338 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5339 5340 nodes[n_wrs] = elem; 5341 wrs[n_wrs] = elem->w_swr; 5342 } 5343 state->id_tx_head = elem; 5344 5345 /* 5346 * Release the txpost lock before posting the 5347 * send request to the hca; if the posting fails 5348 * for some reason, we'll never receive completion 5349 * intimation, so we'll need to cleanup. 
5350 */ 5351 mutex_exit(&state->id_txpost_lock); 5352 5353 ASSERT(n_wrs != 0); 5354 5355 /* 5356 * If posting fails for some reason, we'll never receive 5357 * completion intimation, so we'll need to cleanup. But 5358 * we need to make sure we don't clean up nodes whose 5359 * wrs have been successfully posted. We assume that the 5360 * hca driver returns on the first failure to post and 5361 * therefore the first 'num_posted' entries don't need 5362 * cleanup here. 5363 */ 5364 num_posted = 0; 5365 ibt_status = ibt_post_send(state->id_chnl_hdl, 5366 wrs, n_wrs, &num_posted); 5367 if (ibt_status != IBT_SUCCESS) { 5368 5369 ibd_print_warn(state, "ibd_post_send: " 5370 "posting multiple wrs failed: " 5371 "requested=%d, done=%d, ret=%d", 5372 n_wrs, num_posted, ibt_status); 5373 5374 for (i = num_posted; i < n_wrs; i++) 5375 ibd_tx_cleanup(state, nodes[i]); 5376 } 5377 5378 /* 5379 * Grab the mutex before we go and check the tx Q again 5380 */ 5381 mutex_enter(&state->id_txpost_lock); 5382 } 5383 5384 state->id_tx_busy = 0; 5385 mutex_exit(&state->id_txpost_lock); 5386 } 5387 5388 static int 5389 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5390 uint_t lsohdr_sz) 5391 { 5392 ibt_wr_ds_t *sgl; 5393 ibt_status_t ibt_status; 5394 mblk_t *nmp; 5395 mblk_t *data_mp; 5396 uchar_t *bufp; 5397 size_t blksize; 5398 size_t skip; 5399 size_t avail; 5400 uint_t pktsize; 5401 uint_t frag_len; 5402 uint_t pending_hdr; 5403 uint_t hiwm; 5404 int nmblks; 5405 int i; 5406 5407 /* 5408 * Let's skip ahead to the data if this is LSO 5409 */ 5410 data_mp = mp; 5411 pending_hdr = 0; 5412 if (lsohdr_sz) { 5413 pending_hdr = lsohdr_sz; 5414 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5415 frag_len = nmp->b_wptr - nmp->b_rptr; 5416 if (frag_len > pending_hdr) 5417 break; 5418 pending_hdr -= frag_len; 5419 } 5420 data_mp = nmp; /* start of data past lso header */ 5421 ASSERT(data_mp != NULL); 5422 } 5423 5424 /* 5425 * Calculate the size of message data and number of msg 
blocks 5426 */ 5427 pktsize = 0; 5428 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5429 nmp = nmp->b_cont, nmblks++) { 5430 pktsize += MBLKL(nmp); 5431 } 5432 pktsize -= pending_hdr; 5433 5434 /* 5435 * Translating the virtual address regions into physical regions 5436 * for using the Reserved LKey feature results in a wr sgl that 5437 * is a little longer. Since failing ibt_map_mem_iov() is costly, 5438 * we'll fix a high-water mark (65%) for when we should stop. 5439 */ 5440 hiwm = (state->id_max_sqseg * 65) / 100; 5441 5442 /* 5443 * We only do ibt_map_mem_iov() if the pktsize is above the 5444 * "copy-threshold", and if the number of mp fragments is less than 5445 * the maximum acceptable. 5446 */ 5447 if ((state->id_hca_res_lkey_capab) && 5448 (pktsize > IBD_TX_COPY_THRESH) && 5449 (nmblks < hiwm)) { 5450 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5451 ibt_iov_attr_t iov_attr; 5452 5453 iov_attr.iov_as = NULL; 5454 iov_attr.iov = iov_arr; 5455 iov_attr.iov_buf = NULL; 5456 iov_attr.iov_list_len = nmblks; 5457 iov_attr.iov_wr_nds = state->id_max_sqseg; 5458 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5459 iov_attr.iov_flags = IBT_IOV_SLEEP; 5460 5461 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5462 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5463 iov_arr[i].iov_len = MBLKL(nmp); 5464 if (i == 0) { 5465 iov_arr[i].iov_addr += pending_hdr; 5466 iov_arr[i].iov_len -= pending_hdr; 5467 } 5468 } 5469 5470 node->w_buftype = IBD_WQE_MAPPED; 5471 node->w_swr.wr_sgl = node->w_sgl; 5472 5473 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5474 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5475 if (ibt_status != IBT_SUCCESS) { 5476 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5477 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5478 goto ibd_copy_path; 5479 } 5480 5481 return (0); 5482 } 5483 5484 ibd_copy_path: 5485 if (pktsize <= state->id_tx_buf_sz) { 5486 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5487 node->w_swr.wr_nds = 1; 5488 
node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5489 node->w_buftype = IBD_WQE_TXBUF; 5490 5491 /* 5492 * Even though this is the copy path for transfers less than 5493 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5494 * is possible the first data mblk fragment (data_mp) still 5495 * contains part of the LSO header that we need to skip. 5496 */ 5497 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5498 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5499 blksize = MBLKL(nmp) - pending_hdr; 5500 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5501 bufp += blksize; 5502 pending_hdr = 0; 5503 } 5504 5505 return (0); 5506 } 5507 5508 /* 5509 * Copy path for transfers greater than id_tx_buf_sz 5510 */ 5511 node->w_swr.wr_sgl = node->w_sgl; 5512 if (ibd_acquire_lsobufs(state, pktsize, 5513 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5514 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5515 return (-1); 5516 } 5517 node->w_buftype = IBD_WQE_LSOBUF; 5518 5519 /* 5520 * Copy the larger-than-id_tx_buf_sz packet into a set of 5521 * fixed-sized, pre-mapped LSO buffers. Note that we might 5522 * need to skip part of the LSO header in the first fragment 5523 * as before. 5524 */ 5525 nmp = data_mp; 5526 skip = pending_hdr; 5527 for (i = 0; i < node->w_swr.wr_nds; i++) { 5528 sgl = node->w_swr.wr_sgl + i; 5529 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5530 avail = IBD_LSO_BUFSZ; 5531 while (nmp && avail) { 5532 blksize = MBLKL(nmp) - skip; 5533 if (blksize > avail) { 5534 bcopy(nmp->b_rptr + skip, bufp, avail); 5535 skip += avail; 5536 avail = 0; 5537 } else { 5538 bcopy(nmp->b_rptr + skip, bufp, blksize); 5539 skip = 0; 5540 avail -= blksize; 5541 bufp += blksize; 5542 nmp = nmp->b_cont; 5543 } 5544 } 5545 } 5546 5547 return (0); 5548 } 5549 5550 /* 5551 * Schedule a completion queue polling to reap the resource we're 5552 * short on. 
If we implement the change to reap tx completions 5553 * in a separate thread, we'll need to wake up that thread here. 5554 */ 5555 static int 5556 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5557 { 5558 ibd_req_t *req; 5559 5560 mutex_enter(&state->id_sched_lock); 5561 state->id_sched_needed |= resource_type; 5562 mutex_exit(&state->id_sched_lock); 5563 5564 /* 5565 * If we are asked to queue a work entry, we need to do it 5566 */ 5567 if (q_flag) { 5568 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5569 if (req == NULL) 5570 return (-1); 5571 5572 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5573 } 5574 5575 return (0); 5576 } 5577 5578 /* 5579 * The passed in packet has this format: 5580 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5581 */ 5582 static boolean_t 5583 ibd_send(ibd_state_t *state, mblk_t *mp) 5584 { 5585 ibd_ace_t *ace; 5586 ibd_swqe_t *node; 5587 ipoib_mac_t *dest; 5588 ib_header_info_t *ipibp; 5589 ip6_t *ip6h; 5590 uint_t pktsize; 5591 uint32_t mss; 5592 uint32_t hckflags; 5593 uint32_t lsoflags = 0; 5594 uint_t lsohdr_sz = 0; 5595 int ret, len; 5596 boolean_t dofree = B_FALSE; 5597 boolean_t rc; 5598 5599 /* 5600 * If we aren't done with the device initialization and start, 5601 * we shouldn't be here. 
5602 */ 5603 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5604 return (B_FALSE); 5605 5606 node = NULL; 5607 if (ibd_acquire_swqe(state, &node) != 0) { 5608 /* 5609 * If we don't have an swqe available, schedule a transmit 5610 * completion queue cleanup and hold off on sending more 5611 * more packets until we have some free swqes 5612 */ 5613 if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0) 5614 return (B_FALSE); 5615 5616 /* 5617 * If a poll cannot be scheduled, we have no choice but 5618 * to drop this packet 5619 */ 5620 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5621 return (B_TRUE); 5622 } 5623 5624 /* 5625 * Initialize the commonly used fields in swqe to NULL to protect 5626 * against ibd_tx_cleanup accidentally misinterpreting these on a 5627 * failure. 5628 */ 5629 node->swqe_im_mblk = NULL; 5630 node->w_swr.wr_nds = 0; 5631 node->w_swr.wr_sgl = NULL; 5632 node->w_swr.wr_opcode = IBT_WRC_SEND; 5633 5634 /* 5635 * Obtain an address handle for the destination. 5636 */ 5637 ipibp = (ib_header_info_t *)mp->b_rptr; 5638 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5639 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5640 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5641 5642 pktsize = msgsize(mp); 5643 5644 atomic_add_64(&state->id_xmt_bytes, pktsize); 5645 atomic_inc_64(&state->id_xmt_pkt); 5646 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5647 atomic_inc_64(&state->id_brd_xmt); 5648 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5649 atomic_inc_64(&state->id_multi_xmt); 5650 5651 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 5652 node->w_ahandle = ace; 5653 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5654 } else { 5655 DPRINT(5, 5656 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5657 ((ret == EFAULT) ? 
"failed" : "queued"), 5658 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5659 htonl(dest->ipoib_gidpref[1]), 5660 htonl(dest->ipoib_gidsuff[0]), 5661 htonl(dest->ipoib_gidsuff[1])); 5662 node->w_ahandle = NULL; 5663 5664 /* 5665 * for the poll mode, it is probably some cqe pending in the 5666 * cq. So ibd has to poll cq here, otherwise acache probably 5667 * may not be recycled. 5668 */ 5669 if (ibd_txcomp_poll == 1) 5670 ibd_poll_compq(state, state->id_scq_hdl); 5671 5672 /* 5673 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 5674 * can not find a path for the specific dest address. We 5675 * should get rid of this kind of packet. We also should get 5676 * rid of the packet if we cannot schedule a poll via the 5677 * async thread. For the normal case, ibd will return the 5678 * packet to upper layer and wait for AH creating. 5679 * 5680 * Note that we always queue a work slot entry for the async 5681 * thread when we fail AH lookup (even in intr mode); this is 5682 * due to the convoluted way the code currently looks for AH. 5683 */ 5684 if (ret == EFAULT) { 5685 dofree = B_TRUE; 5686 rc = B_TRUE; 5687 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5688 dofree = B_TRUE; 5689 rc = B_TRUE; 5690 } else { 5691 dofree = B_FALSE; 5692 rc = B_FALSE; 5693 } 5694 goto ibd_send_fail; 5695 } 5696 5697 /* 5698 * For ND6 packets, padding is at the front of the source lladdr. 5699 * Insert the padding at front. 
5700 */ 5701 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) { 5702 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 5703 if (!pullupmsg(mp, IPV6_HDR_LEN + 5704 sizeof (ib_header_info_t))) { 5705 DPRINT(10, "ibd_send: pullupmsg failure "); 5706 dofree = B_TRUE; 5707 rc = B_TRUE; 5708 goto ibd_send_fail; 5709 } 5710 ipibp = (ib_header_info_t *)mp->b_rptr; 5711 } 5712 ip6h = (ip6_t *)((uchar_t *)ipibp + 5713 sizeof (ib_header_info_t)); 5714 len = ntohs(ip6h->ip6_plen); 5715 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5716 mblk_t *pad; 5717 5718 pad = allocb(4, 0); 5719 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 5720 linkb(mp, pad); 5721 if (MBLKL(mp) < sizeof (ib_header_info_t) + 5722 IPV6_HDR_LEN + len + 4) { 5723 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 5724 IPV6_HDR_LEN + len + 4)) { 5725 DPRINT(10, "ibd_send: pullupmsg " 5726 "failure "); 5727 dofree = B_TRUE; 5728 rc = B_TRUE; 5729 goto ibd_send_fail; 5730 } 5731 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 5732 sizeof (ib_header_info_t)); 5733 } 5734 5735 /* LINTED: E_CONSTANT_CONDITION */ 5736 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 5737 } 5738 } 5739 5740 mp->b_rptr += sizeof (ib_addrs_t); 5741 5742 /* 5743 * Do LSO and checksum related work here. For LSO send, adjust the 5744 * ud destination, the opcode and the LSO header information to the 5745 * work request. 
5746 */ 5747 lso_info_get(mp, &mss, &lsoflags); 5748 if ((lsoflags & HW_LSO) != HW_LSO) { 5749 node->w_swr.wr_opcode = IBT_WRC_SEND; 5750 lsohdr_sz = 0; 5751 } else { 5752 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 5753 /* 5754 * The routine can only fail if there's no memory; we 5755 * can only drop the packet if this happens 5756 */ 5757 ibd_print_warn(state, 5758 "ibd_send: no memory, lso posting failed"); 5759 dofree = B_TRUE; 5760 rc = B_TRUE; 5761 goto ibd_send_fail; 5762 } 5763 5764 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 5765 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 5766 } 5767 5768 hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); 5769 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 5770 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 5771 else 5772 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 5773 5774 /* 5775 * Prepare the sgl for posting; the routine can only fail if there's 5776 * no lso buf available for posting. If this is the case, we should 5777 * probably resched for lso bufs to become available and then try again. 5778 */ 5779 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 5780 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 5781 dofree = B_TRUE; 5782 rc = B_TRUE; 5783 } else { 5784 dofree = B_FALSE; 5785 rc = B_FALSE; 5786 } 5787 goto ibd_send_fail; 5788 } 5789 node->swqe_im_mblk = mp; 5790 5791 /* 5792 * Queue the wqe to hardware; since we can now simply queue a 5793 * post instead of doing it serially, we cannot assume anything 5794 * about the 'node' after ibd_post_send() returns. 5795 */ 5796 ibd_post_send(state, node); 5797 5798 return (B_TRUE); 5799 5800 ibd_send_fail: 5801 if (node && mp) 5802 ibd_free_lsohdr(node, mp); 5803 5804 if (dofree) 5805 freemsg(mp); 5806 5807 if (node != NULL) 5808 ibd_tx_cleanup(state, node); 5809 5810 return (rc); 5811 } 5812 5813 /* 5814 * GLDv3 entry point for transmitting datagram. 
5815 */ 5816 static mblk_t * 5817 ibd_m_tx(void *arg, mblk_t *mp) 5818 { 5819 ibd_state_t *state = (ibd_state_t *)arg; 5820 mblk_t *next; 5821 5822 if (state->id_link_state != LINK_STATE_UP) { 5823 freemsgchain(mp); 5824 mp = NULL; 5825 } 5826 5827 while (mp != NULL) { 5828 next = mp->b_next; 5829 mp->b_next = NULL; 5830 if (ibd_send(state, mp) == B_FALSE) { 5831 /* Send fail */ 5832 mp->b_next = next; 5833 break; 5834 } 5835 mp = next; 5836 } 5837 5838 return (mp); 5839 } 5840 5841 /* 5842 * this handles Tx and Rx completions. With separate CQs, this handles 5843 * only Rx completions. 5844 */ 5845 static uint_t 5846 ibd_intr(char *arg) 5847 { 5848 ibd_state_t *state = (ibd_state_t *)arg; 5849 5850 ibd_poll_compq(state, state->id_rcq_hdl); 5851 5852 return (DDI_INTR_CLAIMED); 5853 } 5854 5855 /* 5856 * Poll and drain the cq 5857 */ 5858 static uint_t 5859 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs, 5860 uint_t numwcs) 5861 { 5862 ibd_wqe_t *wqe; 5863 ibt_wc_t *wc; 5864 uint_t total_polled = 0; 5865 uint_t num_polled; 5866 int i; 5867 5868 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 5869 total_polled += num_polled; 5870 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 5871 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5872 ASSERT((wqe->w_type == IBD_WQE_SEND) || 5873 (wqe->w_type == IBD_WQE_RECV)); 5874 if (wc->wc_status != IBT_WC_SUCCESS) { 5875 /* 5876 * Channel being torn down. 5877 */ 5878 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5879 DPRINT(5, "ibd_drain_cq: flush error"); 5880 /* 5881 * Only invoke the Tx handler to 5882 * release possibly held resources 5883 * like AH refcount etc. Can not 5884 * invoke Rx handler because it might 5885 * try adding buffers to the Rx pool 5886 * when we are trying to deinitialize. 
5887 */ 5888 if (wqe->w_type == IBD_WQE_RECV) { 5889 continue; 5890 } else { 5891 DPRINT(10, "ibd_drain_cq: Bad " 5892 "status %d", wc->wc_status); 5893 } 5894 } 5895 } 5896 if (wqe->w_type == IBD_WQE_SEND) { 5897 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); 5898 } else { 5899 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5900 } 5901 } 5902 } 5903 5904 return (total_polled); 5905 } 5906 5907 /* 5908 * Common code for interrupt handling as well as for polling 5909 * for all completed wqe's while detaching. 5910 */ 5911 static void 5912 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5913 { 5914 ibt_wc_t *wcs; 5915 uint_t numwcs; 5916 int flag, redo_flag; 5917 int redo = 1; 5918 uint_t num_polled = 0; 5919 5920 if (ibd_separate_cqs == 1) { 5921 if (cq_hdl == state->id_rcq_hdl) { 5922 flag = IBD_RX_CQ_POLLING; 5923 redo_flag = IBD_REDO_RX_CQ_POLLING; 5924 } else { 5925 flag = IBD_TX_CQ_POLLING; 5926 redo_flag = IBD_REDO_TX_CQ_POLLING; 5927 } 5928 } else { 5929 flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING; 5930 redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING; 5931 } 5932 5933 mutex_enter(&state->id_cq_poll_lock); 5934 if (state->id_cq_poll_busy & flag) { 5935 state->id_cq_poll_busy |= redo_flag; 5936 mutex_exit(&state->id_cq_poll_lock); 5937 return; 5938 } 5939 state->id_cq_poll_busy |= flag; 5940 mutex_exit(&state->id_cq_poll_lock); 5941 5942 /* 5943 * In some cases (eg detaching), this code can be invoked on 5944 * any cpu after disabling cq notification (thus no concurrency 5945 * exists). Apart from that, the following applies normally: 5946 * The receive completion handling is always on the Rx interrupt 5947 * cpu. Transmit completion handling could be from any cpu if 5948 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 5949 * is interrupt driven. Combined completion handling is always 5950 * on the interrupt cpu. Thus, lock accordingly and use the 5951 * proper completion array. 
5952 */ 5953 if (ibd_separate_cqs == 1) { 5954 if (cq_hdl == state->id_rcq_hdl) { 5955 wcs = state->id_rxwcs; 5956 numwcs = state->id_rxwcs_size; 5957 } else { 5958 wcs = state->id_txwcs; 5959 numwcs = state->id_txwcs_size; 5960 } 5961 } else { 5962 wcs = state->id_rxwcs; 5963 numwcs = state->id_rxwcs_size; 5964 } 5965 5966 /* 5967 * Poll and drain the CQ 5968 */ 5969 num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5970 5971 /* 5972 * Enable CQ notifications and redrain the cq to catch any 5973 * completions we might have missed after the ibd_drain_cq() 5974 * above and before the ibt_enable_cq_notify() that follows. 5975 * Finally, service any new requests to poll the cq that 5976 * could've come in after the ibt_enable_cq_notify(). 5977 */ 5978 do { 5979 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 5980 IBT_SUCCESS) { 5981 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 5982 } 5983 5984 num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5985 5986 mutex_enter(&state->id_cq_poll_lock); 5987 if (state->id_cq_poll_busy & redo_flag) 5988 state->id_cq_poll_busy &= ~redo_flag; 5989 else { 5990 state->id_cq_poll_busy &= ~flag; 5991 redo = 0; 5992 } 5993 mutex_exit(&state->id_cq_poll_lock); 5994 5995 } while (redo); 5996 5997 /* 5998 * If we polled the receive cq and found anything, we need to flush 5999 * it out to the nw layer here. 6000 */ 6001 if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) { 6002 ibd_flush_rx(state, NULL); 6003 } 6004 } 6005 6006 /* 6007 * Unmap the memory area associated with a given swqe. 
6008 */ 6009 static void 6010 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6011 { 6012 ibt_status_t stat; 6013 6014 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6015 6016 if (swqe->w_mi_hdl) { 6017 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6018 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6019 DPRINT(10, 6020 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6021 } 6022 swqe->w_mi_hdl = NULL; 6023 } 6024 swqe->w_swr.wr_nds = 0; 6025 } 6026 6027 /* 6028 * Common code that deals with clean ups after a successful or 6029 * erroneous transmission attempt. 6030 */ 6031 static void 6032 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6033 { 6034 ibd_ace_t *ace = swqe->w_ahandle; 6035 6036 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6037 6038 /* 6039 * If this was a dynamic mapping in ibd_send(), we need to 6040 * unmap here. If this was an lso buffer we'd used for sending, 6041 * we need to release the lso buf to the pool, since the resource 6042 * is scarce. However, if this was simply a normal send using 6043 * the copybuf (present in each swqe), we don't need to release it. 6044 */ 6045 if (swqe->swqe_im_mblk != NULL) { 6046 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6047 ibd_unmap_mem(state, swqe); 6048 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6049 ibd_release_lsobufs(state, 6050 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6051 } 6052 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6053 freemsg(swqe->swqe_im_mblk); 6054 swqe->swqe_im_mblk = NULL; 6055 } 6056 6057 /* 6058 * Drop the reference count on the AH; it can be reused 6059 * now for a different destination if there are no more 6060 * posted sends that will use it. This can be eliminated 6061 * if we can always associate each Tx buffer with an AH. 6062 * The ace can be null if we are cleaning up from the 6063 * ibd_send() error path. 
6064 */ 6065 if (ace != NULL) { 6066 /* 6067 * The recycling logic can be eliminated from here 6068 * and put into the async thread if we create another 6069 * list to hold ACE's for unjoined mcg's. 6070 */ 6071 if (DEC_REF_DO_CYCLE(ace)) { 6072 ibd_mce_t *mce; 6073 6074 /* 6075 * Check with the lock taken: we decremented 6076 * reference count without the lock, and some 6077 * transmitter might alreay have bumped the 6078 * reference count (possible in case of multicast 6079 * disable when we leave the AH on the active 6080 * list). If not still 0, get out, leaving the 6081 * recycle bit intact. 6082 * 6083 * Atomically transition the AH from active 6084 * to free list, and queue a work request to 6085 * leave the group and destroy the mce. No 6086 * transmitter can be looking at the AH or 6087 * the MCE in between, since we have the 6088 * ac_mutex lock. In the SendOnly reap case, 6089 * it is not neccesary to hold the ac_mutex 6090 * and recheck the ref count (since the AH was 6091 * taken off the active list), we just do it 6092 * to have uniform processing with the Full 6093 * reap case. 6094 */ 6095 mutex_enter(&state->id_ac_mutex); 6096 mce = ace->ac_mce; 6097 if (GET_REF_CYCLE(ace) == 0) { 6098 CLEAR_REFCYCLE(ace); 6099 /* 6100 * Identify the case of fullmember reap as 6101 * opposed to mcg trap reap. Also, port up 6102 * might set ac_mce to NULL to indicate Tx 6103 * cleanup should do no more than put the 6104 * AH in the free list (see ibd_async_link). 6105 */ 6106 if (mce != NULL) { 6107 ace->ac_mce = NULL; 6108 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6109 /* 6110 * mc_req was initialized at mce 6111 * creation time. 6112 */ 6113 ibd_queue_work_slot(state, 6114 &mce->mc_req, IBD_ASYNC_REAP); 6115 } 6116 IBD_ACACHE_INSERT_FREE(state, ace); 6117 } 6118 mutex_exit(&state->id_ac_mutex); 6119 } 6120 } 6121 6122 /* 6123 * Release the send wqe for reuse. 
6124 */ 6125 ibd_release_swqe(state, swqe); 6126 } 6127 6128 /* 6129 * Hand off the processed rx mp chain to mac_rx() 6130 */ 6131 static void 6132 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc) 6133 { 6134 if (mpc == NULL) { 6135 mutex_enter(&state->id_rx_lock); 6136 6137 mpc = state->id_rx_mp; 6138 6139 state->id_rx_mp = NULL; 6140 state->id_rx_mp_tail = NULL; 6141 state->id_rx_mp_len = 0; 6142 6143 mutex_exit(&state->id_rx_lock); 6144 } 6145 6146 if (mpc) { 6147 mac_rx(state->id_mh, state->id_rh, mpc); 6148 } 6149 } 6150 6151 /* 6152 * Processing to be done after receipt of a packet; hand off to GLD 6153 * in the format expected by GLD. The received packet has this 6154 * format: 2b sap :: 00 :: data. 6155 */ 6156 static void 6157 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6158 { 6159 ib_header_info_t *phdr; 6160 mblk_t *mp; 6161 mblk_t *mpc = NULL; 6162 ipoib_hdr_t *ipibp; 6163 ipha_t *iphap; 6164 ip6_t *ip6h; 6165 int rxcnt, len; 6166 6167 /* 6168 * Track number handed to upper layer, and number still 6169 * available to receive packets. 6170 */ 6171 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); 6172 ASSERT(rxcnt >= 0); 6173 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); 6174 6175 /* 6176 * Adjust write pointer depending on how much data came in. 6177 */ 6178 mp = rwqe->rwqe_im_mblk; 6179 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; 6180 6181 /* 6182 * Make sure this is NULL or we're in trouble. 6183 */ 6184 if (mp->b_next != NULL) { 6185 ibd_print_warn(state, 6186 "ibd_process_rx: got duplicate mp from rcq?"); 6187 mp->b_next = NULL; 6188 } 6189 6190 /* 6191 * the IB link will deliver one of the IB link layer 6192 * headers called, the Global Routing Header (GRH). 6193 * ibd driver uses the information in GRH to build the 6194 * Header_info structure and pass it with the datagram up 6195 * to GLDv3. 6196 * If the GRH is not valid, indicate to GLDv3 by setting 6197 * the VerTcFlow field to 0. 
6198 */ 6199 phdr = (ib_header_info_t *)mp->b_rptr; 6200 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 6201 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 6202 6203 /* if it is loop back packet, just drop it. */ 6204 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 6205 IPOIB_ADDRL) == 0) { 6206 freemsg(mp); 6207 return; 6208 } 6209 6210 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 6211 sizeof (ipoib_mac_t)); 6212 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 6213 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 6214 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 6215 } else { 6216 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 6217 } 6218 } else { 6219 /* 6220 * It can not be a IBA multicast packet. Must have been 6221 * unicast for us. Just copy the interface address to dst. 6222 */ 6223 phdr->ib_grh.ipoib_vertcflow = 0; 6224 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 6225 sizeof (ipoib_mac_t)); 6226 } 6227 6228 /* 6229 * For ND6 packets, padding is at the front of the source/target 6230 * lladdr. However the inet6 layer is not aware of it, hence remove 6231 * the padding from such packets. 
6232 */ 6233 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 6234 if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) { 6235 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { 6236 if (!pullupmsg(mp, IPV6_HDR_LEN + 6237 sizeof (ipoib_hdr_t))) { 6238 DPRINT(10, "ibd_process_rx: pullupmsg failed"); 6239 freemsg(mp); 6240 return; 6241 } 6242 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + 6243 sizeof (ipoib_pgrh_t)); 6244 } 6245 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6246 len = ntohs(ip6h->ip6_plen); 6247 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6248 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + 6249 IPV6_HDR_LEN + len) { 6250 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + 6251 IPV6_HDR_LEN + len)) { 6252 DPRINT(10, "ibd_process_rx: pullupmsg" 6253 " failed"); 6254 freemsg(mp); 6255 return; 6256 } 6257 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 6258 sizeof (ipoib_pgrh_t) + 6259 sizeof (ipoib_hdr_t)); 6260 } 6261 /* LINTED: E_CONSTANT_CONDITION */ 6262 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 6263 } 6264 } 6265 6266 /* 6267 * Update statistics 6268 */ 6269 atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer); 6270 atomic_inc_64(&state->id_rcv_pkt); 6271 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6272 atomic_inc_64(&state->id_brd_rcv); 6273 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6274 atomic_inc_64(&state->id_multi_rcv); 6275 6276 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6277 /* 6278 * Set receive checksum status in mp 6279 * Hardware checksumming can be considered valid only if: 6280 * 1. CQE.IP_OK bit is set 6281 * 2. CQE.CKSUM = 0xffff 6282 * 3. IPv6 routing header is not present in the packet 6283 * 4. 
If there are no IP_OPTIONS in the IP HEADER 6284 */ 6285 6286 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 6287 (wc->wc_cksum == 0xFFFF) && 6288 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 6289 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 6290 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 6291 } 6292 6293 /* 6294 * Add this mp to the list of processed mp's to send to 6295 * the nw layer 6296 */ 6297 mutex_enter(&state->id_rx_lock); 6298 if (state->id_rx_mp) { 6299 ASSERT(state->id_rx_mp_tail != NULL); 6300 state->id_rx_mp_tail->b_next = mp; 6301 } else { 6302 ASSERT(state->id_rx_mp_tail == NULL); 6303 state->id_rx_mp = mp; 6304 } 6305 6306 state->id_rx_mp_tail = mp; 6307 state->id_rx_mp_len++; 6308 6309 if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) { 6310 mpc = state->id_rx_mp; 6311 6312 state->id_rx_mp = NULL; 6313 state->id_rx_mp_tail = NULL; 6314 state->id_rx_mp_len = 0; 6315 } 6316 6317 mutex_exit(&state->id_rx_lock); 6318 6319 if (mpc) { 6320 ibd_flush_rx(state, mpc); 6321 } 6322 } 6323 6324 /* 6325 * Callback code invoked from STREAMs when the receive data buffer is 6326 * free for recycling. 6327 */ 6328 static void 6329 ibd_freemsg_cb(char *arg) 6330 { 6331 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6332 ibd_state_t *state = rwqe->w_state; 6333 6334 /* 6335 * If the wqe is being destructed, do not attempt recycling. 6336 */ 6337 if (rwqe->w_freeing_wqe == B_TRUE) { 6338 DPRINT(6, "ibd_freemsg: wqe being freed"); 6339 return; 6340 } else { 6341 /* 6342 * Upper layer has released held mblk, so we have 6343 * no more use for keeping the old pointer in 6344 * our rwqe. 
6345 */ 6346 rwqe->rwqe_im_mblk = NULL; 6347 } 6348 6349 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6350 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6351 if (rwqe->rwqe_im_mblk == NULL) { 6352 ibd_delete_rwqe(state, rwqe); 6353 ibd_free_rwqe(state, rwqe); 6354 DPRINT(6, "ibd_freemsg: desballoc failed"); 6355 return; 6356 } 6357 6358 if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) { 6359 ibd_delete_rwqe(state, rwqe); 6360 ibd_free_rwqe(state, rwqe); 6361 return; 6362 } 6363 6364 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 6365 } 6366 6367 static uint_t 6368 ibd_tx_recycle(char *arg) 6369 { 6370 ibd_state_t *state = (ibd_state_t *)arg; 6371 6372 /* 6373 * Poll for completed entries 6374 */ 6375 ibd_poll_compq(state, state->id_scq_hdl); 6376 6377 /* 6378 * Resume any blocked transmissions if possible 6379 */ 6380 (void) ibd_resume_transmission(state); 6381 6382 return (DDI_INTR_CLAIMED); 6383 } 6384 6385 #ifdef IBD_LOGGING 6386 static void 6387 ibd_log_init(void) 6388 { 6389 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 6390 ibd_lbuf_ndx = 0; 6391 6392 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 6393 } 6394 6395 static void 6396 ibd_log_fini(void) 6397 { 6398 if (ibd_lbuf) 6399 kmem_free(ibd_lbuf, IBD_LOG_SZ); 6400 ibd_lbuf_ndx = 0; 6401 ibd_lbuf = NULL; 6402 6403 mutex_destroy(&ibd_lbuf_lock); 6404 } 6405 6406 static void 6407 ibd_log(const char *fmt, ...) 
6408 { 6409 va_list ap; 6410 uint32_t off; 6411 uint32_t msglen; 6412 char tmpbuf[IBD_DMAX_LINE]; 6413 6414 if (ibd_lbuf == NULL) 6415 return; 6416 6417 va_start(ap, fmt); 6418 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 6419 va_end(ap); 6420 6421 if (msglen >= IBD_DMAX_LINE) 6422 msglen = IBD_DMAX_LINE - 1; 6423 6424 mutex_enter(&ibd_lbuf_lock); 6425 6426 off = ibd_lbuf_ndx; /* current msg should go here */ 6427 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 6428 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 6429 6430 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 6431 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 6432 6433 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 6434 ibd_lbuf_ndx = 0; 6435 6436 mutex_exit(&ibd_lbuf_lock); 6437 6438 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 6439 } 6440 #endif 6441