1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip6.h> /* for ip6_t */ 54 #include <inet/tcp.h> /* for tcph_t */ 55 #include <netinet/icmp6.h> /* for icmp6_t */ 56 #include <sys/callb.h> 57 #include <sys/modhash.h> 58 59 #include <sys/ib/clients/ibd/ibd.h> 60 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 61 #include <sys/note.h> 62 #include <sys/multidata.h> 63 64 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 65 66 /* 67 * Per-interface tunables 68 * 69 * ibd_tx_copy_thresh 70 * This sets the threshold at which ibd will attempt to do a bcopy of the 71 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 72 * is restricted by various parameters, so setting of this value must be 73 * made after careful considerations only. For instance, IB HCAs currently 74 * impose a relatively small limit (when compared to ethernet NICs) on the 75 * length of the SGL for transmit. On the other hand, the ip stack could 76 * send down mp chains that are quite long when LSO is enabled. 77 * 78 * ibd_num_swqe 79 * Number of "send WQE" elements that will be allocated and used by ibd. 80 * When tuning this parameter, the size of pre-allocated, pre-mapped copy 81 * buffer in each of these send wqes must be taken into account. This 82 * copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is 83 * currently set to the same value of ibd_tx_copy_thresh, but may be 84 * changed independently if needed). 85 * 86 * ibd_num_rwqe 87 * Number of "receive WQE" elements that will be allocated and used by 88 * ibd. This parameter is limited by the maximum channel size of the HCA. 89 * Each buffer in the receive wqe will be of MTU size. 
90 * 91 * ibd_num_lso_bufs 92 * Number of "larger-than-MTU" copy buffers to use for cases when the 93 * outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov() 94 * and too large to be used with regular MTU-sized copy buffers. It is 95 * not recommended to tune this variable without understanding the 96 * application environment and/or memory resources. The size of each of 97 * these lso buffers is determined by the value of IBD_LSO_BUFSZ. 98 * 99 * ibd_num_ah 100 * Number of AH cache entries to allocate 101 * 102 * ibd_hash_size 103 * Hash table size for the active AH list 104 * 105 * ibd_separate_cqs 106 * ibd_txcomp_poll 107 * These boolean variables (1 or 0) may be used to tune the behavior of 108 * ibd in managing the send and receive completion queues and in deciding 109 * whether or not transmit completions should be polled or interrupt 110 * driven (when the completion queues are separate). If both the completion 111 * queues are interrupt driven, it may not be possible for the handlers to 112 * be invoked concurrently, depending on how the interrupts are tied on 113 * the PCI intr line. Note that some combination of these two parameters 114 * may not be meaningful (and therefore not allowed). 115 * 116 * ibd_tx_softintr 117 * ibd_rx_softintr 118 * The softintr mechanism allows ibd to avoid event queue overflows if 119 * the receive/completion handlers are to be expensive. These are enabled 120 * by default. 121 * 122 * ibd_log_sz 123 * This specifies the size of the ibd log buffer in bytes. The buffer is 124 * allocated and logging is enabled only when IBD_LOGGING is defined. 125 * 126 */ 127 uint_t ibd_tx_copy_thresh = 0x1000; 128 uint_t ibd_num_swqe = 4000; 129 uint_t ibd_num_rwqe = 4000; 130 uint_t ibd_num_lso_bufs = 0x400; 131 uint_t ibd_num_ah = 64; 132 uint_t ibd_hash_size = 32; 133 uint_t ibd_separate_cqs = 1; 134 uint_t ibd_txcomp_poll = 0; 135 uint_t ibd_rx_softintr = 1; 136 uint_t ibd_tx_softintr = 1; 137 uint_t ibd_create_broadcast_group = 1; 138 #ifdef IBD_LOGGING 139 uint_t ibd_log_sz = 0x20000; 140 #endif 141 142 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 143 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 144 #define IBD_NUM_SWQE ibd_num_swqe 145 #define IBD_NUM_RWQE ibd_num_rwqe 146 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 147 #define IBD_NUM_AH ibd_num_ah 148 #define IBD_HASH_SIZE ibd_hash_size 149 #ifdef IBD_LOGGING 150 #define IBD_LOG_SZ ibd_log_sz 151 #endif 152 153 /* 154 * Receive CQ moderation parameters: NOT tunables 155 */ 156 static uint_t ibd_rxcomp_count = 4; 157 static uint_t ibd_rxcomp_usec = 10; 158 159 /* 160 * Send CQ moderation parameters: NOT tunables 161 */ 162 #define IBD_TXCOMP_COUNT 10 163 #define IBD_TXCOMP_USEC 300 164 165 /* 166 * Thresholds 167 * 168 * When waiting for resources (swqes or lso buffers) to become available, 169 * the first two thresholds below determine how long to wait before informing 170 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 171 * determines how low the available swqes should go before we start polling 172 * the completion queue. 173 */ 174 #define IBD_FREE_LSOS_THRESH 8 175 #define IBD_FREE_SWQES_THRESH 20 176 #define IBD_TX_POLL_THRESH 80 177 178 /* 179 * When doing multiple-send-wr or multiple-recv-wr posts, this value 180 * determines how many to do at a time (in a single ibt_post_send/recv). 
181 */ 182 #define IBD_MAX_POST_MULTIPLE 4 183 184 /* 185 * Maximum length for returning chained mps back to crossbow 186 */ 187 #define IBD_MAX_RX_MP_LEN 16 188 189 /* 190 * LSO parameters 191 */ 192 #define IBD_LSO_MAXLEN 65536 193 #define IBD_LSO_BUFSZ 8192 194 #define IBD_PROP_LSO_POLICY "lso-policy" 195 196 /* 197 * Completion queue polling control 198 */ 199 #define IBD_RX_CQ_POLLING 0x1 200 #define IBD_TX_CQ_POLLING 0x2 201 #define IBD_REDO_RX_CQ_POLLING 0x4 202 #define IBD_REDO_TX_CQ_POLLING 0x8 203 204 /* 205 * Flag bits for resources to reap 206 */ 207 #define IBD_RSRC_SWQE 0x1 208 #define IBD_RSRC_LSOBUF 0x2 209 210 /* 211 * Async operation types 212 */ 213 #define IBD_ASYNC_GETAH 1 214 #define IBD_ASYNC_JOIN 2 215 #define IBD_ASYNC_LEAVE 3 216 #define IBD_ASYNC_PROMON 4 217 #define IBD_ASYNC_PROMOFF 5 218 #define IBD_ASYNC_REAP 6 219 #define IBD_ASYNC_TRAP 7 220 #define IBD_ASYNC_SCHED 8 221 #define IBD_ASYNC_LINK 9 222 #define IBD_ASYNC_EXIT 10 223 224 /* 225 * Async operation states 226 */ 227 #define IBD_OP_NOTSTARTED 0 228 #define IBD_OP_ONGOING 1 229 #define IBD_OP_COMPLETED 2 230 #define IBD_OP_ERRORED 3 231 #define IBD_OP_ROUTERED 4 232 233 /* 234 * State of IBD driver initialization during attach/m_start 235 */ 236 #define IBD_DRV_STATE_INITIALIZED 0x00001 237 #define IBD_DRV_RXINTR_ADDED 0x00002 238 #define IBD_DRV_TXINTR_ADDED 0x00004 239 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 240 #define IBD_DRV_HCA_OPENED 0x00010 241 #define IBD_DRV_PD_ALLOCD 0x00020 242 #define IBD_DRV_MAC_REGISTERED 0x00040 243 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 244 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 245 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 246 #define IBD_DRV_CQS_ALLOCD 0x00400 247 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 248 #define IBD_DRV_TXLIST_ALLOCD 0x01000 249 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 250 #define IBD_DRV_RXLIST_ALLOCD 0x04000 251 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 252 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 253 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 254 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 255 #define IBD_DRV_STARTED 0x80000 256 257 /* 258 * Start/stop in-progress flags; note that restart must always remain 259 * the OR of start and stop flag values. 
260 */ 261 #define IBD_DRV_START_IN_PROGRESS 0x10000000 262 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 263 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 264 265 /* 266 * Miscellaneous constants 267 */ 268 #define IBD_SEND 0 269 #define IBD_RECV 1 270 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 271 #define IBD_DEF_MAX_SDU 2044 272 #define IBD_DEFAULT_QKEY 0xB1B 273 #ifdef IBD_LOGGING 274 #define IBD_DMAX_LINE 100 275 #endif 276 277 /* 278 * Enumerations for link states 279 */ 280 typedef enum { 281 IBD_LINK_DOWN, 282 IBD_LINK_UP, 283 IBD_LINK_UP_ABSENT 284 } ibd_link_op_t; 285 286 /* 287 * Driver State Pointer 288 */ 289 void *ibd_list; 290 291 /* 292 * Logging 293 */ 294 #ifdef IBD_LOGGING 295 kmutex_t ibd_lbuf_lock; 296 uint8_t *ibd_lbuf; 297 uint32_t ibd_lbuf_ndx; 298 #endif 299 300 /* 301 * Required system entry points 302 */ 303 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 304 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 305 306 /* 307 * Required driver entry points for GLDv3 308 */ 309 static int ibd_m_stat(void *, uint_t, uint64_t *); 310 static int ibd_m_start(void *); 311 static void ibd_m_stop(void *); 312 static int ibd_m_promisc(void *, boolean_t); 313 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 314 static int ibd_m_unicst(void *, const uint8_t *); 315 static mblk_t *ibd_m_tx(void *, mblk_t *); 316 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 317 318 /* 319 * Private driver entry points for GLDv3 320 */ 321 322 /* 323 * Initialization 324 */ 325 static int ibd_state_init(ibd_state_t *, dev_info_t *); 326 static int ibd_init_txlist(ibd_state_t *); 327 static int ibd_init_rxlist(ibd_state_t *); 328 static int ibd_acache_init(ibd_state_t *); 329 #ifdef IBD_LOGGING 330 static void ibd_log_init(void); 331 #endif 332 333 /* 334 * Termination/cleanup 335 */ 336 static void ibd_state_fini(ibd_state_t *); 337 static void ibd_fini_txlist(ibd_state_t *); 338 static void ibd_fini_rxlist(ibd_state_t *); 339 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 340 static void ibd_acache_fini(ibd_state_t *); 341 #ifdef IBD_LOGGING 342 static void ibd_log_fini(void); 343 #endif 344 345 /* 346 * Allocation/acquire/map routines 347 */ 348 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); 349 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 350 static int ibd_alloc_tx_copybufs(ibd_state_t *); 351 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 352 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); 353 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 354 uint32_t *); 355 356 /* 357 * Free/release/unmap routines 358 */ 359 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 360 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 361 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); 362 static void ibd_free_tx_copybufs(ibd_state_t *); 363 static void ibd_free_tx_lsobufs(ibd_state_t *); 364 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); 365 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 366 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 367 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 368 369 /* 370 * Handlers/callback routines 371 */ 372 static uint_t ibd_intr(char *); 373 static uint_t ibd_tx_recycle(char *); 374 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 375 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 376 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 377 static uint_t 
ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); 378 static void ibd_freemsg_cb(char *); 379 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 380 ibt_async_event_t *); 381 static void ibd_snet_notices_handler(void *, ib_gid_t, 382 ibt_subnet_event_code_t, ibt_subnet_event_t *); 383 384 /* 385 * Send/receive routines 386 */ 387 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 388 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 389 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); 390 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 391 static void ibd_flush_rx(ibd_state_t *, mblk_t *); 392 393 /* 394 * Threads 395 */ 396 static void ibd_async_work(ibd_state_t *); 397 398 /* 399 * Async tasks 400 */ 401 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 402 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 403 static void ibd_async_setprom(ibd_state_t *); 404 static void ibd_async_unsetprom(ibd_state_t *); 405 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 406 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 407 static void ibd_async_txsched(ibd_state_t *); 408 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 409 410 /* 411 * Async task helpers 412 */ 413 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 414 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 415 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 416 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 417 ipoib_mac_t *, ipoib_mac_t *); 418 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 419 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 420 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 421 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 422 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 423 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 424 static uint64_t ibd_get_portspeed(ibd_state_t *); 425 static boolean_t ibd_async_safe(ibd_state_t *); 426 static void ibd_async_done(ibd_state_t *); 427 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 428 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 429 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 430 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 431 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 432 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 433 434 /* 435 * Helpers for attach/start routines 436 */ 437 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 438 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 439 static int ibd_unattach(ibd_state_t *, dev_info_t *); 440 static int ibd_get_port_details(ibd_state_t *); 441 static int ibd_alloc_cqs(ibd_state_t *); 442 static int ibd_setup_ud_channel(ibd_state_t *); 443 static int ibd_start(ibd_state_t *); 444 static int ibd_undo_start(ibd_state_t *, link_state_t); 445 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 446 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 447 448 449 /* 450 * Miscellaneous helpers 451 */ 452 static int ibd_sched_poll(ibd_state_t *, int, int); 453 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 454 static int ibd_resume_transmission(ibd_state_t *); 455 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, 
ibt_ud_dest_hdl_t); 456 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 457 static void *list_get_head(list_t *); 458 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 459 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 460 static void ibd_print_warn(ibd_state_t *, char *, ...); 461 #ifdef IBD_LOGGING 462 static void ibd_log(const char *, ...); 463 #endif 464 465 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 466 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 467 468 /* Module Driver Info */ 469 static struct modldrv ibd_modldrv = { 470 &mod_driverops, /* This one is a driver */ 471 "InfiniBand GLDv3 Driver", /* short description */ 472 &ibd_dev_ops /* driver specific ops */ 473 }; 474 475 /* Module Linkage */ 476 static struct modlinkage ibd_modlinkage = { 477 MODREV_1, (void *)&ibd_modldrv, NULL 478 }; 479 480 /* 481 * Module (static) info passed to IBTL during ibt_attach 482 */ 483 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 484 IBTI_V_CURR, 485 IBT_NETWORK, 486 ibd_async_handler, 487 NULL, 488 "IPIB" 489 }; 490 491 /* 492 * GLDv3 entry points 493 */ 494 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 495 static mac_callbacks_t ibd_m_callbacks = { 496 IBD_M_CALLBACK_FLAGS, 497 ibd_m_stat, 498 ibd_m_start, 499 ibd_m_stop, 500 ibd_m_promisc, 501 ibd_m_multicst, 502 ibd_m_unicst, 503 ibd_m_tx, 504 NULL, 505 ibd_m_getcapab 506 }; 507 508 /* 509 * Fill/clear <scope> and <p_key> in multicast/broadcast address 510 */ 511 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 512 { \ 513 *(uint32_t *)((char *)(maddr) + 4) |= \ 514 htonl((uint32_t)(scope) << 16); \ 515 *(uint32_t *)((char *)(maddr) + 8) |= \ 516 htonl((uint32_t)(pkey) << 16); \ 517 } 518 519 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 520 { \ 521 *(uint32_t *)((char *)(maddr) + 4) &= \ 522 htonl(~((uint32_t)0xF << 16)); \ 523 *(uint32_t *)((char *)(maddr) + 8) &= \ 524 htonl(~((uint32_t)0xFFFF << 16)); \ 525 } 526 527 /* 528 * Rudimentary debugging support 529 */ 530 #ifdef DEBUG 531 int ibd_debuglevel = 100; 532 static void 533 debug_print(int l, char *fmt, ...) 534 { 535 va_list ap; 536 537 if (l < ibd_debuglevel) 538 return; 539 va_start(ap, fmt); 540 vcmn_err(CE_CONT, fmt, ap); 541 va_end(ap); 542 } 543 #define DPRINT debug_print 544 #else 545 #define DPRINT 546 #endif 547 548 /* 549 * Common routine to print warning messages; adds in hca guid, port number 550 * and pkey to be able to identify the IBA interface. 551 */ 552 static void 553 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 554 { 555 ib_guid_t hca_guid; 556 char ibd_print_buf[256]; 557 int len; 558 va_list ap; 559 560 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 561 0, "hca-guid", 0); 562 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 563 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 564 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 565 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 566 va_start(ap, fmt); 567 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 568 fmt, ap); 569 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 570 va_end(ap); 571 } 572 573 /* 574 * Warlock directives 575 */ 576 577 /* 578 * id_lso_lock 579 * 580 * state->id_lso->bkt_nfree may be accessed without a lock to 581 * determine the threshold at which we have to ask the nw layer 582 * to resume transmission (see ibd_resume_transmission()). 
583 */ 584 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 585 ibd_state_t::id_lso)) 586 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 587 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 588 589 /* 590 * id_cq_poll_lock 591 */ 592 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, 593 ibd_state_t::id_cq_poll_busy)) 594 595 /* 596 * id_txpost_lock 597 */ 598 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 599 ibd_state_t::id_tx_head)) 600 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 601 ibd_state_t::id_tx_busy)) 602 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 603 ibd_state_t::id_tx_tailp)) 604 605 /* 606 * id_rxpost_lock 607 */ 608 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 609 ibd_state_t::id_rx_head)) 610 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 611 ibd_state_t::id_rx_busy)) 612 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 613 ibd_state_t::id_rx_tailp)) 614 615 /* 616 * id_acache_req_lock 617 */ 618 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 619 ibd_state_t::id_acache_req_cv)) 620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 621 ibd_state_t::id_req_list)) 622 623 /* 624 * id_ac_mutex 625 * 626 * This mutex is actually supposed to protect id_ah_op as well, 627 * but this path of the code isn't clean (see update of id_ah_op 628 * in ibd_async_acache(), immediately after the call to 629 * ibd_async_mcache()). For now, we'll skip this check by 630 * declaring that id_ah_op is protected by some internal scheme 631 * that warlock isn't aware of. 632 */ 633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 634 ibd_state_t::id_ah_active)) 635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 636 ibd_state_t::id_ah_free)) 637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 638 ibd_state_t::id_ah_addr)) 639 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 640 ibd_state_t::id_ah_op)) 641 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 642 ibd_state_t::id_ah_error)) 643 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 644 645 /* 646 * id_mc_mutex 647 */ 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 649 ibd_state_t::id_mc_full)) 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 651 ibd_state_t::id_mc_non)) 652 653 /* 654 * id_trap_lock 655 */ 656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 657 ibd_state_t::id_trap_cv)) 658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 659 ibd_state_t::id_trap_stop)) 660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 661 ibd_state_t::id_trap_inprog)) 662 663 /* 664 * id_prom_op 665 */ 666 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 667 ibd_state_t::id_prom_op)) 668 669 /* 670 * id_sched_lock 671 */ 672 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 673 ibd_state_t::id_sched_needed)) 674 675 /* 676 * id_link_mutex 677 */ 678 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 679 ibd_state_t::id_link_state)) 680 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 681 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 682 ibd_state_t::id_link_speed)) 683 684 /* 685 * id_tx_list.dl_mutex 686 */ 687 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 688 ibd_state_t::id_tx_list.dl_head)) 689 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 690 ibd_state_t::id_tx_list.dl_tail)) 691 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 692 ibd_state_t::id_tx_list.dl_pending_sends)) 693 _NOTE(SCHEME_PROTECTS_DATA("atomic 
or dl mutex or single thr", 694 ibd_state_t::id_tx_list.dl_cnt)) 695 696 /* 697 * id_rx_list.dl_mutex 698 */ 699 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 700 ibd_state_t::id_rx_list.dl_head)) 701 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 702 ibd_state_t::id_rx_list.dl_tail)) 703 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 704 ibd_state_t::id_rx_list.dl_bufs_outstanding)) 705 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 706 ibd_state_t::id_rx_list.dl_cnt)) 707 708 709 /* 710 * Items protected by atomic updates 711 */ 712 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 713 ibd_state_s::id_brd_rcv 714 ibd_state_s::id_brd_xmt 715 ibd_state_s::id_multi_rcv 716 ibd_state_s::id_multi_xmt 717 ibd_state_s::id_num_intrs 718 ibd_state_s::id_rcv_bytes 719 ibd_state_s::id_rcv_pkt 720 ibd_state_s::id_tx_short 721 ibd_state_s::id_xmt_bytes 722 ibd_state_s::id_xmt_pkt)) 723 724 /* 725 * Non-mutex protection schemes for data elements. Almost all of 726 * these are non-shared items. 727 */ 728 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 729 callb_cpr 730 ib_gid_s 731 ib_header_info 732 ibd_acache_rq 733 ibd_acache_s::ac_mce 734 ibd_mcache::mc_fullreap 735 ibd_mcache::mc_jstate 736 ibd_mcache::mc_req 737 ibd_rwqe_s 738 ibd_swqe_s 739 ibd_wqe_s 740 ibt_wr_ds_s::ds_va 741 ibt_wr_lso_s 742 ipoib_mac::ipoib_qpn 743 mac_capab_lso_s 744 msgb::b_next 745 msgb::b_rptr 746 msgb::b_wptr)) 747 748 int 749 _init() 750 { 751 int status; 752 753 /* 754 * Sanity check some parameter settings. Tx completion polling 755 * only makes sense with separate CQs for Tx and Rx. 756 */ 757 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 758 cmn_err(CE_NOTE, "!ibd: %s", 759 "Setting ibd_txcomp_poll = 0 for combined CQ"); 760 ibd_txcomp_poll = 0; 761 } 762 763 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 764 if (status != 0) { 765 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 766 return (status); 767 } 768 769 mac_init_ops(&ibd_dev_ops, "ibd"); 770 status = mod_install(&ibd_modlinkage); 771 if (status != 0) { 772 DPRINT(10, "_init:failed in mod_install()"); 773 ddi_soft_state_fini(&ibd_list); 774 mac_fini_ops(&ibd_dev_ops); 775 return (status); 776 } 777 778 #ifdef IBD_LOGGING 779 ibd_log_init(); 780 #endif 781 return (0); 782 } 783 784 int 785 _info(struct modinfo *modinfop) 786 { 787 return (mod_info(&ibd_modlinkage, modinfop)); 788 } 789 790 int 791 _fini() 792 { 793 int status; 794 795 status = mod_remove(&ibd_modlinkage); 796 if (status != 0) 797 return (status); 798 799 mac_fini_ops(&ibd_dev_ops); 800 ddi_soft_state_fini(&ibd_list); 801 #ifdef IBD_LOGGING 802 ibd_log_fini(); 803 #endif 804 return (0); 805 } 806 807 /* 808 * Convert the GID part of the mac address from network byte order 809 * to host order. 810 */ 811 static void 812 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 813 { 814 ib_sn_prefix_t nbopref; 815 ib_guid_t nboguid; 816 817 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 818 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 819 dgid->gid_prefix = b2h64(nbopref); 820 dgid->gid_guid = b2h64(nboguid); 821 } 822 823 /* 824 * Create the IPoIB address in network byte order from host order inputs. 
 */
static void
ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
    ib_guid_t guid)
{
	ib_sn_prefix_t nbopref;
	ib_guid_t nboguid;

	mac->ipoib_qpn = htonl(qpn);
	nbopref = h2b64(prefix);
	nboguid = h2b64(guid);
	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
}
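
/*
 * Illustrative sketch (kept under #if 0 so it is never built): a minimal
 * user-space rendering of the byte-order handling done by ibd_n2h_gid()
 * and ibd_h2n_mac() above. The "demo_" struct, the sample prefix/guid
 * values, and the use of htobe64()/be64toh() from <endian.h> in place of
 * h2b64()/b2h64() are assumptions for illustration only, not driver or
 * IBTF interfaces.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>		/* htonl() */
#include <endian.h>		/* htobe64()/be64toh(), assumed available */

/* Stand-in for ipoib_mac_t: 4-byte QPN + 8-byte GID prefix + 8-byte GUID */
struct demo_mac {
	uint32_t	qpn;
	uint8_t		gidpref[8];
	uint8_t		gidsuff[8];
};

int
main(void)
{
	struct demo_mac mac;
	uint64_t prefix = 0xff12401bffff0000ULL;	/* sample values */
	uint64_t guid = 0x0002c90300001234ULL;
	uint64_t nbo, hpref, hguid;

	/* "h2n": host-order inputs packed into a network-order address */
	mac.qpn = htonl(0x00ffffffU);
	nbo = htobe64(prefix);
	memcpy(mac.gidpref, &nbo, sizeof (nbo));
	nbo = htobe64(guid);
	memcpy(mac.gidsuff, &nbo, sizeof (nbo));

	/* "n2h": recover the GID in host order, as ibd_n2h_gid() does */
	memcpy(&nbo, mac.gidpref, sizeof (nbo));
	hpref = be64toh(nbo);
	memcpy(&nbo, mac.gidsuff, sizeof (nbo));
	hguid = be64toh(nbo);

	printf("gid %llx:%llx\n", (unsigned long long)hpref,
	    (unsigned long long)hguid);
	return (0);
}
#endif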

/*
 * Send to the appropriate all-routers group when the IBA multicast group
 * does not exist, based on whether the target group is v4 or v6.
 */
static boolean_t
ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
    ipoib_mac_t *rmac)
{
	boolean_t retval = B_TRUE;
	uint32_t adjscope = state->id_scope << 16;
	uint32_t topword;

	/*
	 * Copy the first 4 bytes in without assuming any alignment of
	 * input mac address; this will have IPoIB signature, flags and
	 * scope bits.
	 */
	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
	topword = ntohl(topword);

	/*
	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
	 */
	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
		    ((uint32_t)(state->id_pkey << 16))),
		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
	else
		/*
		 * Does not have proper bits in the mgid address.
		 */
		retval = B_FALSE;

	return (retval);
}

/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Right now Solaris
 * inserts the padding by default at the end; the routine that does this is
 * nce_xmit() in ip_ndp.c, which copies the nd_lla_addr right after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the IBD
 * driver, it is in the following format:
 * [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T (2 bytes)] followed
 * by [22 bytes of ipoib_machdr]. As a result, the machdr is not 4 byte
 * aligned and has 2 bytes of padding at the end.
 *
 * The send routine in the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in a 4 byte aligned
 * machdr.
 *
 * On the receiving side, ibd_process_rx again takes the above packet,
 * removes the two bytes of front padding and inserts them at the end,
 * because the IP layer does not understand padding at the front.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {					\
	uchar_t		*nd_lla_ptr;					\
	icmp6_t		*icmp6;						\
	nd_opt_hdr_t	*opt;						\
	int		i;						\
									\
	icmp6 = (icmp6_t *)&ip6h[1];					\
	len -= sizeof (nd_neighbor_advert_t);				\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
	    (len != 0)) {						\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
		ASSERT(opt != NULL);					\
		nd_lla_ptr = (uchar_t *)&opt[1];			\
		if (type == IBD_SEND) {					\
			for (i = IPOIB_ADDRL; i > 0; i--)		\
				*(nd_lla_ptr + i + 1) =			\
				    *(nd_lla_ptr + i - 1);		\
		} else {						\
			for (i = 0; i < IPOIB_ADDRL; i++)		\
				*(nd_lla_ptr + i) =			\
				    *(nd_lla_ptr + i + 2);		\
		}							\
		*(nd_lla_ptr + i) = 0;					\
		*(nd_lla_ptr + i + 1) = 0;				\
	}								\
}
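
/*
 * Illustrative sketch (kept under #if 0 so it is never built): a small
 * user-space demo of the 2-byte shift IBD_PAD_NSNA performs on the ND
 * option's link layer address -- up by two bytes on the send side, back
 * down on receive. DEMO_ADDRL and the demo_ functions are stand-ins for
 * illustration only, not driver interfaces.
 */
#if 0
#include <stdio.h>

#define	DEMO_ADDRL	20	/* stand-in for IPOIB_ADDRL */

/* Send direction: mirrors the IBD_SEND arm of IBD_PAD_NSNA */
static void
demo_pad_send(unsigned char *nd_lla_ptr)
{
	int i;

	for (i = DEMO_ADDRL; i > 0; i--)
		*(nd_lla_ptr + i + 1) = *(nd_lla_ptr + i - 1);
	/* i is now 0: zero the two vacated leading bytes */
	*(nd_lla_ptr + i) = 0;
	*(nd_lla_ptr + i + 1) = 0;
}

/* Receive direction: mirrors the IBD_RECV arm */
static void
demo_pad_recv(unsigned char *nd_lla_ptr)
{
	int i;

	for (i = 0; i < DEMO_ADDRL; i++)
		*(nd_lla_ptr + i) = *(nd_lla_ptr + i + 2);
	/* i is now DEMO_ADDRL: zero the two trailing bytes */
	*(nd_lla_ptr + i) = 0;
	*(nd_lla_ptr + i + 1) = 0;
}

int
main(void)
{
	unsigned char opt[DEMO_ADDRL + 2];
	int i;

	for (i = 0; i < DEMO_ADDRL; i++)
		opt[i] = (unsigned char)(i + 1);
	opt[DEMO_ADDRL] = opt[DEMO_ADDRL + 1] = 0;

	demo_pad_send(opt);	/* address now lives at opt[2..21] */
	demo_pad_recv(opt);	/* and is shifted back to opt[0..19] */

	for (i = 0; i < DEMO_ADDRL + 2; i++)
		printf("%02x ", opt[i]);
	printf("\n");
	return (0);
}
#endif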

/*
 * Address handle entries maintained by the driver are kept in the
 * free and active lists. Each entry starts out in the free list;
 * it migrates to the active list when primed using ibt_get_paths()
 * and ibt_modify_ud_dest() for transmission to a specific destination.
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (ie the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are atomically
 * done (increments done by Tx path, decrements by the Tx callback handler).
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
}
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)

/*
 * Membership states for different mcg's are tracked by two lists:
 * the "non" list is used for promiscuous mode, when all mcg traffic
 * needs to be inspected. This type of membership is never used for
 * transmission, so there can not be an AH in the active list
 * corresponding to a member in this list. This list does not need
 * any protection, since all operations are performed by the async
 * thread.
 *
 * "Full" and "SendOnly" membership is tracked using a single list,
 * the "full" list. This is because this single list can then be
 * searched during transmit to a multicast group (if an AH for the
 * mcg is not found in the active list), since at least one type
 * of membership must be present before initiating the transmit.
 * This list is also emptied during driver detach, since sendonly
 * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Inserts/deletes to
 * this list are done only by the async thread, but it is also
 * searched in program context (see multicast disable case), thus
 * the id_mc_mutex protects the list. The driver detach path also
 * deconstructs the "full" list, but it ensures that the async
 * thread will not be accessing the list (by blocking out mcg
 * trap handling and making sure no more Tx reaping will happen).
 *
 * Currently, an IBA attach is done in the SendOnly case too,
 * although this is not required.
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&state->id_mc_full, mce)
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&state->id_mc_non, mce)
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&state->id_mc_full, mce)
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&state->id_mc_non, mce)

/*
 * AH and MCE active list manipulation:
 *
 * Multicast disable requests and MCG delete traps are two cases
 * where the active AH entry for the mcg (if any unreferenced one exists)
 * will be moved to the free list (to force the next Tx to the mcg to
 * join the MCG in SendOnly mode). Port up handling will also move AHs
 * from active to free list.
 *
 * In the case when some transmits are still pending on an entry
 * for an mcg, but a multicast disable has already been issued on the
 * mcg, there are some options to consider to preserve the join state
 * to ensure the emitted packet is properly routed on the IBA fabric.
 * For the AH, we can
 * 1. take out of active list at multicast disable time.
 * 2. take out of active list only when last pending Tx completes.
 * For the MCE, we can
 * 3. take out of active list at multicast disable time.
 * 4. take out of active list only when last pending Tx completes.
 * 5. move from active list to stale list at multicast disable time.
 * We choose to use 2,4. We use option 4 so that if a multicast enable
 * is tried before the pending Tx completes, the enable code finds the
 * mce in the active list and just has to make sure it will not be reaped
 * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
 * a stale list (#5) that would be checked in the enable code would need
 * to be implemented. Option 2 is used, because otherwise, a Tx attempt
 * after the multicast disable would try to put an AH in the active list,
 * and associate the mce it finds in the active list to this new AH,
 * whereas the mce is already associated with the previous AH (taken off
 * the active list), and will be removed once the pending Tx's complete
 * (unless a reference count on mce's is implemented). One implication of
 * using 2,4 is that new Tx's posted before the pending Tx's complete will
 * grab new references on the AH, further delaying the leave.
 *
 * In the case of mcg delete (or create) trap when the port is sendonly
 * joined, the AH and MCE handling is different: the AH and MCE have to be
 * immediately taken off the active lists (forcing a join and path lookup
 * at the next Tx is the only guaranteed means of ensuring a proper Tx
 * to an mcg as it is repeatedly created and deleted and goes through
 * reincarnations).
 *
 * When a port is already sendonly joined, and a multicast enable is
 * attempted, the same mce structure is promoted; this ensures only a
 * single mce on the active list tracks the most powerful join state.
 *
 * In the case of port up event handling, the MCE for sendonly membership
 * is freed up, and the ACE is put into the free list as soon as possible
 * (depending on whether posted Tx's have completed). For fullmembership
 * MCE's though, the ACE is similarly handled; but the MCE is kept around
 * (a re-JOIN is attempted) only if the DLPI leave has not already been
 * done; else the mce is deconstructed (mc_fullreap case).
 *
 * MCG creation and deletion trap handling:
 *
 * These traps are unreliable (meaning sometimes the trap might never
 * be delivered to the subscribed nodes) and may arrive out-of-order
 * since they use UD transport. An alternative to relying on these
 * unreliable traps is to poll for mcg presence every so often, but
 * instead of doing that, we try to be as conservative as possible
 * while handling the traps, and hope that the traps do arrive at
 * the subscribed nodes soon. Note that if a node is fullmember
 * joined to an mcg, it can not possibly receive a mcg create/delete
 * trap for that mcg (by fullmember definition); if it does, it is
 * an old trap from a previous incarnation of the mcg.
 *
 * Whenever a trap is received, the driver cleans up its sendonly
 * membership to the group; we choose to do a sendonly leave even
 * on a creation trap to handle the case of a prior deletion of the mcg
 * having gone unnoticed. Consider an example scenario:
 * T1: MCG M is deleted, and fires off deletion trap D1.
 * T2: MCG M is recreated, fires off creation trap C1, which is lost.
 * T3: Node N tries to transmit to M, joining in sendonly mode.
 * T4: MCG M is deleted, and fires off deletion trap D2.
 * T5: N receives a deletion trap, but can not distinguish D1 from D2.
 *     If the trap is D2, then a LEAVE is not required, since the mcg
 *     is already deleted; but if it is D1, a LEAVE is required. A safe
 *     approach is to always LEAVE, but the SM may be confused if it
 *     receives a LEAVE without a prior JOIN.
 *
 * Management of the non-membership to an mcg is similar to the above,
 * except that if the interface is in promiscuous mode, it is required
 * to attempt to re-join the mcg after receiving a trap. Unfortunately,
 * if the re-join attempt fails (in which case a warning message needs
 * to be printed), it is not clear whether it failed due to the mcg not
 * existing, or some fabric/hca issues, due to the delayed nature of
 * trap delivery. Querying the SA to establish presence/absence of the
 * mcg is also racy at best. Thus, the driver just prints a warning
 * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
 * received after the mcg was deleted.
 */

/*
 * Implementation of atomic "recycle" bits and reference count
 * on address handles. This utilizes the fact that the max reference
 * count on any handle is limited by the number of send wqes, thus
 * high bits in the ac_ref field can be used as the recycle bits,
 * and only the low bits hold the number of pending Tx requests.
 * This atomic AH reference counting allows the Tx completion
 * handler not to acquire the id_ac_mutex to process every completion,
 * thus reducing lock contention problems between completion and
 * the Tx path.
 */
#define	CYCLEVAL		0x80000
#define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
#define	GET_REF(ace)		((ace)->ac_ref)
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, num);		\
}
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		CLEAR_REFCYCLE(ace), B_FALSE :		\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_add_32_nv(&ace->ac_ref, -1) ==		\
	    CYCLEVAL ?					\
		/*					\
		 * Ref count known to be 0 from above.	\
		 */					\
		B_TRUE :				\
		B_FALSE					\
)
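
/*
 * Illustrative sketch (kept under #if 0 so it is never built): a
 * user-space rendering of the cycle-bit/reference-count packing above,
 * using C11 <stdatomic.h> in place of atomic_add_32_nv(). DEMO_CYCLEVAL
 * and the demo_ names are stand-ins for illustration only; they are not
 * driver interfaces.
 */
#if 0
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

#define	DEMO_CYCLEVAL	0x80000		/* stand-in for CYCLEVAL */

static atomic_uint demo_ref;		/* stand-in for ace->ac_ref */

/* Tx path: grab "num" references before posting sends (INC_REF) */
static void
demo_inc_ref(unsigned int num)
{
	atomic_fetch_add(&demo_ref, num);
}

/*
 * Disable/trap path (SET_CYCLE_IF_REF): mark the entry for recycling.
 * Returns true if references are still pending (the last completion will
 * recycle it), false if the count was already 0 (recycle immediately).
 */
static bool
demo_set_cycle_if_ref(void)
{
	if (atomic_load(&demo_ref) & DEMO_CYCLEVAL)	/* already marked */
		return (true);
	if (atomic_fetch_add(&demo_ref, DEMO_CYCLEVAL) != 0)
		return (true);
	/* count was 0: clear the bit we just set, as CLEAR_REFCYCLE does */
	atomic_store(&demo_ref, 0);
	return (false);
}

/*
 * Tx completion path (DEC_REF_DO_CYCLE): drop one reference; true means
 * this was the last reference on a marked entry, so recycle it now.
 */
static bool
demo_dec_ref_do_cycle(void)
{
	return (atomic_fetch_sub(&demo_ref, 1) - 1 == DEMO_CYCLEVAL);
}

int
main(void)
{
	demo_inc_ref(2);			/* two Tx's outstanding */
	printf("marked with refs pending: %d\n", demo_set_cycle_if_ref());
	printf("recycle on 1st completion: %d\n", demo_dec_ref_do_cycle());
	printf("recycle on 2nd completion: %d\n", demo_dec_ref_do_cycle());
	return (0);
}
#endif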

static void *
list_get_head(list_t *list)
{
	list_node_t *lhead = list_head(list);

	if (lhead != NULL)
		list_remove(list, lhead);
	return (lhead);
}

/*
 * This is always guaranteed to be able to queue the work.
 */
static void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
	/* Initialize request */
	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
	ptr->rq_op = op;

	/*
	 * Queue provided slot onto request pool.
	 */
	mutex_enter(&state->id_acache_req_lock);
	list_insert_tail(&state->id_req_list, ptr);

	/* Go, fetch, async thread */
	cv_signal(&state->id_acache_req_cv);
	mutex_exit(&state->id_acache_req_lock);
}

/*
 * Main body of the per interface async thread.
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
			case IBD_ASYNC_GETAH:
				ibd_async_acache(state, &ptr->rq_mac);
				break;
			case IBD_ASYNC_JOIN:
			case IBD_ASYNC_LEAVE:
				ibd_async_multicast(state,
				    ptr->rq_gid, ptr->rq_op);
				break;
			case IBD_ASYNC_PROMON:
				ibd_async_setprom(state);
				break;
			case IBD_ASYNC_PROMOFF:
				ibd_async_unsetprom(state);
				break;
			case IBD_ASYNC_REAP:
				ibd_async_reap_group(state,
				    ptr->rq_ptr, ptr->rq_gid,
				    IB_MC_JSTATE_FULL);
				/*
				 * The req buf is contained in the mce
				 * structure, so we do not need to free
				 * it here.
				 */
				ptr = NULL;
				break;
			case IBD_ASYNC_TRAP:
				ibd_async_trap(state, ptr);
				break;
			case IBD_ASYNC_SCHED:
				ibd_async_txsched(state);
				break;
			case IBD_ASYNC_LINK:
				ibd_async_link(state, ptr);
				break;
			case IBD_ASYNC_EXIT:
				mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
				CALLB_CPR_EXIT(&cprinfo);
#else
				mutex_exit(&state->id_acache_req_lock);
#endif
				return;
			}
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}

/*
 * Return whether it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (--state->id_trap_inprog == 0)
		cv_signal(&state->id_trap_cv);
	mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys, returns 0 on a match or else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
	ulong_t ptraddr = (ulong_t)key;
	uint_t hval;

	/*
	 * If the input address is 4 byte aligned, we can just dereference
	 * it. This is most common, since IP will send in a 4 byte aligned
	 * IP header, which implies the 24 byte IPoIB pseudo header will be
	 * 4 byte aligned too.
1324 */ 1325 if ((ptraddr & 3) == 0) 1326 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1327 1328 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1329 return (hval); 1330 } 1331 1332 static int 1333 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1334 { 1335 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1336 return (0); 1337 else 1338 return (1); 1339 } 1340 1341 /* 1342 * Initialize all the per interface caches and lists; AH cache, 1343 * MCG list etc. 1344 */ 1345 static int 1346 ibd_acache_init(ibd_state_t *state) 1347 { 1348 ibd_ace_t *ce; 1349 int i; 1350 1351 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1352 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1353 1354 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1355 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1356 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1357 offsetof(ibd_ace_t, ac_list)); 1358 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1359 offsetof(ibd_ace_t, ac_list)); 1360 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1361 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1362 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1363 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1364 offsetof(ibd_mce_t, mc_list)); 1365 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1366 offsetof(ibd_mce_t, mc_list)); 1367 list_create(&state->id_req_list, sizeof (ibd_req_t), 1368 offsetof(ibd_req_t, rq_list)); 1369 1370 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1371 IBD_NUM_AH, KM_SLEEP); 1372 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1373 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1374 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1375 ibd_acache_fini(state); 1376 return (DDI_FAILURE); 1377 } else { 1378 CLEAR_REFCYCLE(ce); 1379 ce->ac_mce = NULL; 1380 IBD_ACACHE_INSERT_FREE(state, ce); 1381 } 1382 } 1383 return (DDI_SUCCESS); 1384 } 1385 1386 static void 1387 ibd_acache_fini(ibd_state_t *state) 1388 { 1389 ibd_ace_t *ptr; 1390 1391 mutex_enter(&state->id_ac_mutex); 1392 1393 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1394 ASSERT(GET_REF(ptr) == 0); 1395 (void) ibt_free_ud_dest(ptr->ac_dest); 1396 } 1397 1398 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1399 ASSERT(GET_REF(ptr) == 0); 1400 (void) ibt_free_ud_dest(ptr->ac_dest); 1401 } 1402 1403 list_destroy(&state->id_ah_free); 1404 list_destroy(&state->id_ah_active); 1405 list_destroy(&state->id_mc_full); 1406 list_destroy(&state->id_mc_non); 1407 list_destroy(&state->id_req_list); 1408 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1409 mutex_exit(&state->id_ac_mutex); 1410 mutex_destroy(&state->id_ac_mutex); 1411 mutex_destroy(&state->id_mc_mutex); 1412 mutex_destroy(&state->id_acache_req_lock); 1413 cv_destroy(&state->id_acache_req_cv); 1414 } 1415 1416 /* 1417 * Search AH active hash list for a cached path to input destination. 1418 * If we are "just looking", hold == F. When we are in the Tx path, 1419 * we set hold == T to grab a reference on the AH so that it can not 1420 * be recycled to a new destination while the Tx request is posted. 1421 */ 1422 static ibd_ace_t * 1423 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1424 { 1425 ibd_ace_t *ptr; 1426 1427 ASSERT(mutex_owned(&state->id_ac_mutex)); 1428 1429 /* 1430 * Do hash search. 
1431 */ 1432 if (mod_hash_find(state->id_ah_active_hash, 1433 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1434 if (hold) 1435 INC_REF(ptr, num); 1436 return (ptr); 1437 } 1438 return (NULL); 1439 } 1440 1441 /* 1442 * This is called by the tx side; if an initialized AH is found in 1443 * the active list, it is locked down and can be used; if no entry 1444 * is found, an async request is queued to do path resolution. 1445 */ 1446 static ibd_ace_t * 1447 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1448 { 1449 ibd_ace_t *ptr; 1450 ibd_req_t *req; 1451 1452 /* 1453 * Only attempt to print when we can; in the mdt pattr case, the 1454 * address is not aligned properly. 1455 */ 1456 if (((ulong_t)mac & 3) == 0) { 1457 DPRINT(4, 1458 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1459 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1460 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1461 htonl(mac->ipoib_gidsuff[1])); 1462 } 1463 1464 mutex_enter(&state->id_ac_mutex); 1465 1466 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1467 mutex_exit(&state->id_ac_mutex); 1468 return (ptr); 1469 } 1470 1471 /* 1472 * Implementation of a single outstanding async request; if 1473 * the operation is not started yet, queue a request and move 1474 * to ongoing state. Remember in id_ah_addr for which address 1475 * we are queueing the request, in case we need to flag an error; 1476 * Any further requests, for the same or different address, until 1477 * the operation completes, is sent back to GLDv3 to be retried. 1478 * The async thread will update id_ah_op with an error indication 1479 * or will set it to indicate the next look up can start; either 1480 * way, it will mac_tx_update() so that all blocked requests come 1481 * back here. 1482 */ 1483 *err = EAGAIN; 1484 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1485 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1486 if (req != NULL) { 1487 /* 1488 * We did not even find the entry; queue a request 1489 * for it. 1490 */ 1491 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1492 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1493 state->id_ah_op = IBD_OP_ONGOING; 1494 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1495 } 1496 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1497 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1498 /* 1499 * Check the status of the pathrecord lookup request 1500 * we had queued before. 1501 */ 1502 if (state->id_ah_op == IBD_OP_ERRORED) { 1503 *err = EFAULT; 1504 state->id_ah_error++; 1505 } else { 1506 /* 1507 * IBD_OP_ROUTERED case: We need to send to the 1508 * all-router MCG. If we can find the AH for 1509 * the mcg, the Tx will be attempted. If we 1510 * do not find the AH, we return NORESOURCES 1511 * to retry. 1512 */ 1513 ipoib_mac_t routermac; 1514 1515 (void) ibd_get_allroutergroup(state, mac, &routermac); 1516 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1517 numwqe); 1518 } 1519 state->id_ah_op = IBD_OP_NOTSTARTED; 1520 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1521 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1522 /* 1523 * This case can happen when we get a higher band 1524 * packet. The easiest way is to reset the state machine 1525 * to accommodate the higher priority packet. 
1526 */ 1527 state->id_ah_op = IBD_OP_NOTSTARTED; 1528 } 1529 mutex_exit(&state->id_ac_mutex); 1530 1531 return (ptr); 1532 } 1533 1534 /* 1535 * Grab a not-currently-in-use AH/PathRecord from the active 1536 * list to recycle to a new destination. Only the async thread 1537 * executes this code. 1538 */ 1539 static ibd_ace_t * 1540 ibd_acache_get_unref(ibd_state_t *state) 1541 { 1542 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1543 1544 ASSERT(mutex_owned(&state->id_ac_mutex)); 1545 1546 /* 1547 * Do plain linear search. 1548 */ 1549 while (ptr != NULL) { 1550 /* 1551 * Note that it is possible that the "cycle" bit 1552 * is set on the AH w/o any reference count. The 1553 * mcg must have been deleted, and the tx cleanup 1554 * just decremented the reference count to 0, but 1555 * hasn't gotten around to grabbing the id_ac_mutex 1556 * to move the AH into the free list. 1557 */ 1558 if (GET_REF(ptr) == 0) { 1559 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1560 break; 1561 } 1562 ptr = list_next(&state->id_ah_active, ptr); 1563 } 1564 return (ptr); 1565 } 1566 1567 /* 1568 * Invoked to clean up AH from active list in case of multicast 1569 * disable and to handle sendonly memberships during mcg traps. 1570 * And for port up processing for multicast and unicast AHs. 1571 * Normally, the AH is taken off the active list, and put into 1572 * the free list to be recycled for a new destination. In case 1573 * Tx requests on the AH have not completed yet, the AH is marked 1574 * for reaping (which will put the AH on the free list) once the Tx's 1575 * complete; in this case, depending on the "force" input, we take 1576 * out the AH from the active list right now, or leave it also for 1577 * the reap operation. Returns TRUE if the AH is taken off the active 1578 * list (and either put into the free list right now, or arranged for 1579 * later), FALSE otherwise. 1580 */ 1581 static boolean_t 1582 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1583 { 1584 ibd_ace_t *acactive; 1585 boolean_t ret = B_TRUE; 1586 1587 ASSERT(mutex_owned(&state->id_ac_mutex)); 1588 1589 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1590 1591 /* 1592 * Note that the AH might already have the cycle bit set 1593 * on it; this might happen if sequences of multicast 1594 * enables and disables are coming so fast, that posted 1595 * Tx's to the mcg have not completed yet, and the cycle 1596 * bit is set successively by each multicast disable. 1597 */ 1598 if (SET_CYCLE_IF_REF(acactive)) { 1599 if (!force) { 1600 /* 1601 * The ace is kept on the active list, further 1602 * Tx's can still grab a reference on it; the 1603 * ace is reaped when all pending Tx's 1604 * referencing the AH complete. 1605 */ 1606 ret = B_FALSE; 1607 } else { 1608 /* 1609 * In the mcg trap case, we always pull the 1610 * AH from the active list. And also the port 1611 * up multi/unicast case. 1612 */ 1613 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1614 acactive->ac_mce = NULL; 1615 } 1616 } else { 1617 /* 1618 * Determined the ref count is 0, thus reclaim 1619 * immediately after pulling out the ace from 1620 * the active list. 1621 */ 1622 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1623 acactive->ac_mce = NULL; 1624 IBD_ACACHE_INSERT_FREE(state, acactive); 1625 } 1626 1627 } 1628 return (ret); 1629 } 1630 1631 /* 1632 * Helper function for async path record lookup. If we are trying to 1633 * Tx to a MCG, check our membership, possibly trying to join the 1634 * group if required. 
If that fails, try to send the packet to the 1635 * all router group (indicated by the redirect output), pointing 1636 * the input mac address to the router mcg address. 1637 */ 1638 static ibd_mce_t * 1639 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1640 { 1641 ib_gid_t mgid; 1642 ibd_mce_t *mce; 1643 ipoib_mac_t routermac; 1644 1645 *redirect = B_FALSE; 1646 ibd_n2h_gid(mac, &mgid); 1647 1648 /* 1649 * Check the FullMember+SendOnlyNonMember list. 1650 * Since we are the only one who manipulates the 1651 * id_mc_full list, no locks are needed. 1652 */ 1653 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1654 if (mce != NULL) { 1655 DPRINT(4, "ibd_async_mcache : already joined to group"); 1656 return (mce); 1657 } 1658 1659 /* 1660 * Not found; try to join(SendOnlyNonMember) and attach. 1661 */ 1662 DPRINT(4, "ibd_async_mcache : not joined to group"); 1663 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1664 NULL) { 1665 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1666 return (mce); 1667 } 1668 1669 /* 1670 * MCGroup not present; try to join the all-router group. If 1671 * any of the following steps succeed, we will be redirecting 1672 * to the all router group. 1673 */ 1674 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1675 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1676 return (NULL); 1677 *redirect = B_TRUE; 1678 ibd_n2h_gid(&routermac, &mgid); 1679 bcopy(&routermac, mac, IPOIB_ADDRL); 1680 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1681 mgid.gid_prefix, mgid.gid_guid); 1682 1683 /* 1684 * Are we already joined to the router group? 1685 */ 1686 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1687 DPRINT(4, "ibd_async_mcache : using already joined router" 1688 "group\n"); 1689 return (mce); 1690 } 1691 1692 /* 1693 * Can we join(SendOnlyNonMember) the router group? 1694 */ 1695 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1696 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1697 NULL) { 1698 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1699 return (mce); 1700 } 1701 1702 return (NULL); 1703 } 1704 1705 /* 1706 * Async path record lookup code. 1707 */ 1708 static void 1709 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1710 { 1711 ibd_ace_t *ce; 1712 ibd_mce_t *mce = NULL; 1713 ibt_path_attr_t path_attr; 1714 ibt_path_info_t path_info; 1715 ib_gid_t destgid; 1716 char ret = IBD_OP_NOTSTARTED; 1717 1718 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1719 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1720 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1721 htonl(mac->ipoib_gidsuff[1])); 1722 1723 /* 1724 * Check whether we are trying to transmit to a MCG. 1725 * In that case, we need to make sure we are a member of 1726 * the MCG. 1727 */ 1728 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1729 boolean_t redirected; 1730 1731 /* 1732 * If we can not find or join the group or even 1733 * redirect, error out. 1734 */ 1735 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1736 NULL) { 1737 state->id_ah_op = IBD_OP_ERRORED; 1738 return; 1739 } 1740 1741 /* 1742 * If we got redirected, we need to determine whether 1743 * the AH for the new mcg is in the cache already, and 1744 * not pull it in then; otherwise proceed to get the 1745 * path for the new mcg. 
There is no guarantee that 1746 * if the AH is currently in the cache, it will still be 1747 * there when we look in ibd_acache_lookup(), but that's 1748 * okay, we will come back here. 1749 */ 1750 if (redirected) { 1751 ret = IBD_OP_ROUTERED; 1752 DPRINT(4, "ibd_async_acache : redirected to " 1753 "%08X:%08X:%08X:%08X:%08X", 1754 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1755 htonl(mac->ipoib_gidpref[1]), 1756 htonl(mac->ipoib_gidsuff[0]), 1757 htonl(mac->ipoib_gidsuff[1])); 1758 1759 mutex_enter(&state->id_ac_mutex); 1760 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1761 state->id_ah_op = IBD_OP_ROUTERED; 1762 mutex_exit(&state->id_ac_mutex); 1763 DPRINT(4, "ibd_async_acache : router AH found"); 1764 return; 1765 } 1766 mutex_exit(&state->id_ac_mutex); 1767 } 1768 } 1769 1770 /* 1771 * Get an AH from the free list. 1772 */ 1773 mutex_enter(&state->id_ac_mutex); 1774 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1775 /* 1776 * No free ones; try to grab an unreferenced active 1777 * one. Maybe we need to make the active list LRU, 1778 * but that will create more work for Tx callbacks. 1779 * Is there a way of not having to pull out the 1780 * entry from the active list, but just indicate it 1781 * is being recycled? Yes, but that creates one more 1782 * check in the fast lookup path. 1783 */ 1784 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1785 /* 1786 * Pretty serious shortage now. 1787 */ 1788 state->id_ah_op = IBD_OP_NOTSTARTED; 1789 mutex_exit(&state->id_ac_mutex); 1790 DPRINT(10, "ibd_async_acache : failed to find AH " 1791 "slot\n"); 1792 return; 1793 } 1794 /* 1795 * We could check whether ac_mce points to a SendOnly 1796 * member and drop that membership now. Or do it lazily 1797 * at detach time. 1798 */ 1799 ce->ac_mce = NULL; 1800 } 1801 mutex_exit(&state->id_ac_mutex); 1802 ASSERT(ce->ac_mce == NULL); 1803 1804 /* 1805 * Update the entry. 1806 */ 1807 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1808 1809 bzero(&path_info, sizeof (path_info)); 1810 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1811 path_attr.pa_sgid = state->id_sgid; 1812 path_attr.pa_num_dgids = 1; 1813 ibd_n2h_gid(&ce->ac_mac, &destgid); 1814 path_attr.pa_dgids = &destgid; 1815 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1816 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1817 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1818 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1819 goto error; 1820 } 1821 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1822 ntohl(ce->ac_mac.ipoib_qpn), 1823 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1824 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1825 goto error; 1826 } 1827 1828 /* 1829 * mce is set whenever an AH is being associated with a 1830 * MCG; this will come in handy when we leave the MCG. The 1831 * lock protects Tx fastpath from scanning the active list. 1832 */ 1833 if (mce != NULL) 1834 ce->ac_mce = mce; 1835 mutex_enter(&state->id_ac_mutex); 1836 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1837 state->id_ah_op = ret; 1838 mutex_exit(&state->id_ac_mutex); 1839 return; 1840 error: 1841 /* 1842 * We might want to drop SendOnly membership here if we 1843 * joined above. The lock protects Tx callbacks inserting 1844 * into the free list. 
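/*
 * Illustrative sketch (not driver code): before the ibt_get_paths() call
 * above, ibd_n2h_gid() converts the 20-byte IPoIB hardware address (a
 * 4-byte QPN field followed by a 16-byte GID, carried in network byte
 * order) into a host-order GID for the SA query. The sketch below assumes
 * a layout equivalent to ipoib_mac_t; ex_mac_t, ex_gid_t and
 * ex_mac_to_gid() are illustrative names only.
 */
#include <stdint.h>
#include <arpa/inet.h>			/* ntohl() */

typedef struct {
	uint32_t	qpn;		/* flags + QPN, network order */
	uint32_t	gidpref[2];	/* GID prefix words, network order */
	uint32_t	gidsuff[2];	/* GID suffix (GUID) words */
} ex_mac_t;

typedef struct {
	uint64_t	gid_prefix;
	uint64_t	gid_guid;
} ex_gid_t;

static void
ex_mac_to_gid(const ex_mac_t *mac, ex_gid_t *gid)
{
	gid->gid_prefix = ((uint64_t)ntohl(mac->gidpref[0]) << 32) |
	    (uint64_t)ntohl(mac->gidpref[1]);
	gid->gid_guid = ((uint64_t)ntohl(mac->gidsuff[0]) << 32) |
	    (uint64_t)ntohl(mac->gidsuff[1]);
}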
1845 */ 1846 mutex_enter(&state->id_ac_mutex); 1847 state->id_ah_op = IBD_OP_ERRORED; 1848 IBD_ACACHE_INSERT_FREE(state, ce); 1849 mutex_exit(&state->id_ac_mutex); 1850 } 1851 1852 /* 1853 * While restoring port's presence on the subnet on a port up, it is possible 1854 * that the port goes down again. 1855 */ 1856 static void 1857 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1858 { 1859 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1860 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1861 LINK_STATE_UP; 1862 ibd_mce_t *mce, *pmce; 1863 ibd_ace_t *ace, *pace; 1864 1865 DPRINT(10, "ibd_async_link(): %d", opcode); 1866 1867 /* 1868 * On a link up, revalidate the link speed/width. No point doing 1869 * this on a link down, since we will be unable to do SA operations, 1870 * defaulting to the lowest speed. Also notice that we update our 1871 * notion of speed before calling mac_link_update(), which will do 1872 * necessary higher level notifications for speed changes. 1873 */ 1874 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1875 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1876 state->id_link_speed = ibd_get_portspeed(state); 1877 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1878 } 1879 1880 /* 1881 * Do all the work required to establish our presence on 1882 * the subnet. 1883 */ 1884 if (opcode == IBD_LINK_UP_ABSENT) { 1885 /* 1886 * If in promiscuous mode ... 1887 */ 1888 if (state->id_prom_op == IBD_OP_COMPLETED) { 1889 /* 1890 * Drop all nonmembership. 1891 */ 1892 ibd_async_unsetprom(state); 1893 1894 /* 1895 * Then, try to regain nonmembership to all mcg's. 1896 */ 1897 ibd_async_setprom(state); 1898 1899 } 1900 1901 /* 1902 * Drop all sendonly membership (which also gets rid of the 1903 * AHs); try to reacquire all full membership. 1904 */ 1905 mce = list_head(&state->id_mc_full); 1906 while ((pmce = mce) != NULL) { 1907 mce = list_next(&state->id_mc_full, mce); 1908 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1909 ibd_leave_group(state, 1910 pmce->mc_info.mc_adds_vect.av_dgid, 1911 IB_MC_JSTATE_SEND_ONLY_NON); 1912 else 1913 ibd_reacquire_group(state, pmce); 1914 } 1915 1916 /* 1917 * Recycle all active AHs to free list (and if there are 1918 * pending posts, make sure they will go into the free list 1919 * once the Tx's complete). Grab the lock to prevent 1920 * concurrent Tx's as well as Tx cleanups. 1921 */ 1922 mutex_enter(&state->id_ac_mutex); 1923 ace = list_head(&state->id_ah_active); 1924 while ((pace = ace) != NULL) { 1925 boolean_t cycled; 1926 1927 ace = list_next(&state->id_ah_active, ace); 1928 mce = pace->ac_mce; 1929 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1930 B_TRUE); 1931 /* 1932 * If this is for an mcg, it must be for a fullmember, 1933 * since we got rid of send-only members above when 1934 * processing the mce list. 1935 */ 1936 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1937 IB_MC_JSTATE_FULL))); 1938 1939 /* 1940 * Check if the fullmember mce needs to be torn down, 1941 * i.e., whether the DLPI disable has already been done. 1942 * If so, do some of the work of tx_cleanup, namely 1943 * causing leave (which will fail), detach and 1944 * mce-freeing. tx_cleanup will put the AH into free 1945 * list. The reason to duplicate some of this 1946 * tx_cleanup work is because we want to delete the 1947 * AH right now instead of waiting for tx_cleanup, to 1948 * force subsequent Tx's to reacquire an AH.
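/*
 * Illustrative sketch (not driver code): both loops above sample the next
 * list element before acting on the current one, because
 * ibd_leave_group() and ibd_acache_recycle() can unlink (or arrange to
 * free) the element being visited. The same idiom over a singly linked
 * list; ex_node_t and ex_should_remove() are hypothetical.
 */
#include <stdbool.h>
#include <stdlib.h>

typedef struct ex_node {
	struct ex_node	*next;
	int		key;
} ex_node_t;

static bool ex_should_remove(const ex_node_t *n) { return (n->key < 0); }

static void
ex_prune(ex_node_t **headp)
{
	ex_node_t *cur = *headp;
	ex_node_t **linkp = headp;

	while (cur != NULL) {
		ex_node_t *next = cur->next;	/* sample before touching cur */

		if (ex_should_remove(cur)) {
			*linkp = next;
			free(cur);
		} else {
			linkp = &cur->next;
		}
		cur = next;
	}
}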
1949 */ 1950 if ((mce != NULL) && (mce->mc_fullreap)) 1951 ibd_async_reap_group(state, mce, 1952 mce->mc_info.mc_adds_vect.av_dgid, 1953 mce->mc_jstate); 1954 } 1955 mutex_exit(&state->id_ac_mutex); 1956 } 1957 1958 /* 1959 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1960 * (which stops further events from being delivered) before 1961 * mac_unregister(). At this point, it is guaranteed that mac_register 1962 * has already been done. 1963 */ 1964 mutex_enter(&state->id_link_mutex); 1965 state->id_link_state = lstate; 1966 mac_link_update(state->id_mh, lstate); 1967 mutex_exit(&state->id_link_mutex); 1968 1969 ibd_async_done(state); 1970 } 1971 1972 /* 1973 * Check the pkey table to see if we can find the pkey we're looking for. 1974 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1975 * failure. 1976 */ 1977 static int 1978 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1979 uint16_t *pkix) 1980 { 1981 uint16_t ndx; 1982 1983 ASSERT(pkix != NULL); 1984 1985 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1986 if (pkey_tbl[ndx] == pkey) { 1987 *pkix = ndx; 1988 return (0); 1989 } 1990 } 1991 return (-1); 1992 } 1993 1994 /* 1995 * When the link is notified up, we need to do a few things, based 1996 * on the port's current p_init_type_reply claiming a reinit has been 1997 * done or not. The reinit steps are: 1998 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1999 * the old Pkey and GID0 are correct. 2000 * 2. Register for mcg traps (already done by ibmf). 2001 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2002 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2003 * 4. Give up all sendonly memberships. 2004 * 5. Acquire all full memberships. 2005 * 6. In promiscuous mode, acquire all non memberships. 2006 * 7. Recycle all AHs to free list. 2007 */ 2008 static void 2009 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2010 { 2011 ibt_hca_portinfo_t *port_infop = NULL; 2012 ibt_status_t ibt_status; 2013 uint_t psize, port_infosz; 2014 ibd_link_op_t opcode; 2015 ibd_req_t *req; 2016 link_state_t new_link_state = LINK_STATE_UP; 2017 uint8_t itreply; 2018 uint16_t pkix; 2019 int ret; 2020 2021 /* 2022 * Let's not race with a plumb or an unplumb; if we detect a 2023 * pkey relocation event later on here, we may have to restart. 2024 */ 2025 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2026 2027 mutex_enter(&state->id_link_mutex); 2028 2029 /* 2030 * If the init code in ibd_m_start hasn't yet set up the 2031 * pkey/gid, nothing to do; that code will set the link state. 2032 */ 2033 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2034 mutex_exit(&state->id_link_mutex); 2035 goto link_mod_return; 2036 } 2037 2038 /* 2039 * If this routine was called in response to a port down event, 2040 * we just need to see if this should be informed. 2041 */ 2042 if (code == IBT_ERROR_PORT_DOWN) { 2043 new_link_state = LINK_STATE_DOWN; 2044 goto update_link_state; 2045 } 2046 2047 /* 2048 * If it's not a port down event we've received, try to get the port 2049 * attributes first. If we fail here, the port is as good as down. 2050 * Otherwise, if the link went down by the time the handler gets 2051 * here, give up - we cannot even validate the pkey/gid since those 2052 * are not valid and this is as bad as a port down anyway. 
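/*
 * Illustrative sketch (not driver code): ibd_locate_pkey() above is a
 * plain linear probe of the port's P_Key table. The standalone example
 * below repeats that probe with standard types and shows how a caller
 * might react when the pkey has moved to a different index (the restart
 * case handled further below); the table contents are made up.
 */
#include <stdint.h>
#include <stdio.h>

static int
ex_locate_pkey(const uint16_t *tbl, uint16_t tbl_sz, uint16_t pkey,
    uint16_t *pkix)
{
	uint16_t ndx;

	for (ndx = 0; ndx < tbl_sz; ndx++) {
		if (tbl[ndx] == pkey) {
			*pkix = ndx;
			return (0);
		}
	}
	return (-1);
}

int
main(void)
{
	uint16_t tbl[] = { 0xffff, 0x8001, 0x8002 };
	uint16_t old_ix = 0, new_ix;

	if (ex_locate_pkey(tbl, 3, 0x8002, &new_ix) != 0)
		(void) printf("pkey gone from table; treat as link down\n");
	else if (new_ix != old_ix)
		(void) printf("pkey moved %u -> %u; restart needed\n",
		    (unsigned)old_ix, (unsigned)new_ix);
	return (0);
}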
2053 */ 2054 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2055 &port_infop, &psize, &port_infosz); 2056 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2057 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2058 new_link_state = LINK_STATE_DOWN; 2059 goto update_link_state; 2060 } 2061 2062 /* 2063 * Check the SM InitTypeReply flags. If both NoLoadReply and 2064 * PreserveContentReply are 0, we don't know anything about the 2065 * data loaded into the port attributes, so we need to verify 2066 * if gid0 and pkey are still valid. 2067 */ 2068 itreply = port_infop->p_init_type_reply; 2069 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2070 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2071 /* 2072 * Check to see if the subnet part of GID0 has changed. If 2073 * not, check the simple case first to see if the pkey 2074 * index is the same as before; finally check to see if the 2075 * pkey has been relocated to a different index in the table. 2076 */ 2077 if (bcmp(port_infop->p_sgid_tbl, 2078 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2079 2080 new_link_state = LINK_STATE_DOWN; 2081 2082 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2083 state->id_pkey) { 2084 2085 new_link_state = LINK_STATE_UP; 2086 2087 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2088 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2089 2090 ibt_free_portinfo(port_infop, port_infosz); 2091 mutex_exit(&state->id_link_mutex); 2092 2093 /* 2094 * Currently a restart is required if our pkey has moved 2095 * in the pkey table. If we get the ibt_recycle_ud() to 2096 * work as documented (expected), we may be able to 2097 * avoid a complete restart. Note that we've already 2098 * marked both the start and stop 'in-progress' flags, 2099 * so it is ok to go ahead and do this restart. 2100 */ 2101 ibd_undo_start(state, LINK_STATE_DOWN); 2102 if ((ret = ibd_start(state)) != 0) { 2103 DPRINT(10, "ibd_restart: cannot restart, " 2104 "ret=%d", ret); 2105 } 2106 2107 goto link_mod_return; 2108 } else { 2109 new_link_state = LINK_STATE_DOWN; 2110 } 2111 } 2112 2113 update_link_state: 2114 if (port_infop) { 2115 ibt_free_portinfo(port_infop, port_infosz); 2116 } 2117 2118 /* 2119 * If the old state is the same as the new state, nothing to do 2120 */ 2121 if (state->id_link_state == new_link_state) { 2122 mutex_exit(&state->id_link_mutex); 2123 goto link_mod_return; 2124 } 2125 2126 /* 2127 * Ok, so there was a link state change; see if it's safe to ask 2128 * the async thread to do the work 2129 */ 2130 if (!ibd_async_safe(state)) { 2131 state->id_link_state = new_link_state; 2132 mutex_exit(&state->id_link_mutex); 2133 goto link_mod_return; 2134 } 2135 2136 mutex_exit(&state->id_link_mutex); 2137 2138 /* 2139 * If we're reporting a link up, check InitTypeReply to see if 2140 * the SM has ensured that the port's presence in mcg, traps, 2141 * etc. is intact. 
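/*
 * Illustrative sketch (not driver code): the InitTypeReply test above
 * reduces to one question -- if the SM asserted neither NoLoadReply nor
 * PreserveContentReply, nothing can be assumed about the reloaded port
 * attributes and the cached GID/P_Key must be re-verified. Expressed as
 * a pure function; the EX_* bit values are placeholders, not the
 * SM_INIT_TYPE_* definitions.
 */
#include <stdbool.h>
#include <stdint.h>

#define	EX_NO_LOAD_REPLY		0x1	/* placeholder bit */
#define	EX_PRESERVE_CONTENT_REPLY	0x2	/* placeholder bit */

static bool
ex_must_revalidate(uint8_t itreply)
{
	return ((itreply & EX_NO_LOAD_REPLY) == 0 &&
	    (itreply & EX_PRESERVE_CONTENT_REPLY) == 0);
}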
2142 */ 2143 if (new_link_state == LINK_STATE_DOWN) { 2144 opcode = IBD_LINK_DOWN; 2145 } else { 2146 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2147 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2148 opcode = IBD_LINK_UP; 2149 } else { 2150 opcode = IBD_LINK_UP_ABSENT; 2151 } 2152 } 2153 2154 /* 2155 * Queue up a request for ibd_async_link() to handle this link 2156 * state change event 2157 */ 2158 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2159 req->rq_ptr = (void *)opcode; 2160 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2161 2162 link_mod_return: 2163 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2164 } 2165 2166 /* 2167 * For the port up/down events, IBTL guarantees there will not be concurrent 2168 * invocations of the handler. IBTL might coalesce link transition events, 2169 * and not invoke the handler for _each_ up/down transition, but it will 2170 * invoke the handler with last known state 2171 */ 2172 static void 2173 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2174 ibt_async_code_t code, ibt_async_event_t *event) 2175 { 2176 ibd_state_t *state = (ibd_state_t *)clnt_private; 2177 2178 switch (code) { 2179 case IBT_ERROR_CATASTROPHIC_CHAN: 2180 ibd_print_warn(state, "catastrophic channel error"); 2181 break; 2182 case IBT_ERROR_CQ: 2183 ibd_print_warn(state, "completion queue error"); 2184 break; 2185 case IBT_PORT_CHANGE_EVENT: 2186 /* 2187 * Events will be delivered to all instances that have 2188 * done ibt_open_hca() but not yet done ibt_close_hca(). 2189 * Only need to do work for our port; IBTF will deliver 2190 * events for other ports on the hca we have ibt_open_hca'ed 2191 * too. Note that id_port is initialized in ibd_attach() 2192 * before we do an ibt_open_hca() in ibd_attach(). 2193 */ 2194 ASSERT(state->id_hca_hdl == hca_hdl); 2195 if (state->id_port != event->ev_port) 2196 break; 2197 2198 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2199 IBT_PORT_CHANGE_PKEY) { 2200 ibd_link_mod(state, code); 2201 } 2202 break; 2203 case IBT_ERROR_PORT_DOWN: 2204 case IBT_CLNT_REREG_EVENT: 2205 case IBT_EVENT_PORT_UP: 2206 /* 2207 * Events will be delivered to all instances that have 2208 * done ibt_open_hca() but not yet done ibt_close_hca(). 2209 * Only need to do work for our port; IBTF will deliver 2210 * events for other ports on the hca we have ibt_open_hca'ed 2211 * too. Note that id_port is initialized in ibd_attach() 2212 * before we do an ibt_open_hca() in ibd_attach(). 2213 */ 2214 ASSERT(state->id_hca_hdl == hca_hdl); 2215 if (state->id_port != event->ev_port) 2216 break; 2217 2218 ibd_link_mod(state, code); 2219 break; 2220 2221 case IBT_HCA_ATTACH_EVENT: 2222 case IBT_HCA_DETACH_EVENT: 2223 /* 2224 * When a new card is plugged to the system, attach_event is 2225 * invoked. Additionally, a cfgadm needs to be run to make the 2226 * card known to the system, and an ifconfig needs to be run to 2227 * plumb up any ibd interfaces on the card. In the case of card 2228 * unplug, a cfgadm is run that will trigger any RCM scripts to 2229 * unplumb the ibd interfaces on the card; when the card is 2230 * actually unplugged, the detach_event is invoked; 2231 * additionally, if any ibd instances are still active on the 2232 * card (eg there were no associated RCM scripts), driver's 2233 * detach routine is invoked. 
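/*
 * Illustrative sketch (not driver code): the opcode selection above
 * reports DOWN as-is, and for UP asks whether the SM preserved the
 * port's subnet presence; if not, the "absent" flavor is queued so the
 * async thread redoes the membership work. Hypothetical names
 * (ex_link_op_t, EX_PRESERVE_PRESENCE) are used below.
 */
#include <stdbool.h>
#include <stdint.h>

#define	EX_PRESERVE_PRESENCE	0x4	/* placeholder bit */

typedef enum { EX_LINK_DOWN, EX_LINK_UP, EX_LINK_UP_ABSENT } ex_link_op_t;

static ex_link_op_t
ex_pick_opcode(bool link_up, uint8_t itreply)
{
	if (!link_up)
		return (EX_LINK_DOWN);
	return ((itreply & EX_PRESERVE_PRESENCE) != 0 ?
	    EX_LINK_UP : EX_LINK_UP_ABSENT);
}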
2234 */ 2235 break; 2236 default: 2237 break; 2238 } 2239 } 2240 2241 static int 2242 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2243 { 2244 mac_register_t *macp; 2245 int ret; 2246 2247 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2248 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2249 return (DDI_FAILURE); 2250 } 2251 2252 /* 2253 * Note that when we register with mac during attach, we don't 2254 * have the id_macaddr yet, so we'll simply be registering a 2255 * zero macaddr that we'll overwrite later during plumb (in 2256 * ibd_m_start()). Similar is the case with id_mtu - we'll 2257 * update the mac layer with the correct mtu during plumb. 2258 */ 2259 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2260 macp->m_driver = state; 2261 macp->m_dip = dip; 2262 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2263 macp->m_callbacks = &ibd_m_callbacks; 2264 macp->m_min_sdu = 0; 2265 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2266 2267 /* 2268 * Register ourselves with the GLDv3 interface 2269 */ 2270 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2271 mac_free(macp); 2272 DPRINT(10, 2273 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2274 return (DDI_FAILURE); 2275 } 2276 2277 mac_free(macp); 2278 return (DDI_SUCCESS); 2279 } 2280 2281 static int 2282 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2283 { 2284 ibt_hca_attr_t hca_attrs; 2285 ibt_status_t ibt_status; 2286 2287 /* 2288 * Query the HCA and fetch its attributes 2289 */ 2290 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2291 ASSERT(ibt_status == IBT_SUCCESS); 2292 2293 /* 2294 * 1. Set the Hardware Checksum capability. Currently we only consider 2295 * full checksum offload. 2296 */ 2297 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2298 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2299 } 2300 2301 /* 2302 * 2. Set LSO policy, capability and maximum length 2303 */ 2304 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2305 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2306 state->id_lso_policy = B_TRUE; 2307 } else { 2308 state->id_lso_policy = B_FALSE; 2309 } 2310 2311 if (hca_attrs.hca_max_lso_size > 0) { 2312 state->id_lso_capable = B_TRUE; 2313 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2314 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2315 else 2316 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2317 } else { 2318 state->id_lso_capable = B_FALSE; 2319 state->id_lso_maxlen = 0; 2320 } 2321 2322 /* 2323 * 3. Set Reserved L_Key capability 2324 */ 2325 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2326 state->id_hca_res_lkey_capab = 1; 2327 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2328 } 2329 2330 /* 2331 * 4. Set maximum sqseg value after checking to see if extended sgl 2332 * size information is provided by the hca 2333 */ 2334 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2335 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2336 } else { 2337 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2338 } 2339 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2340 state->id_max_sqseg = IBD_MAX_SQSEG; 2341 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2342 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2343 state->id_max_sqseg, IBD_MAX_SQSEG); 2344 } 2345 2346 /* 2347 * 5. 
Set number of recv and send wqes after checking hca maximum 2348 * channel size 2349 */ 2350 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2351 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2352 } else { 2353 state->id_num_rwqe = IBD_NUM_RWQE; 2354 } 2355 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2356 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2357 } else { 2358 state->id_num_swqe = IBD_NUM_SWQE; 2359 } 2360 2361 return (DDI_SUCCESS); 2362 } 2363 2364 static int 2365 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2366 { 2367 int instance; 2368 uint32_t progress = state->id_mac_state; 2369 ibt_status_t ret; 2370 2371 if (progress & IBD_DRV_MAC_REGISTERED) { 2372 (void) mac_unregister(state->id_mh); 2373 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2374 } 2375 2376 if (progress & IBD_DRV_PD_ALLOCD) { 2377 if ((ret = ibt_free_pd(state->id_hca_hdl, 2378 state->id_pd_hdl)) != IBT_SUCCESS) { 2379 ibd_print_warn(state, "failed to free " 2380 "protection domain, ret=%d", ret); 2381 } 2382 state->id_pd_hdl = NULL; 2383 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2384 } 2385 2386 if (progress & IBD_DRV_HCA_OPENED) { 2387 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2388 IBT_SUCCESS) { 2389 ibd_print_warn(state, "failed to close " 2390 "HCA device, ret=%d", ret); 2391 } 2392 state->id_hca_hdl = NULL; 2393 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2394 } 2395 2396 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2397 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2398 ibd_print_warn(state, 2399 "ibt_detach() failed, ret=%d", ret); 2400 } 2401 state->id_ibt_hdl = NULL; 2402 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2403 } 2404 2405 if (progress & IBD_DRV_TXINTR_ADDED) { 2406 ddi_remove_softintr(state->id_tx); 2407 state->id_tx = NULL; 2408 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2409 } 2410 2411 if (progress & IBD_DRV_RXINTR_ADDED) { 2412 ddi_remove_softintr(state->id_rx); 2413 state->id_rx = NULL; 2414 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2415 } 2416 2417 if (progress & IBD_DRV_STATE_INITIALIZED) { 2418 ibd_state_fini(state); 2419 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2420 } 2421 2422 instance = ddi_get_instance(dip); 2423 ddi_soft_state_free(ibd_list, instance); 2424 2425 return (DDI_SUCCESS); 2426 } 2427 2428 /* 2429 * Attach device to the IO framework. 
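/*
 * Illustrative sketch (not driver code): ibd_unattach() above depends on
 * a simple pattern -- each init stage that succeeds sets a bit in
 * id_mac_state, and teardown undoes only the stages whose bits are set,
 * clearing each bit as it goes, so one routine serves both attach
 * failure and detach. A generic version with hypothetical stage bits
 * and stubbed undo routines:
 */
#include <stdint.h>

#define	EX_ST_MUTEXES	0x01
#define	EX_ST_INTR	0x02
#define	EX_ST_HW_OPEN	0x04

static void ex_undo_mutexes(void) { }
static void ex_undo_intr(void) { }
static void ex_undo_hw_open(void) { }

static void
ex_unattach(uint32_t *progressp)
{
	/* Undo in the reverse of init order, most recent stage first */
	if (*progressp & EX_ST_HW_OPEN) {
		ex_undo_hw_open();
		*progressp &= ~EX_ST_HW_OPEN;
	}
	if (*progressp & EX_ST_INTR) {
		ex_undo_intr();
		*progressp &= ~EX_ST_INTR;
	}
	if (*progressp & EX_ST_MUTEXES) {
		ex_undo_mutexes();
		*progressp &= ~EX_ST_MUTEXES;
	}
}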
2430 */ 2431 static int 2432 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2433 { 2434 ibd_state_t *state = NULL; 2435 ib_guid_t hca_guid; 2436 int instance; 2437 ibt_status_t ret; 2438 int rv; 2439 2440 /* 2441 * IBD doesn't support suspend/resume 2442 */ 2443 if (cmd != DDI_ATTACH) 2444 return (DDI_FAILURE); 2445 2446 /* 2447 * Allocate softstate structure 2448 */ 2449 instance = ddi_get_instance(dip); 2450 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2451 return (DDI_FAILURE); 2452 state = ddi_get_soft_state(ibd_list, instance); 2453 2454 /* 2455 * Initialize mutexes and condition variables 2456 */ 2457 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2458 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2459 goto attach_fail; 2460 } 2461 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2462 2463 /* 2464 * Allocate rx,tx softintr 2465 */ 2466 if (ibd_rx_softintr == 1) { 2467 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2468 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2469 DPRINT(10, "ibd_attach: failed in " 2470 "ddi_add_softintr(id_rx), ret=%d", rv); 2471 goto attach_fail; 2472 } 2473 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2474 } 2475 if (ibd_tx_softintr == 1) { 2476 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2477 NULL, NULL, ibd_tx_recycle, 2478 (caddr_t)state)) != DDI_SUCCESS) { 2479 DPRINT(10, "ibd_attach: failed in " 2480 "ddi_add_softintr(id_tx), ret=%d", rv); 2481 goto attach_fail; 2482 } 2483 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2484 } 2485 2486 /* 2487 * Obtain IBA P_Key, port number and HCA guid and validate 2488 * them (for P_Key, only full members are allowed as per 2489 * IPoIB specification; neither port number nor HCA guid 2490 * can be zero) 2491 */ 2492 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2493 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2494 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2495 state->id_pkey); 2496 goto attach_fail; 2497 } 2498 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2499 "port-number", 0)) == 0) { 2500 DPRINT(10, "ibd_attach: invalid port number (%d)", 2501 state->id_port); 2502 goto attach_fail; 2503 } 2504 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2505 "hca-guid", 0)) == 0) { 2506 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2507 hca_guid); 2508 goto attach_fail; 2509 } 2510 2511 /* 2512 * Attach to IBTL 2513 */ 2514 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2515 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2516 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2517 goto attach_fail; 2518 } 2519 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2520 2521 /* 2522 * Open the HCA 2523 */ 2524 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2525 &state->id_hca_hdl)) != IBT_SUCCESS) { 2526 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2527 goto attach_fail; 2528 } 2529 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2530 2531 /* 2532 * Record capabilities 2533 */ 2534 (void) ibd_record_capab(state, dip); 2535 2536 /* 2537 * Allocate a protection domain on the HCA 2538 */ 2539 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2540 &state->id_pd_hdl)) != IBT_SUCCESS) { 2541 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2542 goto attach_fail; 2543 } 2544 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2545 2546 2547 /* 2548 * Register ibd interfaces with the Nemo framework 2549 */ 2550 if 
(ibd_register_mac(state, dip) != IBT_SUCCESS) { 2551 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2552 goto attach_fail; 2553 } 2554 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2555 2556 /* 2557 * We're done with everything we could to make the attach 2558 * succeed. All the buffer allocations and IPoIB broadcast 2559 * group joins are deferred to when the interface instance 2560 * is actually plumbed to avoid wasting memory. 2561 */ 2562 return (DDI_SUCCESS); 2563 2564 attach_fail: 2565 (void) ibd_unattach(state, dip); 2566 return (DDI_FAILURE); 2567 } 2568 2569 /* 2570 * Detach device from the IO framework. 2571 */ 2572 static int 2573 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2574 { 2575 ibd_state_t *state; 2576 int instance; 2577 2578 /* 2579 * IBD doesn't support suspend/resume 2580 */ 2581 if (cmd != DDI_DETACH) 2582 return (DDI_FAILURE); 2583 2584 /* 2585 * Get the instance softstate 2586 */ 2587 instance = ddi_get_instance(dip); 2588 state = ddi_get_soft_state(ibd_list, instance); 2589 2590 /* 2591 * Release all resources we're holding still. Note that if we'd 2592 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2593 * so far, we should find all the flags we need in id_mac_state. 2594 */ 2595 (void) ibd_unattach(state, dip); 2596 2597 return (DDI_SUCCESS); 2598 } 2599 2600 /* 2601 * Pre ibt_attach() driver initialization 2602 */ 2603 static int 2604 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2605 { 2606 char buf[64]; 2607 2608 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2609 state->id_link_state = LINK_STATE_UNKNOWN; 2610 2611 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2612 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2613 state->id_trap_stop = B_TRUE; 2614 state->id_trap_inprog = 0; 2615 2616 mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2617 state->id_dip = dip; 2618 2619 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2620 2621 state->id_tx_list.dl_head = NULL; 2622 state->id_tx_list.dl_tail = NULL; 2623 state->id_tx_list.dl_pending_sends = B_FALSE; 2624 state->id_tx_list.dl_cnt = 0; 2625 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2626 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2627 state->id_tx_busy = 0; 2628 2629 state->id_rx_list.dl_head = NULL; 2630 state->id_rx_list.dl_tail = NULL; 2631 state->id_rx_list.dl_bufs_outstanding = 0; 2632 state->id_rx_list.dl_cnt = 0; 2633 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2634 mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); 2635 2636 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2637 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2638 0, NULL, NULL, NULL, NULL, NULL, 0); 2639 2640 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2641 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2642 2643 return (DDI_SUCCESS); 2644 } 2645 2646 /* 2647 * Post ibt_detach() driver deconstruction 2648 */ 2649 static void 2650 ibd_state_fini(ibd_state_t *state) 2651 { 2652 cv_destroy(&state->id_macst_cv); 2653 mutex_destroy(&state->id_macst_lock); 2654 2655 kmem_cache_destroy(state->id_req_kmc); 2656 2657 mutex_destroy(&state->id_rxpost_lock); 2658 mutex_destroy(&state->id_rx_list.dl_mutex); 2659 2660 mutex_destroy(&state->id_txpost_lock); 2661 mutex_destroy(&state->id_tx_list.dl_mutex); 2662 2663 mutex_destroy(&state->id_sched_lock); 2664 mutex_destroy(&state->id_cq_poll_lock); 2665 2666 
cv_destroy(&state->id_trap_cv); 2667 mutex_destroy(&state->id_trap_lock); 2668 mutex_destroy(&state->id_link_mutex); 2669 } 2670 2671 /* 2672 * Fetch link speed from SA for snmp ifspeed reporting. 2673 */ 2674 static uint64_t 2675 ibd_get_portspeed(ibd_state_t *state) 2676 { 2677 int ret; 2678 ibt_path_info_t path; 2679 ibt_path_attr_t path_attr; 2680 uint8_t num_paths; 2681 uint64_t ifspeed; 2682 2683 /* 2684 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2685 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2686 * 2000000000. Start with that as default. 2687 */ 2688 ifspeed = 2000000000; 2689 2690 bzero(&path_attr, sizeof (path_attr)); 2691 2692 /* 2693 * Get the port speed from Loopback path information. 2694 */ 2695 path_attr.pa_dgids = &state->id_sgid; 2696 path_attr.pa_num_dgids = 1; 2697 path_attr.pa_sgid = state->id_sgid; 2698 2699 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2700 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2701 goto earlydone; 2702 2703 if (num_paths < 1) 2704 goto earlydone; 2705 2706 /* 2707 * In case SA does not return an expected value, report the default 2708 * speed as 1X. 2709 */ 2710 ret = 1; 2711 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2712 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2713 ret = 1; 2714 break; 2715 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2716 ret = 4; 2717 break; 2718 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2719 ret = 12; 2720 break; 2721 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2722 ret = 2; 2723 break; 2724 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2725 ret = 8; 2726 break; 2727 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2728 ret = 16; 2729 break; 2730 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2731 ret = 24; 2732 break; 2733 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2734 ret = 32; 2735 break; 2736 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2737 ret = 48; 2738 break; 2739 } 2740 2741 ifspeed *= ret; 2742 2743 earlydone: 2744 return (ifspeed); 2745 } 2746 2747 /* 2748 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2749 * representing the input mcg mgid. 2750 */ 2751 static ibd_mce_t * 2752 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2753 { 2754 ibd_mce_t *ptr = list_head(mlist); 2755 2756 /* 2757 * Do plain linear search. 2758 */ 2759 while (ptr != NULL) { 2760 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2761 sizeof (ib_gid_t)) == 0) 2762 return (ptr); 2763 ptr = list_next(mlist, ptr); 2764 } 2765 return (NULL); 2766 } 2767 2768 /* 2769 * Execute IBA JOIN. 2770 */ 2771 static ibt_status_t 2772 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2773 { 2774 ibt_mcg_attr_t mcg_attr; 2775 2776 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2777 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2778 mcg_attr.mc_mgid = mgid; 2779 mcg_attr.mc_join_state = mce->mc_jstate; 2780 mcg_attr.mc_scope = state->id_scope; 2781 mcg_attr.mc_pkey = state->id_pkey; 2782 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2783 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2784 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2785 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2786 NULL, NULL)); 2787 } 2788 2789 /* 2790 * This code JOINs the port in the proper way (depending on the join 2791 * state) so that IBA fabric will forward mcg packets to/from the port. 2792 * It also attaches the QPN to the mcg so it can receive those mcg 2793 * packets. 
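/*
 * Illustrative sketch (not driver code): ibd_get_portspeed() above
 * reports ifspeed as the 1X SDR data rate (2 Gb/s once 8b/10b encoding
 * is accounted for) scaled by a multiplier derived from the static rate
 * the SA returned; e.g. 4 for 4X SDR, 8 for 4X DDR. The arithmetic,
 * taking the multiplier directly instead of decoding the IBT_SRATE_*
 * codes:
 */
#include <stdint.h>

#define	EX_SDR_1X_BPS	2000000000ULL	/* 1X SDR data rate, bits/sec */

static uint64_t
ex_ifspeed(unsigned int multiplier)
{
	/* ex_ifspeed(4) == 8000000000, matching a 10 Gb/s 4X SDR link */
	return (EX_SDR_1X_BPS * (uint64_t)multiplier);
}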
This code makes sure not to attach the mcg to the QP if 2794 * that has been previously done due to the mcg being joined with a 2795 * different join state, even though this is not required by SWG_0216, 2796 * refid 3610. 2797 */ 2798 static ibd_mce_t * 2799 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2800 { 2801 ibt_status_t ibt_status; 2802 ibd_mce_t *mce, *tmce, *omce = NULL; 2803 boolean_t do_attach = B_TRUE; 2804 2805 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2806 jstate, mgid.gid_prefix, mgid.gid_guid); 2807 2808 /* 2809 * For enable_multicast Full member joins, we need to do some 2810 * extra work. If there is already an mce on the list that 2811 * indicates full membership, that means the membership has 2812 * not yet been dropped (since the disable_multicast was issued) 2813 * because there are pending Tx's to the mcg; in that case, just 2814 * mark the mce not to be reaped when the Tx completion queues 2815 * an async reap operation. 2816 * 2817 * If there is already an mce on the list indicating sendonly 2818 * membership, try to promote to full membership. Be careful 2819 * not to deallocate the old mce, since there might be an AH 2820 * pointing to it; instead, update the old mce with new data 2821 * that tracks the full membership. 2822 */ 2823 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2824 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2825 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2826 ASSERT(omce->mc_fullreap); 2827 omce->mc_fullreap = B_FALSE; 2828 return (omce); 2829 } else { 2830 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2831 } 2832 } 2833 2834 /* 2835 * Allocate the ibd_mce_t to track this JOIN. 2836 */ 2837 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2838 mce->mc_fullreap = B_FALSE; 2839 mce->mc_jstate = jstate; 2840 2841 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2842 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2843 ibt_status); 2844 kmem_free(mce, sizeof (ibd_mce_t)); 2845 return (NULL); 2846 } 2847 2848 /* 2849 * Is an IBA attach required? Not if the interface is already joined 2850 * to the mcg in a different appropriate join state. 2851 */ 2852 if (jstate == IB_MC_JSTATE_NON) { 2853 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2854 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2855 do_attach = B_FALSE; 2856 } else if (jstate == IB_MC_JSTATE_FULL) { 2857 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2858 do_attach = B_FALSE; 2859 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2860 do_attach = B_FALSE; 2861 } 2862 2863 if (do_attach) { 2864 /* 2865 * Do the IBA attach. 2866 */ 2867 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2868 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2869 &mce->mc_info)) != IBT_SUCCESS) { 2870 DPRINT(10, "ibd_join_group : failed qp attachment " 2871 "%d\n", ibt_status); 2872 /* 2873 * NOTE that we should probably preserve the join info 2874 * in the list and later try to leave again at detach 2875 * time. 2876 */ 2877 (void) ibt_leave_mcg(state->id_sgid, mgid, 2878 state->id_sgid, jstate); 2879 kmem_free(mce, sizeof (ibd_mce_t)); 2880 return (NULL); 2881 } 2882 } 2883 2884 /* 2885 * Insert the ibd_mce_t in the proper list. 2886 */ 2887 if (jstate == IB_MC_JSTATE_NON) { 2888 IBD_MCACHE_INSERT_NON(state, mce); 2889 } else { 2890 /* 2891 * Set up the mc_req fields used for reaping the 2892 * mcg in case of delayed tx completion (see 2893 * ibd_tx_cleanup()). 
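/*
 * Illustrative sketch (not driver code): the do_attach logic above keeps
 * the QP attached to a given mcg at most once even when the interface
 * holds the group under more than one join state -- a NON join skips the
 * attach if a FULL entry already exists, a FULL join skips it if a NON
 * entry exists, and a SendOnlyNonMember join never attaches (it is used
 * only for transmit). As a pure function with hypothetical enums:
 */
#include <stdbool.h>

typedef enum { EX_JSTATE_FULL, EX_JSTATE_NON, EX_JSTATE_SEND_ONLY } ex_jstate_t;

static bool
ex_need_qp_attach(ex_jstate_t jstate, bool have_full, bool have_non)
{
	switch (jstate) {
	case EX_JSTATE_NON:
		return (!have_full);
	case EX_JSTATE_FULL:
		return (!have_non);
	case EX_JSTATE_SEND_ONLY:
	default:
		return (false);
	}
}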
Also done for sendonly join in 2894 * case we are promoted to fullmembership later and 2895 * keep using the same mce. 2896 */ 2897 mce->mc_req.rq_gid = mgid; 2898 mce->mc_req.rq_ptr = mce; 2899 /* 2900 * Check whether this is the case of trying to join as a 2901 * full member when we had already joined send-only. 2902 * We try to drop our SendOnly membership, but it is 2903 * possible that the mcg does not exist anymore (and 2904 * the subnet trap never reached us), so the leave 2905 * operation might fail. 2906 */ 2907 if (omce != NULL) { 2908 (void) ibt_leave_mcg(state->id_sgid, mgid, 2909 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2910 omce->mc_jstate = IB_MC_JSTATE_FULL; 2911 bcopy(&mce->mc_info, &omce->mc_info, 2912 sizeof (ibt_mcg_info_t)); 2913 kmem_free(mce, sizeof (ibd_mce_t)); 2914 return (omce); 2915 } 2916 mutex_enter(&state->id_mc_mutex); 2917 IBD_MCACHE_INSERT_FULL(state, mce); 2918 mutex_exit(&state->id_mc_mutex); 2919 } 2920 2921 return (mce); 2922 } 2923 2924 /* 2925 * Called during port up event handling to attempt to reacquire full 2926 * membership to an mcg. Stripped down version of ibd_join_group(). 2927 * Note that it is possible that the mcg might have gone away, and 2928 * gets recreated at this point. 2929 */ 2930 static void 2931 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2932 { 2933 ib_gid_t mgid; 2934 2935 /* 2936 * If the mc_fullreap flag is set, or this join fails, a subsequent 2937 * reap/leave is going to try to leave the group. We could prevent 2938 * that by adding a boolean flag into ibd_mce_t, if required. 2939 */ 2940 if (mce->mc_fullreap) 2941 return; 2942 2943 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2944 2945 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2946 mgid.gid_guid); 2947 2948 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2949 ibd_print_warn(state, "Failure on port up to rejoin " 2950 "multicast gid %016llx:%016llx", 2951 (u_longlong_t)mgid.gid_prefix, 2952 (u_longlong_t)mgid.gid_guid); 2953 } 2954 2955 /* 2956 * This code handles delayed Tx completion cleanups for mcg's to which 2957 * disable_multicast has been issued, regular mcg-related cleanups during 2958 * disable_multicast, disable_promiscuous and mcg traps, as well as 2959 * cleanups during driver detach time. Depending on the join state, 2960 * it deletes the mce from the appropriate list and issues the IBA 2961 * leave/detach; except in the disable_multicast case when the mce 2962 * is left on the active list for a subsequent Tx completion cleanup. 2963 */ 2964 static void 2965 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2966 uint8_t jstate) 2967 { 2968 ibd_mce_t *tmce; 2969 boolean_t do_detach = B_TRUE; 2970 2971 /* 2972 * Before detaching, we must check whether the other list 2973 * contains the mcg; if we detach blindly, the consumer 2974 * who set up the other list will also stop receiving 2975 * traffic. 2976 */ 2977 if (jstate == IB_MC_JSTATE_FULL) { 2978 /* 2979 * The following check is only relevant while coming 2980 * from the Tx completion path in the reap case.
2981 */ 2982 if (!mce->mc_fullreap) 2983 return; 2984 mutex_enter(&state->id_mc_mutex); 2985 IBD_MCACHE_PULLOUT_FULL(state, mce); 2986 mutex_exit(&state->id_mc_mutex); 2987 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2988 do_detach = B_FALSE; 2989 } else if (jstate == IB_MC_JSTATE_NON) { 2990 IBD_MCACHE_PULLOUT_NON(state, mce); 2991 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2992 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2993 do_detach = B_FALSE; 2994 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2995 mutex_enter(&state->id_mc_mutex); 2996 IBD_MCACHE_PULLOUT_FULL(state, mce); 2997 mutex_exit(&state->id_mc_mutex); 2998 do_detach = B_FALSE; 2999 } 3000 3001 /* 3002 * If we are reacting to a mcg trap and leaving our sendonly or 3003 * non membership, the mcg is possibly already gone, so attempting 3004 * to leave might fail. On the other hand, we must try to leave 3005 * anyway, since this might be a trap from long ago, and we could 3006 * have potentially sendonly joined to a recent incarnation of 3007 * the mcg and are about to lose track of this information. 3008 */ 3009 if (do_detach) { 3010 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3011 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3012 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3013 } 3014 3015 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3016 kmem_free(mce, sizeof (ibd_mce_t)); 3017 } 3018 3019 /* 3020 * Async code executed due to multicast and promiscuous disable requests 3021 * and mcg trap handling; also executed during driver detach. Mostly, a 3022 * leave and detach is done; except for the fullmember case when Tx 3023 * requests are pending, whence arrangements are made for subsequent 3024 * cleanup on Tx completion. 3025 */ 3026 static void 3027 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3028 { 3029 ipoib_mac_t mcmac; 3030 boolean_t recycled; 3031 ibd_mce_t *mce; 3032 3033 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3034 jstate, mgid.gid_prefix, mgid.gid_guid); 3035 3036 if (jstate == IB_MC_JSTATE_NON) { 3037 recycled = B_TRUE; 3038 mce = IBD_MCACHE_FIND_NON(state, mgid); 3039 /* 3040 * In case we are handling a mcg trap, we might not find 3041 * the mcg in the non list. 3042 */ 3043 if (mce == NULL) { 3044 return; 3045 } 3046 } else { 3047 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3048 3049 /* 3050 * In case we are handling a mcg trap, make sure the trap 3051 * is not arriving late; if we have an mce that indicates 3052 * that we are already a fullmember, that would be a clear 3053 * indication that the trap arrived late (i.e., it is for a 3054 * previous incarnation of the mcg). 3055 */ 3056 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3057 if ((mce == NULL) || (mce->mc_jstate == 3058 IB_MC_JSTATE_FULL)) { 3059 return; 3060 } 3061 } else { 3062 ASSERT(jstate == IB_MC_JSTATE_FULL); 3063 3064 /* 3065 * If the join-group attempt failed, mce will be NULL here. 3066 * This is because in a GLDv3 driver, the set-multicast 3067 * entry point always returns success. 3068 */ 3069 if (mce == NULL) { 3070 return; 3071 } 3072 3073 mce->mc_fullreap = B_TRUE; 3074 } 3075 3076 /* 3077 * If no pending Tx's remain that reference the AH 3078 * for the mcg, recycle it from active to free list.
3079 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3080 * so the last completing Tx will cause an async reap 3081 * operation to be invoked, at which time we will drop our 3082 * membership to the mcg so that the pending Tx's complete 3083 * successfully. Refer to comments on "AH and MCE active 3084 * list manipulation" at top of this file. The lock protects 3085 * against Tx fast path and Tx cleanup code. 3086 */ 3087 mutex_enter(&state->id_ac_mutex); 3088 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3089 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3090 IB_MC_JSTATE_SEND_ONLY_NON)); 3091 mutex_exit(&state->id_ac_mutex); 3092 } 3093 3094 if (recycled) { 3095 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3096 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3097 ibd_async_reap_group(state, mce, mgid, jstate); 3098 } 3099 } 3100 3101 /* 3102 * Find the broadcast address as defined by IPoIB; implicitly 3103 * determines the IBA scope, mtu, tclass etc of the link the 3104 * interface is going to be a member of. 3105 */ 3106 static ibt_status_t 3107 ibd_find_bgroup(ibd_state_t *state) 3108 { 3109 ibt_mcg_attr_t mcg_attr; 3110 uint_t numg; 3111 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3112 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3113 IB_MC_SCOPE_GLOBAL }; 3114 int i, mcgmtu; 3115 boolean_t found = B_FALSE; 3116 int ret; 3117 ibt_mcg_info_t mcg_info; 3118 3119 state->id_bgroup_created = B_FALSE; 3120 3121 query_bcast_grp: 3122 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3123 mcg_attr.mc_pkey = state->id_pkey; 3124 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3125 3126 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3127 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3128 3129 /* 3130 * Look for the IPoIB broadcast group. 3131 */ 3132 state->id_mgid.gid_prefix = 3133 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3134 ((uint64_t)state->id_scope << 48) | 3135 ((uint32_t)(state->id_pkey << 16))); 3136 mcg_attr.mc_mgid = state->id_mgid; 3137 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3138 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3139 found = B_TRUE; 3140 break; 3141 } 3142 } 3143 3144 if (!found) { 3145 if (ibd_create_broadcast_group) { 3146 /* 3147 * If we created the broadcast group, but failed to 3148 * find it, we can't do anything except leave the 3149 * one we created and return failure. 3150 */ 3151 if (state->id_bgroup_created) { 3152 ibd_print_warn(state, "IPoIB broadcast group " 3153 "absent. 
Unable to query after create."); 3154 goto find_bgroup_fail; 3155 } 3156 3157 /* 3158 * Create the ipoib broadcast group if it didn't exist 3159 */ 3160 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3161 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3162 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3163 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3164 mcg_attr.mc_pkey = state->id_pkey; 3165 mcg_attr.mc_flow = 0; 3166 mcg_attr.mc_sl = 0; 3167 mcg_attr.mc_tclass = 0; 3168 state->id_mgid.gid_prefix = 3169 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3170 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3171 ((uint32_t)(state->id_pkey << 16))); 3172 mcg_attr.mc_mgid = state->id_mgid; 3173 3174 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3175 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3176 ibd_print_warn(state, "IPoIB broadcast group " 3177 "absent, create failed: ret = %d\n", ret); 3178 state->id_bgroup_created = B_FALSE; 3179 return (IBT_FAILURE); 3180 } 3181 state->id_bgroup_created = B_TRUE; 3182 goto query_bcast_grp; 3183 } else { 3184 ibd_print_warn(state, "IPoIB broadcast group absent"); 3185 return (IBT_FAILURE); 3186 } 3187 } 3188 3189 /* 3190 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3191 */ 3192 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3193 if (state->id_mtu < mcgmtu) { 3194 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3195 "greater than port's maximum MTU %d", mcgmtu, 3196 state->id_mtu); 3197 ibt_free_mcg_info(state->id_mcinfo, 1); 3198 goto find_bgroup_fail; 3199 } 3200 state->id_mtu = mcgmtu; 3201 3202 return (IBT_SUCCESS); 3203 3204 find_bgroup_fail: 3205 if (state->id_bgroup_created) { 3206 (void) ibt_leave_mcg(state->id_sgid, 3207 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3208 IB_MC_JSTATE_FULL); 3209 } 3210 3211 return (IBT_FAILURE); 3212 } 3213 3214 static int 3215 ibd_alloc_tx_copybufs(ibd_state_t *state) 3216 { 3217 ibt_mr_attr_t mem_attr; 3218 3219 /* 3220 * Allocate one big chunk for all regular tx copy bufs 3221 */ 3222 state->id_tx_buf_sz = state->id_mtu; 3223 if (state->id_lso_policy && state->id_lso_capable && 3224 (IBD_TX_BUF_SZ > state->id_mtu)) { 3225 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3226 } 3227 3228 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3229 state->id_tx_buf_sz, KM_SLEEP); 3230 3231 /* 3232 * Do one memory registration on the entire txbuf area 3233 */ 3234 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3235 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3236 mem_attr.mr_as = NULL; 3237 mem_attr.mr_flags = IBT_MR_SLEEP; 3238 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3239 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3240 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3241 kmem_free(state->id_tx_bufs, 3242 state->id_num_swqe * state->id_tx_buf_sz); 3243 state->id_tx_bufs = NULL; 3244 return (DDI_FAILURE); 3245 } 3246 3247 return (DDI_SUCCESS); 3248 } 3249 3250 static int 3251 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3252 { 3253 ibt_mr_attr_t mem_attr; 3254 ibd_lsobuf_t *buflist; 3255 ibd_lsobuf_t *lbufp; 3256 ibd_lsobuf_t *tail; 3257 ibd_lsobkt_t *bktp; 3258 uint8_t *membase; 3259 uint8_t *memp; 3260 uint_t memsz; 3261 int i; 3262 3263 /* 3264 * Allocate the lso bucket 3265 */ 3266 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3267 3268 /* 3269 * Allocate the entire lso memory and register it 3270 */ 3271 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3272 membase = kmem_zalloc(memsz, KM_SLEEP); 3273 3274 mem_attr.mr_vaddr = 
(uint64_t)(uintptr_t)membase; 3275 mem_attr.mr_len = memsz; 3276 mem_attr.mr_as = NULL; 3277 mem_attr.mr_flags = IBT_MR_SLEEP; 3278 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3279 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3280 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3281 kmem_free(membase, memsz); 3282 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3283 return (DDI_FAILURE); 3284 } 3285 3286 /* 3287 * Now allocate the buflist. Note that the elements in the buflist and 3288 * the buffers in the lso memory have a permanent 1-1 relation, so we 3289 * can always derive the address of a buflist entry from the address of 3290 * an lso buffer. 3291 */ 3292 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3293 KM_SLEEP); 3294 3295 /* 3296 * Set up the lso buf chain 3297 */ 3298 memp = membase; 3299 lbufp = buflist; 3300 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3301 lbufp->lb_isfree = 1; 3302 lbufp->lb_buf = memp; 3303 lbufp->lb_next = lbufp + 1; 3304 3305 tail = lbufp; 3306 3307 memp += IBD_LSO_BUFSZ; 3308 lbufp++; 3309 } 3310 tail->lb_next = NULL; 3311 3312 /* 3313 * Set up the LSO buffer information in ibd state 3314 */ 3315 bktp->bkt_bufl = buflist; 3316 bktp->bkt_free_head = buflist; 3317 bktp->bkt_mem = membase; 3318 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3319 bktp->bkt_nfree = bktp->bkt_nelem; 3320 3321 state->id_lso = bktp; 3322 3323 return (DDI_SUCCESS); 3324 } 3325 3326 /* 3327 * Statically allocate Tx buffer list(s). 3328 */ 3329 static int 3330 ibd_init_txlist(ibd_state_t *state) 3331 { 3332 ibd_swqe_t *swqe; 3333 ibt_lkey_t lkey; 3334 int i; 3335 3336 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3337 return (DDI_FAILURE); 3338 3339 if (state->id_lso_policy && state->id_lso_capable) { 3340 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3341 state->id_lso_policy = B_FALSE; 3342 } 3343 3344 /* 3345 * Allocate and setup the swqe list 3346 */ 3347 lkey = state->id_tx_mr_desc.md_lkey; 3348 for (i = 0; i < state->id_num_swqe; i++) { 3349 if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { 3350 DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); 3351 ibd_fini_txlist(state); 3352 return (DDI_FAILURE); 3353 } 3354 3355 /* add to list */ 3356 state->id_tx_list.dl_cnt++; 3357 if (state->id_tx_list.dl_head == NULL) { 3358 swqe->swqe_prev = NULL; 3359 swqe->swqe_next = NULL; 3360 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3361 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3362 } else { 3363 swqe->swqe_prev = state->id_tx_list.dl_tail; 3364 swqe->swqe_next = NULL; 3365 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3366 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3367 } 3368 } 3369 3370 return (DDI_SUCCESS); 3371 } 3372 3373 static int 3374 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3375 uint32_t *nds_p) 3376 { 3377 ibd_lsobkt_t *bktp; 3378 ibd_lsobuf_t *lbufp; 3379 ibd_lsobuf_t *nextp; 3380 ibt_lkey_t lso_lkey; 3381 uint_t frag_sz; 3382 uint_t num_needed; 3383 int i; 3384 3385 ASSERT(sgl_p != NULL); 3386 ASSERT(nds_p != NULL); 3387 ASSERT(req_sz != 0); 3388 3389 /* 3390 * Determine how many bufs we'd need for the size requested 3391 */ 3392 num_needed = req_sz / IBD_LSO_BUFSZ; 3393 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3394 num_needed++; 3395 3396 mutex_enter(&state->id_lso_lock); 3397 3398 /* 3399 * If we don't have enough lso bufs, return failure 3400 */ 3401 ASSERT(state->id_lso != NULL); 3402 bktp = state->id_lso; 3403 if (bktp->bkt_nfree < num_needed) { 3404 
mutex_exit(&state->id_lso_lock); 3405 return (-1); 3406 } 3407 3408 /* 3409 * Pick the first 'num_needed' bufs from the free list 3410 */ 3411 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3412 lbufp = bktp->bkt_free_head; 3413 for (i = 0; i < num_needed; i++) { 3414 ASSERT(lbufp->lb_isfree != 0); 3415 ASSERT(lbufp->lb_buf != NULL); 3416 3417 nextp = lbufp->lb_next; 3418 3419 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3420 sgl_p[i].ds_key = lso_lkey; 3421 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3422 3423 lbufp->lb_isfree = 0; 3424 lbufp->lb_next = NULL; 3425 3426 lbufp = nextp; 3427 } 3428 bktp->bkt_free_head = lbufp; 3429 3430 /* 3431 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3432 * to adjust the last sgl entry's length. Since we know we need atleast 3433 * one, the i-1 use below is ok. 3434 */ 3435 if (frag_sz) { 3436 sgl_p[i-1].ds_len = frag_sz; 3437 } 3438 3439 /* 3440 * Update nfree count and return 3441 */ 3442 bktp->bkt_nfree -= num_needed; 3443 3444 mutex_exit(&state->id_lso_lock); 3445 3446 *nds_p = num_needed; 3447 3448 return (0); 3449 } 3450 3451 static void 3452 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3453 { 3454 ibd_lsobkt_t *bktp; 3455 ibd_lsobuf_t *lbufp; 3456 uint8_t *lso_mem_end; 3457 uint_t ndx; 3458 int i; 3459 3460 mutex_enter(&state->id_lso_lock); 3461 3462 bktp = state->id_lso; 3463 ASSERT(bktp != NULL); 3464 3465 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3466 for (i = 0; i < nds; i++) { 3467 uint8_t *va; 3468 3469 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3470 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3471 3472 /* 3473 * Figure out the buflist element this sgl buffer corresponds 3474 * to and put it back at the head 3475 */ 3476 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3477 lbufp = bktp->bkt_bufl + ndx; 3478 3479 ASSERT(lbufp->lb_isfree == 0); 3480 ASSERT(lbufp->lb_buf == va); 3481 3482 lbufp->lb_isfree = 1; 3483 lbufp->lb_next = bktp->bkt_free_head; 3484 bktp->bkt_free_head = lbufp; 3485 } 3486 bktp->bkt_nfree += nds; 3487 3488 mutex_exit(&state->id_lso_lock); 3489 } 3490 3491 static void 3492 ibd_free_tx_copybufs(ibd_state_t *state) 3493 { 3494 /* 3495 * Unregister txbuf mr 3496 */ 3497 if (ibt_deregister_mr(state->id_hca_hdl, 3498 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3499 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3500 } 3501 state->id_tx_mr_hdl = NULL; 3502 3503 /* 3504 * Free txbuf memory 3505 */ 3506 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3507 state->id_tx_bufs = NULL; 3508 } 3509 3510 static void 3511 ibd_free_tx_lsobufs(ibd_state_t *state) 3512 { 3513 ibd_lsobkt_t *bktp; 3514 3515 mutex_enter(&state->id_lso_lock); 3516 3517 if ((bktp = state->id_lso) == NULL) { 3518 mutex_exit(&state->id_lso_lock); 3519 return; 3520 } 3521 3522 /* 3523 * First, free the buflist 3524 */ 3525 ASSERT(bktp->bkt_bufl != NULL); 3526 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3527 3528 /* 3529 * Unregister the LSO memory and free it 3530 */ 3531 ASSERT(bktp->bkt_mr_hdl != NULL); 3532 if (ibt_deregister_mr(state->id_hca_hdl, 3533 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3534 DPRINT(10, 3535 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3536 } 3537 ASSERT(bktp->bkt_mem); 3538 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3539 3540 /* 3541 * Finally free the bucket 3542 */ 3543 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3544 state->id_lso = NULL; 3545 3546 mutex_exit(&state->id_lso_lock); 3547 } 3548 3549 /* 3550 
* Free the statically allocated Tx buffer list. 3551 */ 3552 static void 3553 ibd_fini_txlist(ibd_state_t *state) 3554 { 3555 ibd_swqe_t *node; 3556 3557 /* 3558 * Free the allocated swqes 3559 */ 3560 mutex_enter(&state->id_tx_list.dl_mutex); 3561 while (state->id_tx_list.dl_head != NULL) { 3562 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3563 state->id_tx_list.dl_head = node->swqe_next; 3564 ASSERT(state->id_tx_list.dl_cnt > 0); 3565 state->id_tx_list.dl_cnt--; 3566 ibd_free_swqe(state, node); 3567 } 3568 mutex_exit(&state->id_tx_list.dl_mutex); 3569 3570 ibd_free_tx_lsobufs(state); 3571 ibd_free_tx_copybufs(state); 3572 } 3573 3574 /* 3575 * Allocate a single send wqe and register it so it is almost 3576 * ready to be posted to the hardware. 3577 */ 3578 static int 3579 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) 3580 { 3581 ibd_swqe_t *swqe; 3582 3583 swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); 3584 *wqe = swqe; 3585 3586 swqe->swqe_type = IBD_WQE_SEND; 3587 swqe->swqe_next = NULL; 3588 swqe->swqe_prev = NULL; 3589 swqe->swqe_im_mblk = NULL; 3590 3591 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3592 (state->id_tx_bufs + ndx * state->id_tx_buf_sz); 3593 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3594 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3595 3596 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3597 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3598 swqe->w_swr.wr_trans = IBT_UD_SRV; 3599 3600 /* These are set in send */ 3601 swqe->w_swr.wr_nds = 0; 3602 swqe->w_swr.wr_sgl = NULL; 3603 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3604 3605 return (DDI_SUCCESS); 3606 } 3607 3608 /* 3609 * Free an allocated send wqe. 3610 */ 3611 /*ARGSUSED*/ 3612 static void 3613 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3614 { 3615 kmem_free(swqe, sizeof (ibd_swqe_t)); 3616 } 3617 3618 /* 3619 * Post a rwqe to the hardware and add it to the Rx list. The 3620 * "recycle" parameter indicates whether an old rwqe is being 3621 * recycled, or this is a new one. 3622 */ 3623 static int 3624 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3625 { 3626 ibt_status_t ibt_status; 3627 3628 if (recycle == B_FALSE) { 3629 mutex_enter(&state->id_rx_list.dl_mutex); 3630 if (state->id_rx_list.dl_head == NULL) { 3631 rwqe->rwqe_prev = NULL; 3632 rwqe->rwqe_next = NULL; 3633 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3634 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3635 } else { 3636 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3637 rwqe->rwqe_next = NULL; 3638 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3639 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3640 } 3641 mutex_exit(&state->id_rx_list.dl_mutex); 3642 } 3643 3644 mutex_enter(&state->id_rxpost_lock); 3645 if (state->id_rx_busy) { 3646 rwqe->w_post_link = NULL; 3647 if (state->id_rx_head) 3648 *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; 3649 else 3650 state->id_rx_head = rwqe; 3651 state->id_rx_tailp = &(rwqe->w_post_link); 3652 } else { 3653 state->id_rx_busy = 1; 3654 do { 3655 mutex_exit(&state->id_rxpost_lock); 3656 3657 /* 3658 * Here we should add dl_cnt before post recv, because 3659 * we would have to make sure dl_cnt is updated before 3660 * the corresponding ibd_process_rx() is called. 
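/*
 * Illustrative sketch (not driver code): ibd_alloc_swqe() above stores
 * the wqe's own address in the 64-bit work request id, so the completion
 * handler can recover the wqe with a single cast instead of a lookup.
 * ex_wqe_t and the id field below are stand-ins for the IBTF types.
 */
#include <stdint.h>

typedef struct ex_wqe {
	uint64_t	wr_id;		/* value handed to the hardware */
	int		buf_len;	/* filled in at send time */
} ex_wqe_t;

static void
ex_prepare(ex_wqe_t *wqe)
{
	wqe->wr_id = (uint64_t)(uintptr_t)wqe;	/* self reference */
}

static ex_wqe_t *
ex_from_completion(uint64_t wc_id)
{
	return ((ex_wqe_t *)(uintptr_t)wc_id);	/* undo the cast */
}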
3661 */ 3662 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3663 3664 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3665 &rwqe->w_rwr, 1, NULL); 3666 if (ibt_status != IBT_SUCCESS) { 3667 (void) atomic_add_32_nv( 3668 &state->id_rx_list.dl_cnt, -1); 3669 ibd_print_warn(state, "ibd_post_recv: " 3670 "posting failed, ret=%d", ibt_status); 3671 return (DDI_FAILURE); 3672 } 3673 3674 mutex_enter(&state->id_rxpost_lock); 3675 rwqe = state->id_rx_head; 3676 if (rwqe) { 3677 state->id_rx_head = 3678 (ibd_rwqe_t *)(rwqe->w_post_link); 3679 } 3680 } while (rwqe); 3681 state->id_rx_busy = 0; 3682 } 3683 mutex_exit(&state->id_rxpost_lock); 3684 3685 return (DDI_SUCCESS); 3686 } 3687 3688 /* 3689 * Allocate the statically allocated Rx buffer list. 3690 */ 3691 static int 3692 ibd_init_rxlist(ibd_state_t *state) 3693 { 3694 ibd_rwqe_t *rwqe; 3695 int i; 3696 3697 for (i = 0; i < state->id_num_rwqe; i++) { 3698 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3699 ibd_fini_rxlist(state); 3700 return (DDI_FAILURE); 3701 } 3702 3703 if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { 3704 ibd_free_rwqe(state, rwqe); 3705 ibd_fini_rxlist(state); 3706 return (DDI_FAILURE); 3707 } 3708 } 3709 3710 return (DDI_SUCCESS); 3711 } 3712 3713 /* 3714 * Free the statically allocated Rx buffer list. 3715 * 3716 */ 3717 static void 3718 ibd_fini_rxlist(ibd_state_t *state) 3719 { 3720 ibd_rwqe_t *node; 3721 3722 mutex_enter(&state->id_rx_list.dl_mutex); 3723 while (state->id_rx_list.dl_head != NULL) { 3724 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3725 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3726 ASSERT(state->id_rx_list.dl_cnt > 0); 3727 state->id_rx_list.dl_cnt--; 3728 3729 ibd_free_rwqe(state, node); 3730 } 3731 mutex_exit(&state->id_rx_list.dl_mutex); 3732 } 3733 3734 /* 3735 * Allocate a single recv wqe and register it so it is almost 3736 * ready to be posted to the hardware. 
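* Each rwqe gets its own copy buffer of id_mtu + IPOIB_GRH_SIZE
* bytes, wrapped with desballoc() so that when the upper layers
* eventually free the mblk, ibd_freemsg_cb() runs and the wqe can
* be recycled (unless w_freeing_wqe indicates it is being torn down).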
3737 */ 3738 static int 3739 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3740 { 3741 ibt_mr_attr_t mem_attr; 3742 ibd_rwqe_t *rwqe; 3743 3744 if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3745 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3746 return (DDI_FAILURE); 3747 } 3748 *wqe = rwqe; 3749 rwqe->rwqe_type = IBD_WQE_RECV; 3750 rwqe->w_state = state; 3751 rwqe->rwqe_next = NULL; 3752 rwqe->rwqe_prev = NULL; 3753 rwqe->w_freeing_wqe = B_FALSE; 3754 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3755 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3756 3757 rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3758 IPOIB_GRH_SIZE, KM_NOSLEEP); 3759 if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { 3760 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3761 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3762 return (DDI_FAILURE); 3763 } 3764 3765 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3766 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3767 NULL) { 3768 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3769 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3770 state->id_mtu + IPOIB_GRH_SIZE); 3771 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3772 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3773 return (DDI_FAILURE); 3774 } 3775 3776 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3777 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3778 mem_attr.mr_as = NULL; 3779 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3780 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3781 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3782 IBT_SUCCESS) { 3783 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3784 rwqe->w_freeing_wqe = B_TRUE; 3785 freemsg(rwqe->rwqe_im_mblk); 3786 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3787 state->id_mtu + IPOIB_GRH_SIZE); 3788 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3789 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3790 return (DDI_FAILURE); 3791 } 3792 3793 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3794 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3795 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3796 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3797 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3798 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3799 rwqe->w_rwr.wr_nds = 1; 3800 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3801 3802 return (DDI_SUCCESS); 3803 } 3804 3805 /* 3806 * Free an allocated recv wqe. 3807 */ 3808 static void 3809 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3810 { 3811 if (ibt_deregister_mr(state->id_hca_hdl, 3812 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3813 DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); 3814 return; 3815 } 3816 3817 /* 3818 * Indicate to the callback function that this rwqe/mblk 3819 * should not be recycled. The freemsg() will invoke 3820 * ibd_freemsg_cb(). 3821 */ 3822 if (rwqe->rwqe_im_mblk != NULL) { 3823 rwqe->w_freeing_wqe = B_TRUE; 3824 freemsg(rwqe->rwqe_im_mblk); 3825 } 3826 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3827 state->id_mtu + IPOIB_GRH_SIZE); 3828 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3829 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3830 } 3831 3832 /* 3833 * Delete the rwqe being freed from the rx list. 
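* The entry is unlinked from the doubly-linked id_rx_list under
* dl_mutex, with the head/tail pointers patched up when the rwqe
* sits at either end of the list.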
3834 */ 3835 static void 3836 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3837 { 3838 mutex_enter(&state->id_rx_list.dl_mutex); 3839 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3840 state->id_rx_list.dl_head = rwqe->rwqe_next; 3841 else 3842 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3843 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3844 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3845 else 3846 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3847 mutex_exit(&state->id_rx_list.dl_mutex); 3848 } 3849 3850 /* 3851 * IBA Rx/Tx completion queue handler. Guaranteed to be single 3852 * threaded and nonreentrant for this CQ. When using combined CQ, 3853 * this handles Tx and Rx completions. With separate CQs, this handles 3854 * only Rx completions. 3855 */ 3856 /* ARGSUSED */ 3857 static void 3858 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3859 { 3860 ibd_state_t *state = (ibd_state_t *)arg; 3861 3862 atomic_add_64(&state->id_num_intrs, 1); 3863 3864 if (ibd_rx_softintr == 1) 3865 ddi_trigger_softintr(state->id_rx); 3866 else 3867 (void) ibd_intr((char *)state); 3868 } 3869 3870 /* 3871 * Separate CQ handler for Tx completions, when the Tx CQ is in 3872 * interrupt driven mode. 3873 */ 3874 /* ARGSUSED */ 3875 static void 3876 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3877 { 3878 ibd_state_t *state = (ibd_state_t *)arg; 3879 3880 atomic_add_64(&state->id_num_intrs, 1); 3881 3882 if (ibd_tx_softintr == 1) 3883 ddi_trigger_softintr(state->id_tx); 3884 else 3885 (void) ibd_tx_recycle((char *)state); 3886 } 3887 3888 /* 3889 * Multicast group create/delete trap handler. These will be delivered 3890 * on a kernel thread (handling can thus block) and can be invoked 3891 * concurrently. The handler can be invoked anytime after it is 3892 * registered and before ibt_detach(). 3893 */ 3894 /* ARGSUSED */ 3895 static void 3896 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 3897 ibt_subnet_event_t *event) 3898 { 3899 ibd_state_t *state = (ibd_state_t *)arg; 3900 ibd_req_t *req; 3901 3902 /* 3903 * The trap handler will get invoked once for every event for 3904 * evert port. The input "gid" is the GID0 of the port the 3905 * trap came in on; we just need to act on traps that came 3906 * to our port, meaning the port on which the ipoib interface 3907 * resides. Since ipoib uses GID0 of the port, we just match 3908 * the gids to check whether we need to handle the trap. 3909 */ 3910 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 3911 return; 3912 3913 DPRINT(10, "ibd_notices_handler : %d\n", code); 3914 3915 switch (code) { 3916 case IBT_SM_EVENT_UNAVAILABLE: 3917 /* 3918 * If we are in promiscuous mode or have 3919 * sendnonmembers, we need to print a warning 3920 * message right now. Else, just store the 3921 * information, print when we enter promiscuous 3922 * mode or attempt nonmember send. We might 3923 * also want to stop caching sendnonmember. 3924 */ 3925 ibd_print_warn(state, "IBA multicast support " 3926 "degraded due to unavailability of multicast " 3927 "traps"); 3928 break; 3929 case IBT_SM_EVENT_AVAILABLE: 3930 /* 3931 * If we printed a warning message above or 3932 * while trying to nonmember send or get into 3933 * promiscuous mode, print an okay message. 
3934 */ 3935 ibd_print_warn(state, "IBA multicast support " 3936 "restored due to availability of multicast " 3937 "traps"); 3938 break; 3939 case IBT_SM_EVENT_MCG_CREATED: 3940 case IBT_SM_EVENT_MCG_DELETED: 3941 /* 3942 * Common processing of creation/deletion traps. 3943 * First check if the instance is being 3944 * [de]initialized; back off then, without doing 3945 * anything more, since we are not sure if the 3946 * async thread is around, or whether we might 3947 * be racing with the detach code in ibd_m_stop() 3948 * that scans the mcg list. 3949 */ 3950 if (!ibd_async_safe(state)) 3951 return; 3952 3953 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 3954 req->rq_gid = event->sm_notice_gid; 3955 req->rq_ptr = (void *)code; 3956 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 3957 break; 3958 } 3959 } 3960 3961 static void 3962 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 3963 { 3964 ib_gid_t mgid = req->rq_gid; 3965 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 3966 3967 DPRINT(10, "ibd_async_trap : %d\n", code); 3968 3969 /* 3970 * Atomically search the nonmember and sendonlymember lists and 3971 * delete. 3972 */ 3973 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 3974 3975 if (state->id_prom_op == IBD_OP_COMPLETED) { 3976 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 3977 3978 /* 3979 * If in promiscuous mode, try to join/attach to the new 3980 * mcg. Given the unreliable out-of-order mode of trap 3981 * delivery, we can never be sure whether it is a problem 3982 * if the join fails. Thus, we warn the admin of a failure 3983 * if this was a creation trap. Note that the trap might 3984 * actually be reporting a long past event, and the mcg 3985 * might already have been deleted, thus we might be warning 3986 * in vain. 3987 */ 3988 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 3989 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 3990 ibd_print_warn(state, "IBA promiscuous mode missed " 3991 "new multicast gid %016llx:%016llx", 3992 (u_longlong_t)mgid.gid_prefix, 3993 (u_longlong_t)mgid.gid_guid); 3994 } 3995 3996 /* 3997 * Free the request slot allocated by the subnet event thread. 3998 */ 3999 ibd_async_done(state); 4000 } 4001 4002 /* 4003 * GLDv3 entry point to get capabilities. 4004 */ 4005 static boolean_t 4006 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4007 { 4008 ibd_state_t *state = arg; 4009 4010 switch (cap) { 4011 case MAC_CAPAB_HCKSUM: { 4012 uint32_t *txflags = cap_data; 4013 4014 /* 4015 * We either do full checksum or not do it at all 4016 */ 4017 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4018 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4019 else 4020 return (B_FALSE); 4021 break; 4022 } 4023 4024 case MAC_CAPAB_LSO: { 4025 mac_capab_lso_t *cap_lso = cap_data; 4026 4027 /* 4028 * In addition to the capability and policy, since LSO 4029 * relies on hw checksum, we'll not enable LSO if we 4030 * don't have hw checksum. Of course, if the HCA doesn't 4031 * provide the reserved lkey capability, enabling LSO will 4032 * actually affect performance adversely, so we'll disable 4033 * LSO even for that case. 
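* In short, LSO is advertised only if all three conditions below
* hold: the LSO policy and HCA capability are both set, full
* hardware checksum is available, and the HCA supports the
* reserved lkey.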
4034 */ 4035 if (!state->id_lso_policy || !state->id_lso_capable) 4036 return (B_FALSE); 4037 4038 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4039 return (B_FALSE); 4040 4041 if (state->id_hca_res_lkey_capab == 0) { 4042 ibd_print_warn(state, "no reserved-lkey capability, " 4043 "disabling LSO"); 4044 return (B_FALSE); 4045 } 4046 4047 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4048 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4049 break; 4050 } 4051 4052 default: 4053 return (B_FALSE); 4054 } 4055 4056 return (B_TRUE); 4057 } 4058 4059 static int 4060 ibd_get_port_details(ibd_state_t *state) 4061 { 4062 ibt_hca_portinfo_t *port_infop; 4063 ibt_status_t ret; 4064 uint_t psize, port_infosz; 4065 4066 mutex_enter(&state->id_link_mutex); 4067 4068 /* 4069 * Query for port information 4070 */ 4071 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4072 &port_infop, &psize, &port_infosz); 4073 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4074 mutex_exit(&state->id_link_mutex); 4075 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4076 "failed, ret=%d", ret); 4077 return (ENETDOWN); 4078 } 4079 4080 /* 4081 * If the link already went down by the time we get here, 4082 * give up 4083 */ 4084 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4085 mutex_exit(&state->id_link_mutex); 4086 ibt_free_portinfo(port_infop, port_infosz); 4087 DPRINT(10, "ibd_get_port_details: port is not active"); 4088 return (ENETDOWN); 4089 } 4090 4091 /* 4092 * If the link is active, verify the pkey 4093 */ 4094 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4095 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4096 mutex_exit(&state->id_link_mutex); 4097 ibt_free_portinfo(port_infop, port_infosz); 4098 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4099 "failed, ret=%d", ret); 4100 return (ENONET); 4101 } 4102 4103 state->id_mtu = (128 << port_infop->p_mtu); 4104 state->id_sgid = *port_infop->p_sgid_tbl; 4105 state->id_link_state = LINK_STATE_UP; 4106 4107 mutex_exit(&state->id_link_mutex); 4108 ibt_free_portinfo(port_infop, port_infosz); 4109 4110 /* 4111 * Now that the port is active, record the port speed 4112 */ 4113 state->id_link_speed = ibd_get_portspeed(state); 4114 4115 return (0); 4116 } 4117 4118 static int 4119 ibd_alloc_cqs(ibd_state_t *state) 4120 { 4121 ibt_hca_attr_t hca_attrs; 4122 ibt_cq_attr_t cq_attr; 4123 ibt_status_t ret; 4124 uint32_t real_size; 4125 4126 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4127 ASSERT(ret == IBT_SUCCESS); 4128 4129 /* 4130 * Allocate Rx/combined CQ: 4131 * Theoretically, there is no point in having more than #rwqe 4132 * plus #swqe cqe's, except that the CQ will be signalled for 4133 * overflow when the last wqe completes, if none of the previous 4134 * cqe's have been polled. Thus, we allocate just a few less wqe's 4135 * to make sure such overflow does not occur. 4136 */ 4137 cq_attr.cq_sched = NULL; 4138 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4139 4140 if (ibd_separate_cqs == 1) { 4141 /* 4142 * Allocate Receive CQ. 
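* If the HCA cannot take id_num_rwqe + 1 entries, the rwqe count
* is trimmed down to fit hca_max_cq_sz; a note about the adjusted
* value is printed at the end of this routine.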
4143 */ 4144 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4145 cq_attr.cq_size = state->id_num_rwqe + 1; 4146 } else { 4147 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4148 state->id_num_rwqe = cq_attr.cq_size - 1; 4149 } 4150 4151 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4152 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4153 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4154 "failed, ret=%d\n", ret); 4155 return (DDI_FAILURE); 4156 } 4157 4158 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4159 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4160 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4161 "moderation failed, ret=%d\n", ret); 4162 } 4163 4164 state->id_rxwcs_size = state->id_num_rwqe + 1; 4165 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4166 state->id_rxwcs_size, KM_SLEEP); 4167 4168 /* 4169 * Allocate Send CQ. 4170 */ 4171 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4172 cq_attr.cq_size = state->id_num_swqe + 1; 4173 } else { 4174 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4175 state->id_num_swqe = cq_attr.cq_size - 1; 4176 } 4177 4178 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4179 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4180 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4181 "failed, ret=%d\n", ret); 4182 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4183 state->id_rxwcs_size); 4184 (void) ibt_free_cq(state->id_rcq_hdl); 4185 return (DDI_FAILURE); 4186 } 4187 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4188 IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { 4189 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4190 "moderation failed, ret=%d\n", ret); 4191 } 4192 4193 state->id_txwcs_size = state->id_num_swqe + 1; 4194 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4195 state->id_txwcs_size, KM_SLEEP); 4196 } else { 4197 /* 4198 * Allocate combined Send/Receive CQ. 4199 */ 4200 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 4201 state->id_num_swqe + 1)) { 4202 cq_attr.cq_size = state->id_num_rwqe + 4203 state->id_num_swqe + 1; 4204 } else { 4205 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4206 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 4207 state->id_num_rwqe) / (state->id_num_rwqe + 4208 state->id_num_swqe); 4209 state->id_num_swqe = cq_attr.cq_size - 1 - 4210 state->id_num_rwqe; 4211 } 4212 4213 state->id_rxwcs_size = cq_attr.cq_size; 4214 state->id_txwcs_size = state->id_rxwcs_size; 4215 4216 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4217 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4218 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " 4219 "failed, ret=%d\n", ret); 4220 return (DDI_FAILURE); 4221 } 4222 state->id_scq_hdl = state->id_rcq_hdl; 4223 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4224 state->id_rxwcs_size, KM_SLEEP); 4225 state->id_txwcs = state->id_rxwcs; 4226 } 4227 4228 /* 4229 * Print message in case we could not allocate as many wqe's 4230 * as was requested. 
4231 */ 4232 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4233 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4234 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4235 } 4236 if (state->id_num_swqe != IBD_NUM_SWQE) { 4237 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4238 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4239 } 4240 4241 return (DDI_SUCCESS); 4242 } 4243 4244 static int 4245 ibd_setup_ud_channel(ibd_state_t *state) 4246 { 4247 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4248 ibt_ud_chan_query_attr_t ud_chan_attr; 4249 ibt_status_t ret; 4250 4251 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 4252 if (state->id_hca_res_lkey_capab) 4253 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4254 if (state->id_lso_policy && state->id_lso_capable) 4255 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4256 4257 ud_alloc_attr.ud_hca_port_num = state->id_port; 4258 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4259 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4260 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4261 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4262 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4263 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4264 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4265 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4266 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4267 ud_alloc_attr.ud_clone_chan = NULL; 4268 4269 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4270 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4271 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4272 "failed, ret=%d\n", ret); 4273 return (DDI_FAILURE); 4274 } 4275 4276 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4277 &ud_chan_attr)) != IBT_SUCCESS) { 4278 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4279 "failed, ret=%d\n", ret); 4280 (void) ibt_free_channel(state->id_chnl_hdl); 4281 return (DDI_FAILURE); 4282 } 4283 4284 state->id_qpnum = ud_chan_attr.ud_qpn; 4285 4286 return (DDI_SUCCESS); 4287 } 4288 4289 static int 4290 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4291 { 4292 uint32_t progress = state->id_mac_state; 4293 uint_t attempts; 4294 ibt_status_t ret; 4295 ib_gid_t mgid; 4296 ibd_mce_t *mce; 4297 uint8_t jstate; 4298 4299 /* 4300 * Before we try to stop/undo whatever we did in ibd_start(), 4301 * we need to mark the link state appropriately to prevent the 4302 * ip layer from using this instance for any new transfers. Note 4303 * that if the original state of the link was "up" when we're 4304 * here, we'll set the final link state to "unknown", to behave 4305 * in the same fashion as other ethernet drivers. 4306 */ 4307 mutex_enter(&state->id_link_mutex); 4308 if (cur_link_state == LINK_STATE_DOWN) { 4309 state->id_link_state = cur_link_state; 4310 } else { 4311 state->id_link_state = LINK_STATE_UNKNOWN; 4312 } 4313 mutex_exit(&state->id_link_mutex); 4314 mac_link_update(state->id_mh, state->id_link_state); 4315 4316 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4317 if (progress & IBD_DRV_STARTED) { 4318 state->id_mac_state &= (~IBD_DRV_STARTED); 4319 } 4320 4321 /* 4322 * First, stop receive interrupts; this stops the driver from 4323 * handing up buffers to higher layers. Wait for receive buffers 4324 * to be returned and give up after 5 seconds. 
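* The 5 seconds come from the loop below: up to 50 attempts with
* a 100000 microsecond (100 ms) delay between them.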
4325 */ 4326 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4327 4328 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4329 4330 attempts = 50; 4331 while (state->id_rx_list.dl_bufs_outstanding > 0) { 4332 delay(drv_usectohz(100000)); 4333 if (--attempts == 0) { 4334 /* 4335 * There are pending bufs with the network 4336 * layer and we have no choice but to wait 4337 * for them to be done with. Reap all the 4338 * Tx/Rx completions that were posted since 4339 * we turned off the notification and 4340 * return failure. 4341 */ 4342 DPRINT(2, "ibd_undo_start: " 4343 "reclaiming failed"); 4344 ibd_poll_compq(state, state->id_rcq_hdl); 4345 ibt_set_cq_handler(state->id_rcq_hdl, 4346 ibd_rcq_handler, state); 4347 return (DDI_FAILURE); 4348 } 4349 } 4350 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4351 } 4352 4353 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4354 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4355 4356 mutex_enter(&state->id_trap_lock); 4357 state->id_trap_stop = B_TRUE; 4358 while (state->id_trap_inprog > 0) 4359 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4360 mutex_exit(&state->id_trap_lock); 4361 4362 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4363 } 4364 4365 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4366 /* 4367 * Flushing the channel ensures that all pending WQE's 4368 * are marked with flush_error and handed to the CQ. It 4369 * does not guarantee the invocation of the CQ handler. 4370 * This call is guaranteed to return successfully for 4371 * UD QPNs. 4372 */ 4373 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4374 IBT_SUCCESS) { 4375 DPRINT(10, "ibd_undo_start: flush_channel " 4376 "failed, ret=%d", ret); 4377 } 4378 4379 /* 4380 * Turn off Tx interrupts and poll. By the time the polling 4381 * returns an empty indicator, we are sure we have seen all 4382 * pending Tx callbacks. Note that after the call to 4383 * ibt_set_cq_handler() returns, the old handler is 4384 * guaranteed not to be invoked anymore. 4385 */ 4386 if (ibd_separate_cqs == 1) { 4387 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4388 } 4389 ibd_poll_compq(state, state->id_scq_hdl); 4390 4391 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4392 } 4393 4394 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4395 /* 4396 * No new async requests will be posted since the device 4397 * link state has been marked as unknown; completion handlers 4398 * have been turned off, so Tx handler will not cause any 4399 * more IBD_ASYNC_REAP requests. 4400 * 4401 * Queue a request for the async thread to exit, which will 4402 * be serviced after any pending ones. This can take a while, 4403 * specially if the SM is unreachable, since IBMF will slowly 4404 * timeout each SM request issued by the async thread. Reap 4405 * the thread before continuing on, we do not want it to be 4406 * lingering in modunloaded code (or we could move the reap 4407 * to ibd_detach(), provided we keep track of the current 4408 * id_async_thrid somewhere safe). 4409 */ 4410 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4411 thread_join(state->id_async_thrid); 4412 4413 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4414 } 4415 4416 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4417 /* 4418 * Drop all residual full/non membership. This includes full 4419 * membership to the broadcast group, and any nonmembership 4420 * acquired during transmits. 
We do this after the Tx completion 4421 * handlers are done, since those might result in some late 4422 * leaves; this also eliminates a potential race with that 4423 * path wrt the mc full list insert/delete. Trap handling 4424 * has also been suppressed at this point. Thus, no locks 4425 * are required while traversing the mc full list. 4426 */ 4427 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4428 mce = list_head(&state->id_mc_full); 4429 while (mce != NULL) { 4430 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4431 jstate = mce->mc_jstate; 4432 mce = list_next(&state->id_mc_full, mce); 4433 ibd_leave_group(state, mgid, jstate); 4434 } 4435 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4436 } 4437 4438 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4439 ibd_fini_rxlist(state); 4440 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4441 } 4442 4443 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4444 ibd_fini_txlist(state); 4445 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4446 } 4447 4448 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4449 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4450 IBT_SUCCESS) { 4451 DPRINT(10, "ibd_undo_start: free_channel " 4452 "failed, ret=%d", ret); 4453 } 4454 4455 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4456 } 4457 4458 if (progress & IBD_DRV_CQS_ALLOCD) { 4459 if (ibd_separate_cqs == 1) { 4460 kmem_free(state->id_txwcs, 4461 sizeof (ibt_wc_t) * state->id_txwcs_size); 4462 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4463 IBT_SUCCESS) { 4464 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4465 "failed, ret=%d", ret); 4466 } 4467 } 4468 4469 kmem_free(state->id_rxwcs, 4470 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4471 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4472 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4473 "ret=%d", ret); 4474 } 4475 4476 state->id_txwcs = NULL; 4477 state->id_rxwcs = NULL; 4478 state->id_scq_hdl = NULL; 4479 state->id_rcq_hdl = NULL; 4480 4481 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4482 } 4483 4484 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4485 mod_hash_destroy_hash(state->id_ah_active_hash); 4486 ibd_acache_fini(state); 4487 4488 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4489 } 4490 4491 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4492 /* 4493 * If we'd created the ipoib broadcast group and had 4494 * successfully joined it, leave it now 4495 */ 4496 if (state->id_bgroup_created) { 4497 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4498 jstate = IB_MC_JSTATE_FULL; 4499 (void) ibt_leave_mcg(state->id_sgid, mgid, 4500 state->id_sgid, jstate); 4501 } 4502 ibt_free_mcg_info(state->id_mcinfo, 1); 4503 4504 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4505 } 4506 4507 return (DDI_SUCCESS); 4508 } 4509 4510 /* 4511 * These pair of routines are used to set/clear the condition that 4512 * the caller is likely to do something to change the id_mac_state. 4513 * If there's already someone doing either a start or a stop (possibly 4514 * due to the async handler detecting a pkey relocation event, a plumb 4515 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4516 * that's done. 
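* The wait uses id_macst_lock and id_macst_cv below: the setter
* blocks while an IBD_DRV_RESTART_IN_PROGRESS bit is set in
* id_mac_state, and the clearing side signals the cv after
* removing its bit.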
4517 */ 4518 static void 4519 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4520 { 4521 mutex_enter(&state->id_macst_lock); 4522 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4523 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4524 4525 state->id_mac_state |= flag; 4526 mutex_exit(&state->id_macst_lock); 4527 } 4528 4529 static void 4530 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4531 { 4532 mutex_enter(&state->id_macst_lock); 4533 state->id_mac_state &= (~flag); 4534 cv_signal(&state->id_macst_cv); 4535 mutex_exit(&state->id_macst_lock); 4536 } 4537 4538 /* 4539 * GLDv3 entry point to start hardware. 4540 */ 4541 /*ARGSUSED*/ 4542 static int 4543 ibd_m_start(void *arg) 4544 { 4545 ibd_state_t *state = arg; 4546 int ret; 4547 4548 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4549 4550 ret = ibd_start(state); 4551 4552 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4553 4554 return (ret); 4555 } 4556 4557 static int 4558 ibd_start(ibd_state_t *state) 4559 { 4560 kthread_t *kht; 4561 int err; 4562 ibt_status_t ret; 4563 4564 if (state->id_mac_state & IBD_DRV_STARTED) 4565 return (DDI_SUCCESS); 4566 4567 /* 4568 * Get port details; if we fail here, very likely the port 4569 * state is inactive or the pkey can't be verified. 4570 */ 4571 if ((err = ibd_get_port_details(state)) != 0) { 4572 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4573 goto start_fail; 4574 } 4575 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4576 4577 /* 4578 * Find the IPoIB broadcast group 4579 */ 4580 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4581 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4582 err = ENOTACTIVE; 4583 goto start_fail; 4584 } 4585 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4586 4587 /* 4588 * Initialize per-interface caches and lists; if we fail here, 4589 * it is most likely due to a lack of resources 4590 */ 4591 if (ibd_acache_init(state) != DDI_SUCCESS) { 4592 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4593 err = ENOMEM; 4594 goto start_fail; 4595 } 4596 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4597 4598 /* 4599 * Allocate send and receive completion queues 4600 */ 4601 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4602 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4603 err = ENOMEM; 4604 goto start_fail; 4605 } 4606 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4607 4608 /* 4609 * Setup a UD channel 4610 */ 4611 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4612 err = ENOMEM; 4613 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4614 goto start_fail; 4615 } 4616 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4617 4618 /* 4619 * Allocate and initialize the tx buffer list 4620 */ 4621 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4622 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4623 err = ENOMEM; 4624 goto start_fail; 4625 } 4626 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4627 4628 /* 4629 * If we have separate cqs, create the send cq handler here 4630 */ 4631 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 4632 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4633 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4634 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4635 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4636 "failed, ret=%d", ret); 4637 err = EINVAL; 4638 goto start_fail; 4639 } 4640 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4641 } 4642 4643 /* 4644 * Allocate and initialize the rx buffer list 4645 */ 4646 if 
(ibd_init_rxlist(state) != DDI_SUCCESS) { 4647 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4648 err = ENOMEM; 4649 goto start_fail; 4650 } 4651 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4652 4653 /* 4654 * Join IPoIB broadcast group 4655 */ 4656 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4657 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4658 err = ENOTACTIVE; 4659 goto start_fail; 4660 } 4661 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4662 4663 /* 4664 * Create the async thread; thread_create never fails. 4665 */ 4666 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4667 TS_RUN, minclsyspri); 4668 state->id_async_thrid = kht->t_did; 4669 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4670 4671 /* 4672 * When we did mac_register() in ibd_attach(), we didn't register 4673 * the real macaddr and we didn't have the true port mtu. Now that 4674 * we're almost ready, set the local mac address and broadcast 4675 * addresses and update gldv3 about the real values of these 4676 * parameters. 4677 */ 4678 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4679 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4680 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4681 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4682 4683 mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4684 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4685 4686 /* 4687 * Setup the receive cq handler 4688 */ 4689 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4690 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4691 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4692 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4693 "failed, ret=%d", ret); 4694 err = EINVAL; 4695 goto start_fail; 4696 } 4697 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4698 4699 /* 4700 * Setup the subnet notices handler after we've initialized the acache/ 4701 * mcache and started the async thread, both of which are required for 4702 * the trap handler to function properly. 4703 * 4704 * Now that the async thread has been started (and we've already done 4705 * a mac_register() during attach so mac_tx_update() can be called 4706 * if necessary without any problem), we can enable the trap handler 4707 * to queue requests to the async thread. 4708 */ 4709 ibt_register_subnet_notices(state->id_ibt_hdl, 4710 ibd_snet_notices_handler, state); 4711 mutex_enter(&state->id_trap_lock); 4712 state->id_trap_stop = B_FALSE; 4713 mutex_exit(&state->id_trap_lock); 4714 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4715 4716 /* 4717 * Indicate link status to GLDv3 and higher layers. By default, 4718 * we assume we are in up state (which must have been true at 4719 * least at the time the broadcast mcg's were probed); if there 4720 * were any up/down transitions till the time we come here, the 4721 * async handler will have updated last known state, which we 4722 * use to tell GLDv3. The async handler will not send any 4723 * notifications to GLDv3 till we reach here in the initialization 4724 * sequence. 4725 */ 4726 state->id_mac_state |= IBD_DRV_STARTED; 4727 mac_link_update(state->id_mh, state->id_link_state); 4728 4729 return (DDI_SUCCESS); 4730 4731 start_fail: 4732 /* 4733 * If we ran into a problem during ibd_start() and ran into 4734 * some other problem during undoing our partial work, we can't 4735 * do anything about it. Ignore any errors we might get from 4736 * ibd_undo_start() and just return the original error we got. 
4737 */
4738 (void) ibd_undo_start(state, LINK_STATE_DOWN);
4739 return (err);
4740 }
4741
4742 /*
4743 * GLDv3 entry point to stop hardware from receiving packets.
4744 */
4745 /*ARGSUSED*/
4746 static void
4747 ibd_m_stop(void *arg)
4748 {
4749 ibd_state_t *state = (ibd_state_t *)arg;
4750
4751 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4752
4753 (void) ibd_undo_start(state, state->id_link_state);
4754
4755 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4756 }
4757
4758 /*
4759 * GLDv3 entry point to modify device's mac address. We do not
4760 * allow address modifications.
4761 */
4762 static int
4763 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4764 {
4765 ibd_state_t *state = arg;
4766
4767 /*
4768 * Don't bother even comparing the macaddr if we haven't
4769 * completed ibd_m_start().
4770 */
4771 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4772 return (0);
4773
4774 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4775 return (0);
4776 else
4777 return (EINVAL);
4778 }
4779
4780 /*
4781 * The blocking part of the IBA join/leave operations are done out
4782 * of here on the async thread.
4783 */
4784 static void
4785 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4786 {
4787 DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4788 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4789
4790 if (op == IBD_ASYNC_JOIN) {
4791 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4792 ibd_print_warn(state, "Join multicast group failed :"
4793 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4794 }
4795 } else {
4796 /*
4797 * Here, we must search for the proper mcg_info and
4798 * use that to leave the group.
4799 */
4800 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4801 }
4802 }
4803
4804 /*
4805 * GLDv3 entry point for multicast enable/disable requests.
4806 * This function queues the operation to the async thread and
4807 * returns success for a valid multicast address.
4808 */
4809 static int
4810 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4811 {
4812 ibd_state_t *state = (ibd_state_t *)arg;
4813 ipoib_mac_t maddr, *mcast;
4814 ib_gid_t mgid;
4815 ibd_req_t *req;
4816
4817 /*
4818 * If we haven't completed ibd_m_start(), async thread wouldn't
4819 * have been started and id_bcaddr wouldn't be set, so there's
4820 * no point in continuing.
4821 */
4822 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4823 return (0);
4824
4825 /*
4826 * The incoming multicast address might not be aligned properly
4827 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4828 * it to look like one though, to get the offsets of the mc gid,
4829 * since we know we are not going to dereference any values with
4830 * the ipoib_mac_t pointer.
4831 */
4832 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4833 mcast = &maddr;
4834
4835 /*
4836 * Check validity of MCG address. We could additionally check
4837 * that an enable/disable is not being issued on the "broadcast"
4838 * mcg, but since this operation is only invokable by privileged
4839 * programs anyway, we allow the flexibility to those dlpi apps.
4840 * Note that we do not validate the "scope" of the IBA mcg.
4841 */
4842 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4843 return (EINVAL);
4844
4845 /*
4846 * fill in multicast pkey and scope
4847 */
4848 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4849
4850 /*
4851 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4852 * nothing (i.e.
we stay JOINed to the broadcast group done in 4853 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 4854 * requires to be joined to broadcast groups at all times. 4855 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4856 * depends on this. 4857 */ 4858 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4859 return (0); 4860 4861 ibd_n2h_gid(mcast, &mgid); 4862 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4863 if (req == NULL) 4864 return (ENOMEM); 4865 4866 req->rq_gid = mgid; 4867 4868 if (add) { 4869 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4870 mgid.gid_prefix, mgid.gid_guid); 4871 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4872 } else { 4873 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4874 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4875 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4876 } 4877 return (0); 4878 } 4879 4880 /* 4881 * The blocking part of the IBA promiscuous operations are done 4882 * out of here on the async thread. The dlpireq parameter indicates 4883 * whether this invocation is due to a dlpi request or due to 4884 * a port up/down event. 4885 */ 4886 static void 4887 ibd_async_unsetprom(ibd_state_t *state) 4888 { 4889 ibd_mce_t *mce = list_head(&state->id_mc_non); 4890 ib_gid_t mgid; 4891 4892 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4893 4894 while (mce != NULL) { 4895 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4896 mce = list_next(&state->id_mc_non, mce); 4897 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4898 } 4899 state->id_prom_op = IBD_OP_NOTSTARTED; 4900 } 4901 4902 /* 4903 * The blocking part of the IBA promiscuous operations are done 4904 * out of here on the async thread. The dlpireq parameter indicates 4905 * whether this invocation is due to a dlpi request or due to 4906 * a port up/down event. 4907 */ 4908 static void 4909 ibd_async_setprom(ibd_state_t *state) 4910 { 4911 ibt_mcg_attr_t mcg_attr; 4912 ibt_mcg_info_t *mcg_info; 4913 ib_gid_t mgid; 4914 uint_t numg; 4915 int i; 4916 char ret = IBD_OP_COMPLETED; 4917 4918 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4919 4920 /* 4921 * Obtain all active MC groups on the IB fabric with 4922 * specified criteria (scope + Pkey + Qkey + mtu). 4923 */ 4924 bzero(&mcg_attr, sizeof (mcg_attr)); 4925 mcg_attr.mc_pkey = state->id_pkey; 4926 mcg_attr.mc_scope = state->id_scope; 4927 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4928 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4929 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4930 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4931 IBT_SUCCESS) { 4932 ibd_print_warn(state, "Could not get list of IBA multicast " 4933 "groups"); 4934 ret = IBD_OP_ERRORED; 4935 goto done; 4936 } 4937 4938 /* 4939 * Iterate over the returned mcg's and join as NonMember 4940 * to the IP mcg's. 4941 */ 4942 for (i = 0; i < numg; i++) { 4943 /* 4944 * Do a NonMember JOIN on the MC group. 4945 */ 4946 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4947 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4948 ibd_print_warn(state, "IBA promiscuous mode missed " 4949 "multicast gid %016llx:%016llx", 4950 (u_longlong_t)mgid.gid_prefix, 4951 (u_longlong_t)mgid.gid_guid); 4952 } 4953 4954 ibt_free_mcg_info(mcg_info, numg); 4955 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4956 done: 4957 state->id_prom_op = ret; 4958 } 4959 4960 /* 4961 * GLDv3 entry point for multicast promiscuous enable/disable requests. 
4962 * GLDv3 assumes phys state receives more packets than multi state, 4963 * which is not true for IPoIB. Thus, treat the multi and phys 4964 * promiscuous states the same way to work with GLDv3's assumption. 4965 */ 4966 static int 4967 ibd_m_promisc(void *arg, boolean_t on) 4968 { 4969 ibd_state_t *state = (ibd_state_t *)arg; 4970 ibd_req_t *req; 4971 4972 /* 4973 * Async thread wouldn't have been started if we haven't 4974 * passed ibd_m_start() 4975 */ 4976 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4977 return (0); 4978 4979 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4980 if (req == NULL) 4981 return (ENOMEM); 4982 if (on) { 4983 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4984 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 4985 } else { 4986 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4987 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 4988 } 4989 4990 return (0); 4991 } 4992 4993 /* 4994 * GLDv3 entry point for gathering statistics. 4995 */ 4996 static int 4997 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 4998 { 4999 ibd_state_t *state = (ibd_state_t *)arg; 5000 5001 switch (stat) { 5002 case MAC_STAT_IFSPEED: 5003 *val = state->id_link_speed; 5004 break; 5005 case MAC_STAT_MULTIRCV: 5006 *val = state->id_multi_rcv; 5007 break; 5008 case MAC_STAT_BRDCSTRCV: 5009 *val = state->id_brd_rcv; 5010 break; 5011 case MAC_STAT_MULTIXMT: 5012 *val = state->id_multi_xmt; 5013 break; 5014 case MAC_STAT_BRDCSTXMT: 5015 *val = state->id_brd_xmt; 5016 break; 5017 case MAC_STAT_RBYTES: 5018 *val = state->id_rcv_bytes; 5019 break; 5020 case MAC_STAT_IPACKETS: 5021 *val = state->id_rcv_pkt; 5022 break; 5023 case MAC_STAT_OBYTES: 5024 *val = state->id_xmt_bytes; 5025 break; 5026 case MAC_STAT_OPACKETS: 5027 *val = state->id_xmt_pkt; 5028 break; 5029 case MAC_STAT_OERRORS: 5030 *val = state->id_ah_error; /* failed AH translation */ 5031 break; 5032 case MAC_STAT_IERRORS: 5033 *val = 0; 5034 break; 5035 case MAC_STAT_NOXMTBUF: 5036 *val = state->id_tx_short; 5037 break; 5038 case MAC_STAT_NORCVBUF: 5039 default: 5040 return (ENOTSUP); 5041 } 5042 5043 return (0); 5044 } 5045 5046 static void 5047 ibd_async_txsched(ibd_state_t *state) 5048 { 5049 ibd_req_t *req; 5050 int ret; 5051 5052 if (ibd_txcomp_poll) 5053 ibd_poll_compq(state, state->id_scq_hdl); 5054 5055 ret = ibd_resume_transmission(state); 5056 if (ret && ibd_txcomp_poll) { 5057 if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) 5058 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5059 else { 5060 ibd_print_warn(state, "ibd_async_txsched: " 5061 "no memory, can't schedule work slot"); 5062 } 5063 } 5064 } 5065 5066 static int 5067 ibd_resume_transmission(ibd_state_t *state) 5068 { 5069 int flag; 5070 int met_thresh = 0; 5071 int ret = -1; 5072 5073 mutex_enter(&state->id_sched_lock); 5074 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5075 met_thresh = (state->id_tx_list.dl_cnt > 5076 IBD_FREE_SWQES_THRESH); 5077 flag = IBD_RSRC_SWQE; 5078 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5079 ASSERT(state->id_lso != NULL); 5080 met_thresh = (state->id_lso->bkt_nfree > 5081 IBD_FREE_LSOS_THRESH); 5082 flag = IBD_RSRC_LSOBUF; 5083 } 5084 if (met_thresh) { 5085 state->id_sched_needed &= ~flag; 5086 ret = 0; 5087 } 5088 mutex_exit(&state->id_sched_lock); 5089 5090 if (ret == 0) 5091 mac_tx_update(state->id_mh); 5092 5093 return (ret); 5094 } 5095 5096 /* 5097 * Release the send wqe back into free list. 
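* The wqe is appended at the tail of id_tx_list and dl_cnt is
* bumped; dl_pending_sends, which ibd_acquire_swqe() sets when it
* runs out of wqes, is cleared again here.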
5098 */ 5099 static void 5100 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 5101 { 5102 /* 5103 * Add back on Tx list for reuse. 5104 */ 5105 swqe->swqe_next = NULL; 5106 mutex_enter(&state->id_tx_list.dl_mutex); 5107 if (state->id_tx_list.dl_pending_sends) { 5108 state->id_tx_list.dl_pending_sends = B_FALSE; 5109 } 5110 if (state->id_tx_list.dl_head == NULL) { 5111 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 5112 } else { 5113 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 5114 } 5115 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 5116 state->id_tx_list.dl_cnt++; 5117 mutex_exit(&state->id_tx_list.dl_mutex); 5118 } 5119 5120 /* 5121 * Acquire a send wqe from free list. 5122 * Returns error number and send wqe pointer. 5123 */ 5124 static int 5125 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) 5126 { 5127 int rc = 0; 5128 ibd_swqe_t *wqe; 5129 5130 /* 5131 * Check and reclaim some of the completed Tx requests. 5132 * If someone else is already in this code and pulling Tx 5133 * completions, no need to poll, since the current lock holder 5134 * will do the work anyway. Normally, we poll for completions 5135 * every few Tx attempts, but if we are short on Tx descriptors, 5136 * we always try to poll. 5137 */ 5138 if ((ibd_txcomp_poll == 1) && 5139 (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { 5140 ibd_poll_compq(state, state->id_scq_hdl); 5141 } 5142 5143 /* 5144 * Grab required transmit wqes. 5145 */ 5146 mutex_enter(&state->id_tx_list.dl_mutex); 5147 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5148 if (wqe != NULL) { 5149 state->id_tx_list.dl_cnt -= 1; 5150 state->id_tx_list.dl_head = wqe->swqe_next; 5151 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 5152 state->id_tx_list.dl_tail = NULL; 5153 } else { 5154 /* 5155 * If we did not find the number we were looking for, flag 5156 * no resource. Adjust list appropriately in either case. 5157 */ 5158 rc = ENOENT; 5159 state->id_tx_list.dl_pending_sends = B_TRUE; 5160 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5161 atomic_add_64(&state->id_tx_short, 1); 5162 } 5163 mutex_exit(&state->id_tx_list.dl_mutex); 5164 *swqe = wqe; 5165 5166 return (rc); 5167 } 5168 5169 static int 5170 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5171 ibt_ud_dest_hdl_t ud_dest) 5172 { 5173 mblk_t *nmp; 5174 int iph_len, tcph_len; 5175 ibt_wr_lso_t *lso; 5176 uintptr_t ip_start, tcp_start; 5177 uint8_t *dst; 5178 uint_t pending, mblen; 5179 5180 /* 5181 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5182 * we need to adjust it here for lso. 5183 */ 5184 lso = &(node->w_swr.wr.ud_lso); 5185 lso->lso_ud_dest = ud_dest; 5186 lso->lso_mss = mss; 5187 5188 /* 5189 * Calculate the LSO header size and set it in the UD LSO structure. 5190 * Note that the only assumption we make is that each of the IPoIB, 5191 * IP and TCP headers will be contained in a single mblk fragment; 5192 * together, the headers may span multiple mblk fragments. 
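* The resulting lso_hdr_sz is just IPOIB_HDRSIZE plus the IP and
* TCP header lengths; the walk below moves to b_cont whenever one
* of those headers starts in the next fragment.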
5193 */ 5194 nmp = mp; 5195 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5196 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5197 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5198 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5199 nmp = nmp->b_cont; 5200 5201 } 5202 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5203 5204 tcp_start = ip_start + iph_len; 5205 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5206 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5207 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5208 nmp = nmp->b_cont; 5209 } 5210 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5211 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5212 5213 /* 5214 * If the lso header fits entirely within a single mblk fragment, 5215 * we'll avoid an additional copy of the lso header here and just 5216 * pass the b_rptr of the mblk directly. 5217 * 5218 * If this isn't true, we'd have to allocate for it explicitly. 5219 */ 5220 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5221 lso->lso_hdr = mp->b_rptr; 5222 } else { 5223 /* On work completion, remember to free this allocated hdr */ 5224 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5225 if (lso->lso_hdr == NULL) { 5226 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5227 "sz = %d", lso->lso_hdr_sz); 5228 lso->lso_hdr_sz = 0; 5229 lso->lso_mss = 0; 5230 return (-1); 5231 } 5232 } 5233 5234 /* 5235 * Copy in the lso header only if we need to 5236 */ 5237 if (lso->lso_hdr != mp->b_rptr) { 5238 dst = lso->lso_hdr; 5239 pending = lso->lso_hdr_sz; 5240 5241 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5242 mblen = MBLKL(nmp); 5243 if (pending > mblen) { 5244 bcopy(nmp->b_rptr, dst, mblen); 5245 dst += mblen; 5246 pending -= mblen; 5247 } else { 5248 bcopy(nmp->b_rptr, dst, pending); 5249 break; 5250 } 5251 } 5252 } 5253 5254 return (0); 5255 } 5256 5257 static void 5258 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5259 { 5260 ibt_wr_lso_t *lso; 5261 5262 if ((!node) || (!mp)) 5263 return; 5264 5265 /* 5266 * Free any header space that we might've allocated if we 5267 * did an LSO 5268 */ 5269 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5270 lso = &(node->w_swr.wr.ud_lso); 5271 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5272 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5273 lso->lso_hdr = NULL; 5274 lso->lso_hdr_sz = 0; 5275 } 5276 } 5277 } 5278 5279 static void 5280 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5281 { 5282 uint_t i; 5283 uint_t num_posted; 5284 uint_t n_wrs; 5285 ibt_status_t ibt_status; 5286 ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; 5287 ibd_swqe_t *elem; 5288 ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; 5289 5290 node->swqe_next = NULL; 5291 5292 mutex_enter(&state->id_txpost_lock); 5293 5294 /* 5295 * Enqueue the new node in chain of wqes to send 5296 */ 5297 if (state->id_tx_head) { 5298 *(state->id_tx_tailp) = (ibd_wqe_t *)node; 5299 } else { 5300 state->id_tx_head = node; 5301 } 5302 state->id_tx_tailp = &(node->swqe_next); 5303 5304 /* 5305 * If someone else is helping out with the sends, 5306 * just go back 5307 */ 5308 if (state->id_tx_busy) { 5309 mutex_exit(&state->id_txpost_lock); 5310 return; 5311 } 5312 5313 /* 5314 * Otherwise, mark the flag to indicate that we'll be 5315 * doing the dispatch of what's there in the wqe chain 5316 */ 5317 state->id_tx_busy = 1; 5318 5319 while (state->id_tx_head) { 5320 /* 5321 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs 5322 * at a time if possible, and keep posting them. 
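* Only the thread that managed to set id_tx_busy drains the chain;
* anyone else arriving meanwhile just appends to the id_tx_head
* chain above and returns, and their wrs get posted on a later
* pass of this loop.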
5323 */ 5324 for (n_wrs = 0, elem = state->id_tx_head; 5325 (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); 5326 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5327 5328 nodes[n_wrs] = elem; 5329 wrs[n_wrs] = elem->w_swr; 5330 } 5331 state->id_tx_head = elem; 5332 5333 /* 5334 * Release the txpost lock before posting the 5335 * send request to the hca; if the posting fails 5336 * for some reason, we'll never receive completion 5337 * intimation, so we'll need to cleanup. 5338 */ 5339 mutex_exit(&state->id_txpost_lock); 5340 5341 ASSERT(n_wrs != 0); 5342 5343 /* 5344 * If posting fails for some reason, we'll never receive 5345 * completion intimation, so we'll need to cleanup. But 5346 * we need to make sure we don't clean up nodes whose 5347 * wrs have been successfully posted. We assume that the 5348 * hca driver returns on the first failure to post and 5349 * therefore the first 'num_posted' entries don't need 5350 * cleanup here. 5351 */ 5352 num_posted = 0; 5353 ibt_status = ibt_post_send(state->id_chnl_hdl, 5354 wrs, n_wrs, &num_posted); 5355 if (ibt_status != IBT_SUCCESS) { 5356 5357 ibd_print_warn(state, "ibd_post_send: " 5358 "posting multiple wrs failed: " 5359 "requested=%d, done=%d, ret=%d", 5360 n_wrs, num_posted, ibt_status); 5361 5362 for (i = num_posted; i < n_wrs; i++) 5363 ibd_tx_cleanup(state, nodes[i]); 5364 } 5365 5366 /* 5367 * Grab the mutex before we go and check the tx Q again 5368 */ 5369 mutex_enter(&state->id_txpost_lock); 5370 } 5371 5372 state->id_tx_busy = 0; 5373 mutex_exit(&state->id_txpost_lock); 5374 } 5375 5376 static int 5377 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5378 uint_t lsohdr_sz) 5379 { 5380 ibt_wr_ds_t *sgl; 5381 ibt_status_t ibt_status; 5382 mblk_t *nmp; 5383 mblk_t *data_mp; 5384 uchar_t *bufp; 5385 size_t blksize; 5386 size_t skip; 5387 size_t avail; 5388 uint_t pktsize; 5389 uint_t frag_len; 5390 uint_t pending_hdr; 5391 uint_t hiwm; 5392 int nmblks; 5393 int i; 5394 5395 /* 5396 * Let's skip ahead to the data if this is LSO 5397 */ 5398 data_mp = mp; 5399 pending_hdr = 0; 5400 if (lsohdr_sz) { 5401 pending_hdr = lsohdr_sz; 5402 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5403 frag_len = nmp->b_wptr - nmp->b_rptr; 5404 if (frag_len > pending_hdr) 5405 break; 5406 pending_hdr -= frag_len; 5407 } 5408 data_mp = nmp; /* start of data past lso header */ 5409 ASSERT(data_mp != NULL); 5410 } 5411 5412 /* 5413 * Calculate the size of message data and number of msg blocks 5414 */ 5415 pktsize = 0; 5416 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5417 nmp = nmp->b_cont, nmblks++) { 5418 pktsize += MBLKL(nmp); 5419 } 5420 pktsize -= pending_hdr; 5421 5422 /* 5423 * Translating the virtual address regions into physical regions 5424 * for using the Reserved LKey feature results in a wr sgl that 5425 * is a little longer. Since failing ibt_map_mem_iov() is costly, 5426 * we'll fix a high-water mark (65%) for when we should stop. 5427 */ 5428 hiwm = (state->id_max_sqseg * 65) / 100; 5429 5430 /* 5431 * We only do ibt_map_mem_iov() if the pktsize is above the 5432 * "copy-threshold", and if the number of mp fragments is less than 5433 * the maximum acceptable. 
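* Otherwise one of the two copy paths below is used: packets that
* fit in id_tx_buf_sz are bcopy'd into the wqe's pre-mapped copy
* buffer, and anything larger is spread across the pre-mapped LSO
* buffers obtained via ibd_acquire_lsobufs().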
5434 */ 5435 if ((state->id_hca_res_lkey_capab) && 5436 (pktsize > IBD_TX_COPY_THRESH) && 5437 (nmblks < hiwm)) { 5438 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5439 ibt_iov_attr_t iov_attr; 5440 5441 iov_attr.iov_as = NULL; 5442 iov_attr.iov = iov_arr; 5443 iov_attr.iov_buf = NULL; 5444 iov_attr.iov_list_len = nmblks; 5445 iov_attr.iov_wr_nds = state->id_max_sqseg; 5446 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5447 iov_attr.iov_flags = IBT_IOV_SLEEP; 5448 5449 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5450 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5451 iov_arr[i].iov_len = MBLKL(nmp); 5452 if (i == 0) { 5453 iov_arr[i].iov_addr += pending_hdr; 5454 iov_arr[i].iov_len -= pending_hdr; 5455 } 5456 } 5457 5458 node->w_buftype = IBD_WQE_MAPPED; 5459 node->w_swr.wr_sgl = node->w_sgl; 5460 5461 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5462 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5463 if (ibt_status != IBT_SUCCESS) { 5464 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5465 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5466 goto ibd_copy_path; 5467 } 5468 5469 return (0); 5470 } 5471 5472 ibd_copy_path: 5473 if (pktsize <= state->id_tx_buf_sz) { 5474 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5475 node->w_swr.wr_nds = 1; 5476 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5477 node->w_buftype = IBD_WQE_TXBUF; 5478 5479 /* 5480 * Even though this is the copy path for transfers less than 5481 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5482 * is possible the first data mblk fragment (data_mp) still 5483 * contains part of the LSO header that we need to skip. 5484 */ 5485 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5486 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5487 blksize = MBLKL(nmp) - pending_hdr; 5488 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5489 bufp += blksize; 5490 pending_hdr = 0; 5491 } 5492 5493 return (0); 5494 } 5495 5496 /* 5497 * Copy path for transfers greater than id_tx_buf_sz 5498 */ 5499 node->w_swr.wr_sgl = node->w_sgl; 5500 if (ibd_acquire_lsobufs(state, pktsize, 5501 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5502 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5503 return (-1); 5504 } 5505 node->w_buftype = IBD_WQE_LSOBUF; 5506 5507 /* 5508 * Copy the larger-than-id_tx_buf_sz packet into a set of 5509 * fixed-sized, pre-mapped LSO buffers. Note that we might 5510 * need to skip part of the LSO header in the first fragment 5511 * as before. 5512 */ 5513 nmp = data_mp; 5514 skip = pending_hdr; 5515 for (i = 0; i < node->w_swr.wr_nds; i++) { 5516 sgl = node->w_swr.wr_sgl + i; 5517 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5518 avail = IBD_LSO_BUFSZ; 5519 while (nmp && avail) { 5520 blksize = MBLKL(nmp) - skip; 5521 if (blksize > avail) { 5522 bcopy(nmp->b_rptr + skip, bufp, avail); 5523 skip += avail; 5524 avail = 0; 5525 } else { 5526 bcopy(nmp->b_rptr + skip, bufp, blksize); 5527 skip = 0; 5528 avail -= blksize; 5529 bufp += blksize; 5530 nmp = nmp->b_cont; 5531 } 5532 } 5533 } 5534 5535 return (0); 5536 } 5537 5538 /* 5539 * Schedule a completion queue polling to reap the resource we're 5540 * short on. If we implement the change to reap tx completions 5541 * in a separate thread, we'll need to wake up that thread here. 
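* The resource_type is typically IBD_RSRC_SWQE or IBD_RSRC_LSOBUF;
* ibd_resume_transmission() clears the bit from id_sched_needed
* once the corresponding free count is back above its threshold,
* and then calls mac_tx_update() to restart the stream.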
/*
 * Schedule a completion queue poll to reap the resource we're short on.
 * If we implement the change to reap tx completions in a separate
 * thread, we'll need to wake up that thread here.
 */
static int
ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
{
	ibd_req_t *req;

	mutex_enter(&state->id_sched_lock);
	state->id_sched_needed |= resource_type;
	mutex_exit(&state->id_sched_lock);

	/*
	 * If we are asked to queue a work entry, we need to do it
	 */
	if (q_flag) {
		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
		if (req == NULL)
			return (-1);

		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
	}

	return (0);
}

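/*
 * Callers of ibd_sched_poll() record the resource they ran short of
 * (IBD_RSRC_SWQE or IBD_RSRC_LSOBUF) in id_sched_needed; when q_flag
 * is non-zero an IBD_ASYNC_SCHED work entry is also queued so that the
 * async thread performs the actual completion queue poll. A non-zero
 * return (no memory for the work entry) tells the caller the poll
 * could not be scheduled, and the caller typically drops the packet.
 */
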
/*
 * The passed in packet has this format:
 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
 */
static boolean_t
ibd_send(ibd_state_t *state, mblk_t *mp)
{
	ibd_ace_t *ace;
	ibd_swqe_t *node;
	ipoib_mac_t *dest;
	ib_header_info_t *ipibp;
	ip6_t *ip6h;
	uint_t pktsize;
	uint32_t mss;
	uint32_t hckflags;
	uint32_t lsoflags = 0;
	uint_t lsohdr_sz = 0;
	int ret, len;
	boolean_t dofree = B_FALSE;
	boolean_t rc;

	/*
	 * If we aren't done with the device initialization and start,
	 * we shouldn't be here.
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (B_FALSE);

	node = NULL;
	if (ibd_acquire_swqe(state, &node) != 0) {
		/*
		 * If we don't have an swqe available, schedule a transmit
		 * completion queue cleanup and hold off on sending more
		 * packets until we have some free swqes
		 */
		if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
			return (B_FALSE);

		/*
		 * If a poll cannot be scheduled, we have no choice but
		 * to drop this packet
		 */
		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
		return (B_TRUE);
	}

	/*
	 * Initialize the commonly used fields in swqe to NULL to protect
	 * against ibd_tx_cleanup accidentally misinterpreting these on a
	 * failure.
	 */
	node->swqe_im_mblk = NULL;
	node->w_swr.wr_nds = 0;
	node->w_swr.wr_sgl = NULL;
	node->w_swr.wr_opcode = IBT_WRC_SEND;

	/*
	 * Obtain an address handle for the destination.
	 */
	ipibp = (ib_header_info_t *)mp->b_rptr;
	dest = (ipoib_mac_t *)&ipibp->ib_dst;
	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);

	pktsize = msgsize(mp);

	atomic_add_64(&state->id_xmt_bytes, pktsize);
	atomic_inc_64(&state->id_xmt_pkt);
	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_xmt);
	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_xmt);

	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
		node->w_ahandle = ace;
		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
	} else {
		DPRINT(5,
		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
		    ((ret == EFAULT) ? "failed" : "queued"),
		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
		    htonl(dest->ipoib_gidpref[1]),
		    htonl(dest->ipoib_gidsuff[0]),
		    htonl(dest->ipoib_gidsuff[1]));
		node->w_ahandle = NULL;

		/*
		 * In poll mode there are probably completions pending in
		 * the cq, so ibd has to poll the cq here; otherwise the
		 * acache entry may never be recycled.
		 */
		if (ibd_txcomp_poll == 1)
			ibd_poll_compq(state, state->id_scq_hdl);

		/*
		 * If ibd_acache_lookup() returns EFAULT, ibd cannot find a
		 * path for the specified dest address and we should drop
		 * the packet. We should also drop the packet if we cannot
		 * schedule a poll via the async thread. In the normal case,
		 * ibd returns the packet to the upper layer and waits for
		 * the AH to be created.
		 *
		 * Note that we always queue a work slot entry for the async
		 * thread when we fail AH lookup (even in intr mode); this is
		 * due to the convoluted way the code currently looks for AH.
		 */
		if (ret == EFAULT) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}

	/*
	 * For ND6 packets, padding is at the front of the source lladdr.
	 * Insert the padding at the front.
	 */
	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
			if (!pullupmsg(mp, IPV6_HDR_LEN +
			    sizeof (ib_header_info_t))) {
				DPRINT(10, "ibd_send: pullupmsg failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
			ipibp = (ib_header_info_t *)mp->b_rptr;
		}
		ip6h = (ip6_t *)((uchar_t *)ipibp +
		    sizeof (ib_header_info_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			mblk_t *pad;

			if ((pad = allocb(4, 0)) == NULL) {
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
			linkb(mp, pad);
			if (MBLKL(mp) < sizeof (ib_header_info_t) +
			    IPV6_HDR_LEN + len + 4) {
				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
				    IPV6_HDR_LEN + len + 4)) {
					DPRINT(10, "ibd_send: pullupmsg "
					    "failure ");
					dofree = B_TRUE;
					rc = B_TRUE;
					goto ibd_send_fail;
				}
				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
				    sizeof (ib_header_info_t));
			}

			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
		}
	}

	mp->b_rptr += sizeof (ib_addrs_t);

	/*
	 * Do LSO and checksum related work here. For LSO send, adjust the
	 * ud destination, the opcode and the LSO header information to the
	 * work request.
	 */
	lso_info_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) != HW_LSO) {
		node->w_swr.wr_opcode = IBT_WRC_SEND;
		lsohdr_sz = 0;
	} else {
		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
			/*
			 * The routine can only fail if there's no memory; we
			 * can only drop the packet if this happens
			 */
			ibd_print_warn(state,
			    "ibd_send: no memory, lso posting failed");
			dofree = B_TRUE;
			rc = B_TRUE;
			goto ibd_send_fail;
		}

		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
	}

	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
	else
		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;

	/*
	 * Prepare the sgl for posting; the routine can only fail if there's
	 * no lso buf available for posting. If this is the case, we should
	 * probably reschedule for lso bufs to become available and then
	 * try again.
	 */
	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}
	node->swqe_im_mblk = mp;

	/*
	 * Queue the wqe to hardware; since we can now simply queue a
	 * post instead of doing it serially, we cannot assume anything
	 * about the 'node' after ibd_post_send() returns.
	 */
	ibd_post_send(state, node);

	return (B_TRUE);

ibd_send_fail:
	if (node && mp)
		ibd_free_lsohdr(node, mp);

	if (dofree)
		freemsg(mp);

	if (node != NULL)
		ibd_tx_cleanup(state, node);

	return (rc);
}

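/*
 * Note on the return value of ibd_send(): B_TRUE means the mblk has
 * been consumed (posted to the hardware or dropped) and must not be
 * touched again; B_FALSE means the driver is temporarily out of
 * resources, in which case ibd_m_tx() below stops walking the chain
 * and hands the unsent portion back to the MAC layer for a later
 * retry.
 */
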
/*
 * GLDv3 entry point for transmitting a datagram.
 */
static mblk_t *
ibd_m_tx(void *arg, mblk_t *mp)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	mblk_t *next;

	if (state->id_link_state != LINK_STATE_UP) {
		freemsgchain(mp);
		mp = NULL;
	}

	while (mp != NULL) {
		next = mp->b_next;
		mp->b_next = NULL;
		if (ibd_send(state, mp) == B_FALSE) {
			/* Send fail */
			mp->b_next = next;
			break;
		}
		mp = next;
	}

	return (mp);
}

/*
 * This handles Tx and Rx completions. With separate CQs, this handles
 * only Rx completions.
 */
static uint_t
ibd_intr(char *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	ibd_poll_compq(state, state->id_rcq_hdl);

	return (DDI_INTR_CLAIMED);
}

/*
 * Poll and drain the cq
 */
static uint_t
ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
    uint_t numwcs)
{
	ibd_wqe_t *wqe;
	ibt_wc_t *wc;
	uint_t total_polled = 0;
	uint_t num_polled;
	int i;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
		total_polled += num_polled;
		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
			    (wqe->w_type == IBD_WQE_RECV));
			if (wc->wc_status != IBT_WC_SUCCESS) {
				/*
				 * Channel being torn down.
				 */
				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
					DPRINT(5, "ibd_drain_cq: flush error");
					/*
					 * Only invoke the Tx handler to
					 * release possibly held resources
					 * like AH refcount etc. Cannot
					 * invoke the Rx handler because it
					 * might try adding buffers to the
					 * Rx pool when we are trying to
					 * deinitialize.
					 */
					if (wqe->w_type == IBD_WQE_RECV) {
						continue;
					} else {
						DPRINT(10, "ibd_drain_cq: Bad "
						    "status %d", wc->wc_status);
					}
				}
			}
			if (wqe->w_type == IBD_WQE_SEND) {
				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
			} else {
				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
			}
		}
	}

	return (total_polled);
}

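/*
 * ibd_drain_cq() above recovers the wqe directly from the completion's
 * wc_id field. This works because each work request is posted with its
 * work request id set to the address of the owning wqe, conceptually:
 *
 *	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
 *
 * (illustrative only; the actual assignment is made where the wqes are
 * initialized, outside this excerpt), so no lookup table is needed to
 * map completions back to swqes/rwqes.
 */
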
/*
 * Common code for interrupt handling as well as for polling
 * for all completed wqe's while detaching.
 */
static void
ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
{
	ibt_wc_t *wcs;
	uint_t numwcs;
	int flag, redo_flag;
	int redo = 1;
	uint_t num_polled = 0;

	if (ibd_separate_cqs == 1) {
		if (cq_hdl == state->id_rcq_hdl) {
			flag = IBD_RX_CQ_POLLING;
			redo_flag = IBD_REDO_RX_CQ_POLLING;
		} else {
			flag = IBD_TX_CQ_POLLING;
			redo_flag = IBD_REDO_TX_CQ_POLLING;
		}
	} else {
		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
	}

	mutex_enter(&state->id_cq_poll_lock);
	if (state->id_cq_poll_busy & flag) {
		state->id_cq_poll_busy |= redo_flag;
		mutex_exit(&state->id_cq_poll_lock);
		return;
	}
	state->id_cq_poll_busy |= flag;
	mutex_exit(&state->id_cq_poll_lock);

	/*
	 * In some cases (e.g. detaching), this code can be invoked on
	 * any cpu after disabling cq notification (thus no concurrency
	 * exists). Apart from that, the following applies normally:
	 * The receive completion handling is always on the Rx interrupt
	 * cpu. Transmit completion handling could be from any cpu if
	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
	 * is interrupt driven. Combined completion handling is always
	 * on the interrupt cpu. Thus, lock accordingly and use the
	 * proper completion array.
	 */
	if (ibd_separate_cqs == 1) {
		if (cq_hdl == state->id_rcq_hdl) {
			wcs = state->id_rxwcs;
			numwcs = state->id_rxwcs_size;
		} else {
			wcs = state->id_txwcs;
			numwcs = state->id_txwcs_size;
		}
	} else {
		wcs = state->id_rxwcs;
		numwcs = state->id_rxwcs_size;
	}

	/*
	 * Poll and drain the CQ
	 */
	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);

	/*
	 * Enable CQ notifications and redrain the cq to catch any
	 * completions we might have missed after the ibd_drain_cq()
	 * above and before the ibt_enable_cq_notify() that follows.
	 * Finally, service any new requests to poll the cq that
	 * could've come in after the ibt_enable_cq_notify().
	 */
	do {
		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
		}

		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);

		mutex_enter(&state->id_cq_poll_lock);
		if (state->id_cq_poll_busy & redo_flag)
			state->id_cq_poll_busy &= ~redo_flag;
		else {
			state->id_cq_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&state->id_cq_poll_lock);

	} while (redo);

	/*
	 * If we polled the receive cq and found anything, we need to flush
	 * it out to the nw layer here.
	 */
	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
		ibd_flush_rx(state, NULL);
	}
}

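/*
 * Illustration of the busy/redo handshake used by ibd_poll_compq()
 * above: if one thread is already draining a CQ (its "busy" flag is
 * set) and another thread is asked to poll the same CQ, the second
 * thread only sets the corresponding "redo" flag and returns. When
 * the draining thread finishes a pass it rechecks the flags; if "redo"
 * is set it clears it and drains once more, so the completions that
 * prompted the second request are not lost, and at most one thread
 * polls a given CQ at any time.
 */
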
/*
 * Unmap the memory area associated with a given swqe.
 */
static void
ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibt_status_t stat;

	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);

	if (swqe->w_mi_hdl) {
		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
			DPRINT(10,
			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
		}
		swqe->w_mi_hdl = NULL;
	}
	swqe->w_swr.wr_nds = 0;
}

/*
 * Common code that deals with clean ups after a successful or
 * erroneous transmission attempt.
 */
static void
ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibd_ace_t *ace = swqe->w_ahandle;

	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);

	/*
	 * If this was a dynamic mapping in ibd_send(), we need to
	 * unmap here. If this was an lso buffer we'd used for sending,
	 * we need to release the lso buf to the pool, since the resource
	 * is scarce. However, if this was simply a normal send using
	 * the copybuf (present in each swqe), we don't need to release it.
	 */
	if (swqe->swqe_im_mblk != NULL) {
		if (swqe->w_buftype == IBD_WQE_MAPPED) {
			ibd_unmap_mem(state, swqe);
		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
			ibd_release_lsobufs(state,
			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
		}
		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
		freemsg(swqe->swqe_im_mblk);
		swqe->swqe_im_mblk = NULL;
	}

	/*
	 * Drop the reference count on the AH; it can be reused
	 * now for a different destination if there are no more
	 * posted sends that will use it. This can be eliminated
	 * if we can always associate each Tx buffer with an AH.
	 * The ace can be null if we are cleaning up from the
	 * ibd_send() error path.
	 */
	if (ace != NULL) {
		/*
		 * The recycling logic can be eliminated from here
		 * and put into the async thread if we create another
		 * list to hold ACE's for unjoined mcg's.
		 */
		if (DEC_REF_DO_CYCLE(ace)) {
			ibd_mce_t *mce;

			/*
			 * Check with the lock taken: we decremented the
			 * reference count without the lock, and some
			 * transmitter might already have bumped the
			 * reference count (possible in case of multicast
			 * disable when we leave the AH on the active
			 * list). If not still 0, get out, leaving the
			 * recycle bit intact.
			 *
			 * Atomically transition the AH from active
			 * to free list, and queue a work request to
			 * leave the group and destroy the mce. No
			 * transmitter can be looking at the AH or
			 * the MCE in between, since we have the
			 * ac_mutex lock. In the SendOnly reap case,
			 * it is not necessary to hold the ac_mutex
			 * and recheck the ref count (since the AH was
			 * taken off the active list), we just do it
			 * to have uniform processing with the Full
			 * reap case.
			 */
			mutex_enter(&state->id_ac_mutex);
			mce = ace->ac_mce;
			if (GET_REF_CYCLE(ace) == 0) {
				CLEAR_REFCYCLE(ace);
				/*
				 * Identify the case of fullmember reap as
				 * opposed to mcg trap reap. Also, port up
				 * might set ac_mce to NULL to indicate Tx
				 * cleanup should do no more than put the
				 * AH in the free list (see ibd_async_link).
				 */
				if (mce != NULL) {
					ace->ac_mce = NULL;
					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
					/*
					 * mc_req was initialized at mce
					 * creation time.
					 */
					ibd_queue_work_slot(state,
					    &mce->mc_req, IBD_ASYNC_REAP);
				}
				IBD_ACACHE_INSERT_FREE(state, ace);
			}
			mutex_exit(&state->id_ac_mutex);
		}
	}

	/*
	 * Release the send wqe for reuse.
	 */
	ibd_release_swqe(state, swqe);
}

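/*
 * A walk-through of the AH recycle path in ibd_tx_cleanup() above:
 * DEC_REF_DO_CYCLE() drops this send's reference on the ACE and
 * indicates whether the entry has reached the recycle state. If it
 * has, the state is re-checked under id_ac_mutex (another transmitter
 * may have taken a new reference in the meantime); only if the count
 * is still zero is the ACE pulled off the active list and put on the
 * free list, and, if ac_mce is still set, an IBD_ASYNC_REAP request
 * is queued so the async thread leaves the mcg and destroys the mce.
 */
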
/*
 * Hand off the processed rx mp chain to mac_rx()
 */
static void
ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
{
	if (mpc == NULL) {
		mutex_enter(&state->id_rx_lock);

		mpc = state->id_rx_mp;

		state->id_rx_mp = NULL;
		state->id_rx_mp_tail = NULL;
		state->id_rx_mp_len = 0;

		mutex_exit(&state->id_rx_lock);
	}

	if (mpc) {
		mac_rx(state->id_mh, state->id_rh, mpc);
	}
}

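/*
 * Note on rx batching: ibd_process_rx() below does not hand each
 * packet to mac_rx() individually. It appends the mblk to the
 * id_rx_mp chain and only passes the chain to ibd_flush_rx() once
 * id_rx_mp_len reaches IBD_MAX_RX_MP_LEN; any partial chain left over
 * when CQ draining completes is pushed up by the ibd_flush_rx(state,
 * NULL) call at the end of ibd_poll_compq().
 */
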
/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD. The received packet has this
 * format: 2b sap :: 00 :: data.
 */
static void
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ib_header_info_t *phdr;
	mblk_t *mp;
	mblk_t *mpc = NULL;
	ipoib_hdr_t *ipibp;
	ipha_t *iphap;
	ip6_t *ip6h;
	int rxcnt, len;

	/*
	 * Track number handed to upper layer, and number still
	 * available to receive packets.
	 */
	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
	ASSERT(rxcnt >= 0);
	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);

	/*
	 * Adjust write pointer depending on how much data came in.
	 */
	mp = rwqe->rwqe_im_mblk;
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;

	/*
	 * Make sure this is NULL or we're in trouble.
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * The IB link layer may deliver a Global Routing Header (GRH)
	 * with the packet. The ibd driver uses the information in the
	 * GRH to build the header_info structure that is passed up to
	 * GLDv3 along with the datagram. If the GRH is not valid,
	 * indicate this to GLDv3 by setting the VerTcFlow field to 0.
	 */
	phdr = (ib_header_info_t *)mp->b_rptr;
	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);

		/* if it is a loopback packet, just drop it. */
		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
		    IPOIB_ADDRL) == 0) {
			freemsg(mp);
			return;
		}

		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
		    sizeof (ipoib_mac_t));
		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
		} else {
			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
		}
	} else {
		/*
		 * It cannot be an IBA multicast packet. It must have been
		 * unicast for us. Just copy the interface address to dst.
		 */
		phdr->ib_grh.ipoib_vertcflow = 0;
		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
		    sizeof (ipoib_mac_t));
	}

	/*
	 * For ND6 packets, padding is at the front of the source/target
	 * lladdr. However, the inet6 layer is not aware of it, hence
	 * remove the padding from such packets.
	 */
	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
			if (!pullupmsg(mp, IPV6_HDR_LEN +
			    sizeof (ipoib_hdr_t))) {
				DPRINT(10, "ibd_process_rx: pullupmsg failed");
				freemsg(mp);
				return;
			}
			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
			    sizeof (ipoib_pgrh_t));
		}
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
			    IPV6_HDR_LEN + len) {
				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
				    IPV6_HDR_LEN + len)) {
					DPRINT(10, "ibd_process_rx: pullupmsg"
					    " failed");
					freemsg(mp);
					return;
				}
				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
				    sizeof (ipoib_pgrh_t) +
				    sizeof (ipoib_hdr_t));
			}
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	/*
	 * Update statistics
	 */
	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
	atomic_inc_64(&state->id_rcv_pkt);
	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_rcv);
	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_rcv);

	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
	/*
	 * Set receive checksum status in mp.
	 * Hardware checksumming can be considered valid only if:
	 * 1. CQE.IP_OK bit is set
	 * 2. CQE.CKSUM = 0xffff
	 * 3. IPv6 routing header is not present in the packet
	 * 4. There are no IP options in the IP header
	 */
	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
	    (wc->wc_cksum == 0xFFFF) &&
	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
	}

	/*
	 * Add this mp to the list of processed mp's to send to
	 * the nw layer
	 */
	mutex_enter(&state->id_rx_lock);
	if (state->id_rx_mp) {
		ASSERT(state->id_rx_mp_tail != NULL);
		state->id_rx_mp_tail->b_next = mp;
	} else {
		ASSERT(state->id_rx_mp_tail == NULL);
		state->id_rx_mp = mp;
	}

	state->id_rx_mp_tail = mp;
	state->id_rx_mp_len++;

	if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
		mpc = state->id_rx_mp;

		state->id_rx_mp = NULL;
		state->id_rx_mp_tail = NULL;
		state->id_rx_mp_len = 0;
	}

	mutex_exit(&state->id_rx_lock);

	if (mpc) {
		ibd_flush_rx(state, mpc);
	}
}

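/*
 * Note on rwqe recycling: the mblks handed up in the rx chain are
 * allocated with desballoc() over each rwqe's pre-mapped copy buffer,
 * with the rwqe's w_freemsg_cb registered as the free routine. When
 * the upper layer eventually frees such an mblk, the callback below
 * wraps a fresh mblk around the same buffer and re-posts the rwqe, so
 * receive buffers are recycled without allocating new DMA memory per
 * packet.
 */
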
/*
 * Callback code invoked from STREAMS when the receive data buffer is
 * free for recycling.
 */
static void
ibd_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	/*
	 * If the wqe is being destroyed, do not attempt recycling.
	 */
	if (rwqe->w_freeing_wqe == B_TRUE) {
		DPRINT(6, "ibd_freemsg: wqe being freed");
		return;
	} else {
		/*
		 * The upper layer has released the held mblk, so we have
		 * no more use for keeping the old pointer in our rwqe.
		 */
		rwqe->rwqe_im_mblk = NULL;
	}

	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		ibd_delete_rwqe(state, rwqe);
		ibd_free_rwqe(state, rwqe);
		DPRINT(6, "ibd_freemsg: desballoc failed");
		return;
	}

	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
		ibd_delete_rwqe(state, rwqe);
		ibd_free_rwqe(state, rwqe);
		return;
	}

	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
}

static uint_t
ibd_tx_recycle(char *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	/*
	 * Poll for completed entries
	 */
	ibd_poll_compq(state, state->id_scq_hdl);

	/*
	 * Resume any blocked transmissions if possible
	 */
	(void) ibd_resume_transmission(state);

	return (DDI_INTR_CLAIMED);
}

#ifdef IBD_LOGGING
static void
ibd_log_init(void)
{
	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
	ibd_lbuf_ndx = 0;

	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
}

static void
ibd_log_fini(void)
{
	if (ibd_lbuf)
		kmem_free(ibd_lbuf, IBD_LOG_SZ);
	ibd_lbuf_ndx = 0;
	ibd_lbuf = NULL;

	mutex_destroy(&ibd_lbuf_lock);
}

static void
ibd_log(const char *fmt, ...)
{
	va_list ap;
	uint32_t off;
	uint32_t msglen;
	char tmpbuf[IBD_DMAX_LINE];

	if (ibd_lbuf == NULL)
		return;

	va_start(ap, fmt);
	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
	va_end(ap);

	if (msglen >= IBD_DMAX_LINE)
		msglen = IBD_DMAX_LINE - 1;

	mutex_enter(&ibd_lbuf_lock);

	off = ibd_lbuf_ndx;		/* current msg should go here */
	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';

	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */

	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
		ibd_lbuf_ndx = 0;

	mutex_exit(&ibd_lbuf_lock);

	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
}
#endif