1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/kmem.h>
28 #include <sys/conf.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/ksynch.h>
32 #include <sys/pattr.h> /* HCK_* */
33 #include <inet/ip.h> /* ipha_t */
34 #include <inet/tcp.h> /* tcph_t */
35 #include <sys/mac_provider.h> /* mac_* */
36 #include <sys/strsun.h> /* MBLKL */
37
38 #include <sys/ib/clients/eoib/eib_impl.h>
39
40 /*
41 * Declarations private to this file
42 */
43 static int eib_data_setup_cqs(eib_t *, eib_vnic_t *);
44 static int eib_data_setup_ud_channel(eib_t *, eib_vnic_t *);
45 static void eib_data_setup_lso(eib_wqe_t *, mblk_t *, uint32_t,
46 eib_ether_hdr_t *);
47 static int eib_data_prepare_sgl(eib_vnic_t *, eib_wqe_t *, mblk_t *);
48 static int eib_data_is_mcast_pkt_ok(eib_vnic_t *, uint8_t *, uint64_t *,
49 uint64_t *);
50 static void eib_data_rx_comp_intr(ibt_cq_hdl_t, void *);
51 static void eib_data_tx_comp_intr(ibt_cq_hdl_t, void *);
52 static mblk_t *eib_data_rx_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
53 static void eib_data_tx_comp(eib_vnic_t *, eib_wqe_t *, eib_chan_t *);
54 static void eib_data_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
55 static void eib_rb_data_setup_cqs(eib_t *, eib_vnic_t *);
56 static void eib_rb_data_setup_ud_channel(eib_t *, eib_vnic_t *);
57
58
59 int
60 eib_data_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
61 {
62 eib_chan_t *chan = NULL;
63
64 /*
65 * Allocate an eib_chan_t to store stuff about this vnic's data qp
66 * and initialize it with default admin qp pkey parameters. We'll
67 * re-associate this with the pkey we receive from the gw once we
68 * receive the login ack.
69 */
70 vnic->vn_data_chan = eib_chan_init();
71
72 chan = vnic->vn_data_chan;
73 chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
74 chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
75 chan->ch_vnic_inst = vnic->vn_instance;
76
77 /*
78 * Setup tx/rx CQs and completion handlers
79 */
80 if (eib_data_setup_cqs(ss, vnic) != EIB_E_SUCCESS) {
81 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
82 "eib_data_setup_cqs(vn_inst=0x%x) failed",
83 vnic->vn_instance);
84 *err = ENOMEM;
85 goto data_create_qp_fail;
86 }
87
88 /*
89 * Setup UD channel
90 */
91 if (eib_data_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
92 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
93 "eib_data_setup_ud_channel(vn_inst=0x%x) failed",
94 vnic->vn_instance);
95 *err = ENOMEM;
96 goto data_create_qp_fail;
97 }
98
99 return (EIB_E_SUCCESS);
100
101 data_create_qp_fail:
102 eib_rb_data_create_qp(ss, vnic);
103 return (EIB_E_FAILURE);
104 }
105
106 /*ARGSUSED*/
107 uint_t
108 eib_data_rx_comp_handler(caddr_t arg1, caddr_t arg2)
109 {
110 eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
111 eib_t *ss = vnic->vn_ss;
112 eib_chan_t *chan = vnic->vn_data_chan;
113 eib_stats_t *stats = ss->ei_stats;
114 ibt_wc_t *wc;
115 eib_wqe_t *wqe;
116 mblk_t *mp;
117 mblk_t *head = NULL;
118 mblk_t *tail = NULL;
119 ibt_status_t ret;
120 uint_t pkts_per_call = 0;
121 uint_t polled;
122 uint_t rbytes;
123 uint_t ipkts;
124 uint_t num_wc;
125 int i;
126
127 /*
128 * Re-arm the rx notification callback before we start polling
129 * the completion queue. There's nothing much we can do if the
130 * enable_cq_notify fails - we issue a warning and move on.
131 */
132 ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
133 if (ret != IBT_SUCCESS) {
134 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp_handler: "
135 "ibt_enable_cq_notify() failed, ret=%d", ret);
136 }
137
138 /*
139 * We don't want to be stuck in receive processing for too long without
140 * giving others a chance.
141 */
142 num_wc = (chan->ch_rcv_cq_sz < EIB_MAX_RX_PKTS_ONINTR) ?
143 chan->ch_rcv_cq_sz : EIB_MAX_RX_PKTS_ONINTR;
144
145 /*
146 * Handle rx completions
147 */
148 while ((ret = ibt_poll_cq(chan->ch_rcv_cq_hdl, chan->ch_rcv_wc,
149 num_wc, &polled)) == IBT_SUCCESS) {
150
151 rbytes = ipkts = 0;
152 head = tail = NULL;
153
154 for (wc = chan->ch_rcv_wc, i = 0; i < polled; i++, wc++) {
155 wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;
156
157 ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX);
158
159 /*
160 * Clear the posted-to-hca flag and decrement the count
161 * of posted rwqes
162 */
163 wqe->qe_info &= (~EIB_WQE_FLG_POSTED_TO_HCA);
164 eib_rsrc_decr_posted_rwqe(ss, chan);
165
166 rbytes += wc->wc_bytes_xfer;
167 if (wc->wc_status != IBT_WC_SUCCESS) {
168 EIB_INCR_COUNTER(&stats->st_ierrors);
169 eib_data_err_comp(vnic, wqe, wc);
170 } else {
171 ipkts++;
172 mp = eib_data_rx_comp(vnic, wqe, wc);
173 if (mp == NULL) {
174 continue;
175 } else {
176 /*
177 * Add this mp to the list to
178 * send it to the nw layer. Note
179 * that the wqe could've been
180 * returned to the pool if we're
181 * running low, so don't process
182 * wqe after this point.
183 */
184 if (head)
185 tail->b_next = mp;
186 else
187 head = mp;
188 tail = mp;
189 }
190 }
191 }
192
193 /*
194 * We reduce the number of atomic updates to key statistics
195 * by pooling them here, once per ibt_poll_cq(). The accuracy
196 * and consistency of the published statistics within a cq
197 * polling cycle will be compromised a little bit, but that
198 * should be ok, given that we probably gain a little bit by
199 * not having to do these atomic operations per packet.
200 */
201 EIB_UPDATE_COUNTER(&stats->st_rbytes, rbytes);
202 EIB_UPDATE_COUNTER(&stats->st_ipkts, ipkts);
203
204 pkts_per_call += ipkts;
205
206 if (head) {
207 mac_rx(ss->ei_mac_hdl, NULL, head);
208 }
209
210 /*
211 * If we have processed too many packets in one attempt, we'll
212 * have to come back here later.
213 */
214 if (pkts_per_call >= EIB_MAX_RX_PKTS_ONINTR) {
215 (void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl,
216 NULL);
217 break;
218 }
219
220 num_wc -= polled;
221 }
222
223 return (DDI_INTR_CLAIMED);
224 }
225
226 /*ARGSUSED*/
227 uint_t
228 eib_data_tx_comp_handler(caddr_t arg1, caddr_t arg2)
229 {
230 eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
231 eib_t *ss = vnic->vn_ss;
232 eib_chan_t *chan = vnic->vn_data_chan;
233 eib_stats_t *stats = ss->ei_stats;
234 ibt_wc_t *wc;
235 eib_wqe_t *wqe;
236 ibt_status_t ret;
237 uint_t polled;
238 int i;
239
240 /*
241 * Re-arm the tx notification callback before we start polling
242 * the completion queue. There's nothing much we can do if the
243 * enable_cq_notify fails - we issue a warning and move on.
244 */
245 ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
246 if (ret != IBT_SUCCESS) {
247 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_tx_comp_handler: "
248 "ibt_enable_cq_notify() failed, ret=%d", ret);
249 }
250
251 /*
252 * Handle tx completions
253 */
254 while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
255 &polled)) == IBT_SUCCESS) {
256 for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
257 wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;
258
259 ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_TX);
260
261 if (wc->wc_status != IBT_WC_SUCCESS) {
262 EIB_INCR_COUNTER(&stats->st_oerrors);
263 eib_data_err_comp(vnic, wqe, wc);
264 } else {
265 eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
266 }
267 }
268 }
269
270 return (DDI_INTR_CLAIMED);
271 }
272
273 void
274 eib_data_rx_recycle(caddr_t arg)
275 {
276 eib_wqe_t *rwqe = (eib_wqe_t *)(void *)arg;
277 eib_t *ss = rwqe->qe_pool->wp_ss;
278 eib_chan_t *vn_chan;
279 uint_t nic_state;
280 int ret;
281
282 /*
283 * We come here from three places - (a) from the nw layer when it
284 * is done with the rx mblk we handed to it and calls freemsg(),
285 * (b) from eib_data_rx_comp() if the rx
286 * completion processing discovers that the received EoIB packet
287 * has a problem and (c) from eib_data_err_comp() if we're tearing
288 * down this channel. We only need to repost the rwqe if we're
289 * being called back from the nw layer. For the other two cases,
290 * we'll simply return the rwqe to the pool. Also, since we would've
291 * already updated the ch_rx_posted counters in the rx completion
292 * handler, we don't pass the chan pointer to eib_rsrc_return_rwqe
293 * from within this routine.
294 */
295 rwqe->qe_mp = NULL;
296 if ((rwqe->qe_info & EIB_WQE_FLG_WITH_NW) == 0) {
297 eib_rsrc_return_rwqe(ss, rwqe, NULL);
298 return;
299 }
300
301 rwqe->qe_info &= (~EIB_WQE_FLG_WITH_NW);
302
303 /*
304 * If the buffers are being returned by the nw layer after a long
305 * time, this eoib instance could've even been stopped by now.
306 * If so, simply return the rwqe to the pool.
307 */
308 nic_state = eib_mac_get_nic_state(ss);
309 if ((nic_state & EIB_NIC_STARTED) != EIB_NIC_STARTED) {
310 eib_rsrc_return_rwqe(ss, rwqe, NULL);
311 return;
312 }
313
314 /*
315 * Or it could've taken even longer, and the nic could even have been
316 * restarted. The only thing we can do is make sure that the
317 * channel pointer recorded in the rwqe corresponds to what's in
318 * the current instance of the vnic.
319 */
320 vn_chan = eib_vnic_get_data_chan(ss, rwqe->qe_vnic_inst);
321 if (vn_chan == NULL || vn_chan != rwqe->qe_chan) {
322 eib_rsrc_return_rwqe(ss, rwqe, NULL);
323 return;
324 }
325
326 /*
327 * Try to repost the rwqe if we're not tearing down this channel
328 */
329 if (vn_chan->ch_tear_down) {
330 eib_rsrc_return_rwqe(ss, rwqe, NULL);
331 } else {
332 ret = eib_chan_post_recv(ss, vn_chan, rwqe);
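/*
 * If the repost fails, freeing any mblk attached to the rwqe brings
 * us back into this recycle callback (with EIB_WQE_FLG_WITH_NW now
 * clear), which then returns the rwqe to the pool; if there is no
 * mblk, we return the rwqe to the pool directly.
 */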
333 if (ret != EIB_E_SUCCESS) {
334 if (rwqe->qe_mp)
335 freemsg(rwqe->qe_mp);
336 else
337 eib_rsrc_return_rwqe(ss, rwqe, NULL);
338 }
339 }
340 }
341
342 void
343 eib_data_post_tx(eib_vnic_t *vnic, eib_wqe_t *swqe)
344 {
345 eib_chan_t *chan = vnic->vn_data_chan;
346 eib_t *ss = vnic->vn_ss;
347 eib_stats_t *stats = vnic->vn_ss->ei_stats;
348 ibt_send_wr_t wrs[EIB_MAX_POST_MULTIPLE];
349 eib_wqe_t *wqes[EIB_MAX_POST_MULTIPLE];
350 eib_wqe_t *elem;
351 ibt_status_t ret;
352 uint_t n_wrs;
353 uint_t n_posted;
354 uint_t total_failed = 0;
355 uint_t n_failed = 0;
356 uint_t i;
357
358 /*
359 * See if we have room for this wqe and then add it to the
360 * list of tx wrs to post in this channel.
361 */
362 mutex_enter(&chan->ch_tx_lock);
363
364 if ((chan->ch_tx_posted + 1) >= (chan->ch_max_swqes - 1)) {
365 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
366 "too many swqes posted already, posted=0x%lx, "
367 "max=0x%lx", chan->ch_tx_posted, chan->ch_max_swqes);
368 mutex_exit(&chan->ch_tx_lock);
369 return;
370 }
371
372 swqe->qe_nxt_post = NULL;
373 if (chan->ch_tx) {
374 chan->ch_tx_tail->qe_nxt_post = swqe;
375 } else {
376 chan->ch_tx = swqe;
377 }
378 chan->ch_tx_tail = swqe;
379 chan->ch_tx_posted++; /* pre-increment */
380
381 /*
382 * If someone's already posting tx wqes in this channel, let
383 * them post ours as well.
384 */
385 if (chan->ch_tx_busy == B_TRUE) {
386 mutex_exit(&chan->ch_tx_lock);
387 return;
388 }
389 chan->ch_tx_busy = B_TRUE;
390
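/*
 * We are now the designated poster for this channel: keep draining
 * the ch_tx chain until it is empty. Since ch_tx_lock is dropped
 * around ibt_post_send(), wqes appended by other senders in the
 * meantime are simply picked up on the next pass of this loop.
 */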
391 while (chan->ch_tx) {
392 /*
393 * Post EIB_MAX_POST_MULTIPLE wrs at a time
394 */
395 for (n_wrs = 0, elem = chan->ch_tx;
396 (elem) && (n_wrs < EIB_MAX_POST_MULTIPLE);
397 elem = elem->qe_nxt_post, n_wrs++) {
398 wqes[n_wrs] = elem;
399 wrs[n_wrs] = (elem->qe_wr).send;
400 }
401 chan->ch_tx = elem;
402 if (elem == NULL) {
403 chan->ch_tx_tail = NULL;
404 }
405 mutex_exit(&chan->ch_tx_lock);
406
407 ASSERT(n_wrs != 0);
408
409 /*
410 * If multiple wrs posting fails for some reason, we'll try
411 * posting the unposted ones one by one. If even that fails,
412 * we'll release any mappings/buffers/mblks associated with
413 * this wqe and return it to the pool.
414 */
415 n_posted = n_failed = 0;
416 ret = ibt_post_send(chan->ch_chan, wrs, n_wrs, &n_posted);
417 if (ret != IBT_SUCCESS) {
418 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
419 "ibt_post_send(n_wrs=0x%lx, n_posted=0x%lx) "
420 "failed, ret=%d", n_wrs, n_posted, ret);
421
422 for (i = n_posted; i < n_wrs; i++) {
423 ret = ibt_post_send(chan->ch_chan, &wrs[i],
424 1, NULL);
425 if (ret != IBT_SUCCESS) {
426 n_failed++;
427 eib_data_tx_comp(vnic, wqes[i], chan);
428
429 EIB_DPRINTF_WARN(ss->ei_instance,
430 "eib_data_post_tx: "
431 "ibt_post_send(n_wrs=1) failed, "
432 "ret=%d", ret);
433 }
434 }
435 }
436 total_failed += n_failed;
437
438 mutex_enter(&chan->ch_tx_lock);
439 }
440
441 chan->ch_tx_busy = B_FALSE;
442 mutex_exit(&chan->ch_tx_lock);
443
444 /*
445 * If we failed to post something, update error stats
446 */
447 if (total_failed) {
448 EIB_UPDATE_COUNTER(&stats->st_oerrors, total_failed);
449 }
450 }
451
452 void
453 eib_data_parse_ether_hdr(mblk_t *mp, eib_ether_hdr_t *evh)
454 {
455 struct ether_vlan_header *vl_hdr;
456 struct ether_header *hdr;
457
458 /*
459 * Assume that the ether header (with or without vlan tag) is
460 * contained in one fragment
461 */
462 hdr = (struct ether_header *)(void *)mp->b_rptr;
463 vl_hdr = (struct ether_vlan_header *)(void *)mp->b_rptr;
464
465 evh->eh_ether_type = ntohs(hdr->ether_type);
466 if (evh->eh_ether_type != ETHERTYPE_VLAN) {
467 evh->eh_tagless = 1;
468 evh->eh_vlan = 0;
469 ether_copy((void *)hdr->ether_dhost.ether_addr_octet,
470 (void *)evh->eh_dmac);
471 ether_copy((void *)hdr->ether_shost.ether_addr_octet,
472 (void *)evh->eh_smac);
473 } else {
474 evh->eh_ether_type = ntohs(vl_hdr->ether_type);
475 evh->eh_tagless = 0;
476 evh->eh_vlan = VLAN_ID(ntohs(vl_hdr->ether_tci));
477 ether_copy((void *)vl_hdr->ether_dhost.ether_addr_octet,
478 (void *)evh->eh_dmac);
479 ether_copy((void *)vl_hdr->ether_shost.ether_addr_octet,
480 (void *)evh->eh_smac);
481 }
482 }
483
484 int
485 eib_data_lookup_vnic(eib_t *ss, uint8_t *mac, uint16_t vlan, eib_vnic_t **vnicp,
486 boolean_t *failed)
487 {
488 eib_vnic_t *vnic;
489 eib_vnic_req_t *vrq;
490 uint8_t *vn_mac;
491 uint16_t vn_vlan;
492 uint64_t av;
493 int inst = 0;
494
495 if (mac == NULL)
496 return (EIB_E_FAILURE);
497
498 /*
499 * For now, a simple search (but only what we've allocated). Note that
500 * if we're in the process of creating a vnic, the instance might've
501 * been allocated, but the vnic entry would be NULL.
502 */
503 mutex_enter(&ss->ei_vnic_lock);
504 av = ss->ei_active_vnics;
505 while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
506 if ((vnic = ss->ei_vnic[inst]) != NULL) {
507 vn_mac = vnic->vn_login_data.ld_assigned_mac;
508 vn_vlan = vnic->vn_login_data.ld_assigned_vlan;
509
510 if ((vn_vlan == vlan) &&
511 (bcmp(vn_mac, mac, ETHERADDRL) == 0)) {
512 if (vnicp) {
513 *vnicp = vnic;
514 }
515 mutex_exit(&ss->ei_vnic_lock);
516 return (EIB_E_SUCCESS);
517 }
518 }
519
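/* Clear this instance's bit so EIB_FIND_LSB_SET() moves on */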
520 av &= (~((uint64_t)1 << inst));
521 }
522 mutex_exit(&ss->ei_vnic_lock);
523
524 /*
525 * If we haven't been able to locate a vnic for this {mac,vlan} tuple,
526 * see if we've already failed a creation request for this vnic, and
527 * return that information.
528 */
529 if (failed) {
530 mutex_enter(&ss->ei_vnic_req_lock);
531 *failed = B_FALSE;
532 for (vrq = ss->ei_failed_vnic_req; vrq; vrq = vrq->vr_next) {
533 if ((vrq->vr_vlan == vlan) &&
534 (bcmp(vrq->vr_mac, mac, ETHERADDRL) == 0)) {
535 *failed = B_TRUE;
536 }
537 }
538 mutex_exit(&ss->ei_vnic_req_lock);
539 }
540
541 return (EIB_E_FAILURE);
542 }
543
544 int
545 eib_data_prepare_frame(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp,
546 eib_ether_hdr_t *evh)
547 {
548 uint32_t mss;
549 uint32_t lsoflags;
550 uint32_t hckflags;
551
552 /*
553 * The swqe defaults are set to use the regular ud work request
554 * member and the IBT_WRC_SEND opcode, so we don't need to do
555 * anything here if this isn't an LSO packet.
556 */
557 mac_lso_get(mp, &mss, &lsoflags);
558 if ((lsoflags & HW_LSO) == HW_LSO)
559 eib_data_setup_lso(swqe, mp, mss, evh);
560
561 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
562 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) {
563 swqe->qe_wr.send.wr_flags |= IBT_WR_SEND_CKSUM;
564 } else {
565 swqe->qe_wr.send.wr_flags &= (~IBT_WR_SEND_CKSUM);
566 }
567
568 if (eib_data_prepare_sgl(vnic, swqe, mp) != 0)
569 return (EIB_E_FAILURE);
570
571 swqe->qe_mp = mp;
572
573 return (EIB_E_SUCCESS);
574 }
575
576 void
577 eib_rb_data_create_qp(eib_t *ss, eib_vnic_t *vnic)
578 {
579 eib_rb_data_setup_ud_channel(ss, vnic);
580
581 eib_rb_data_setup_cqs(ss, vnic);
582
583 eib_chan_fini(vnic->vn_data_chan);
584 vnic->vn_data_chan = NULL;
585 }
586
587 static int
588 eib_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
589 {
590 eib_chan_t *chan = vnic->vn_data_chan;
591 ibt_cq_attr_t cq_attr;
592 ibt_status_t ret;
593 uint_t snd_sz;
594 uint_t rcv_sz;
595 int rv;
596
597 /*
598 * Allocate send completion queue. Note that we've already verified
599 * that cp_max_swqe and cp_max_rwqe meet the max cq size requirements
600 * of the hca.
601 */
602 cq_attr.cq_sched = NULL;
603 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
604 cq_attr.cq_size = ss->ei_caps->cp_max_swqe + 1;
605
606 ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &snd_sz);
607 if (ret != IBT_SUCCESS) {
608 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
609 "ibt_alloc_cq(snd_cq_sz=0x%lx) failed, ret=%d",
610 cq_attr.cq_size, ret);
611 goto setup_data_cqs_fail;
612 }
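/*
 * Request completion coalescing on the send cq using
 * EIB_TX_COMP_COUNT/EIB_TX_COMP_USEC; a failure here is not
 * fatal, we just log a warning and continue.
 */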
613 ret = ibt_modify_cq(chan->ch_cq_hdl, EIB_TX_COMP_COUNT,
614 EIB_TX_COMP_USEC, 0);
615 if (ret != IBT_SUCCESS) {
616 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
617 "ibt_modify_cq(snd_comp_count=0x%lx, snd_comp_usec=0x%lx) "
618 "failed, ret=%d",
619 EIB_TX_COMP_COUNT, EIB_TX_COMP_USEC, ret);
620 }
621
622 /*
623 * Allocate receive completion queue
624 */
625 cq_attr.cq_sched = NULL;
626 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
627 cq_attr.cq_size = ss->ei_caps->cp_max_rwqe + 1;
628
629 ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_rcv_cq_hdl,
630 &rcv_sz);
631 if (ret != IBT_SUCCESS) {
632 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
633 "ibt_alloc_cq(rcv_cq_sz=0x%lx) failed, ret=%d",
634 cq_attr.cq_size, ret);
635 goto setup_data_cqs_fail;
636 }
637 ret = ibt_modify_cq(chan->ch_rcv_cq_hdl, EIB_RX_COMP_COUNT,
638 EIB_RX_COMP_USEC, 0);
639 if (ret != IBT_SUCCESS) {
640 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
641 "ibt_modify_cq(rcv_comp_count=0x%lx, rcv_comp_usec=0x%lx) "
642 "failed, ret=%d",
643 EIB_RX_COMP_COUNT, EIB_RX_COMP_USEC, ret);
644 }
645
646 /*
647 * Set up parameters for collecting tx and rx completion information
648 */
649 chan->ch_cq_sz = snd_sz;
650 chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * snd_sz, KM_SLEEP);
651 chan->ch_rcv_cq_sz = rcv_sz;
652 chan->ch_rcv_wc = kmem_zalloc(sizeof (ibt_wc_t) * rcv_sz, KM_SLEEP);
653
654 /*
655 * Set up the vnic's data tx completion queue handler and allocate
656 * a softint for it as well.
657 */
658 if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_tx_si_hdl,
659 EIB_SOFTPRI_DATA, eib_data_tx_comp_handler, vnic)) != DDI_SUCCESS) {
660 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
661 "ddi_intr_add_softint() failed for data tx qp, ret=%d", rv);
662 goto setup_data_cqs_fail;
663 }
664 ibt_set_cq_handler(chan->ch_cq_hdl, eib_data_tx_comp_intr, vnic);
665 ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
666 if (ret != IBT_SUCCESS) {
667 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
668 "ibt_enable_cq_notify() failed for tx cq, ret=%d", ret);
669 goto setup_data_cqs_fail;
670 }
671
672 /*
673 * And then the data rx completion queue handler
674 */
675 if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_rx_si_hdl,
676 EIB_SOFTPRI_DATA, eib_data_rx_comp_handler, vnic)) != DDI_SUCCESS) {
677 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
678 "ddi_intr_add_softint() failed for data rx qp, ret=%d", rv);
679 goto setup_data_cqs_fail;
680 }
681 ibt_set_cq_handler(chan->ch_rcv_cq_hdl, eib_data_rx_comp_intr, vnic);
682 ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
683 if (ret != IBT_SUCCESS) {
684 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
685 "ibt_enable_cq_notify() failed for rx cq, ret=%d", ret);
686 goto setup_data_cqs_fail;
687 }
688
689 return (EIB_E_SUCCESS);
690
691 setup_data_cqs_fail:
692 eib_rb_data_setup_cqs(ss, vnic);
693 return (EIB_E_FAILURE);
694 }
695
696 static int
697 eib_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
698 {
699 eib_chan_t *chan = vnic->vn_data_chan;
700 ibt_ud_chan_alloc_args_t alloc_attr;
701 ibt_ud_chan_query_attr_t query_attr;
702 ibt_status_t ret;
703
704 bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
705 bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));
706
707 alloc_attr.ud_flags = IBT_ALL_SIGNALED;
708 if (ss->ei_caps->cp_resv_lkey_capab)
709 alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
710 if (ss->ei_caps->cp_lso_maxlen)
711 alloc_attr.ud_flags |= IBT_USES_LSO;
712
713 alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
714 alloc_attr.ud_pkey_ix = chan->ch_pkey_ix;
715 alloc_attr.ud_sizes.cs_sq = ss->ei_caps->cp_max_swqe;
716 alloc_attr.ud_sizes.cs_rq = ss->ei_caps->cp_max_rwqe;
717 alloc_attr.ud_sizes.cs_sq_sgl = ss->ei_caps->cp_max_sgl;
718 alloc_attr.ud_sizes.cs_rq_sgl = 1;
719 alloc_attr.ud_sizes.cs_inline = 0;
720
721 alloc_attr.ud_qkey = EIB_DATA_QKEY;
722 alloc_attr.ud_scq = chan->ch_cq_hdl;
723 alloc_attr.ud_rcq = chan->ch_rcv_cq_hdl;
724 alloc_attr.ud_pd = ss->ei_pd_hdl;
725
726 ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS,
727 &alloc_attr, &chan->ch_chan, NULL);
728 if (ret != IBT_SUCCESS) {
729 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
730 "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x, "
731 "cs_sq=0x%lx, cs_rq=0x%lx, sq_sgl=0x%lx) failed, ret=%d",
732 alloc_attr.ud_hca_port_num, chan->ch_pkey_ix,
733 alloc_attr.ud_sizes.cs_sq, alloc_attr.ud_sizes.cs_rq,
734 alloc_attr.ud_sizes.cs_sq_sgl, ret);
735
736 goto setup_data_ud_channel_fail;
737 }
738
739 ret = ibt_query_ud_channel(chan->ch_chan, &query_attr);
740 if (ret != IBT_SUCCESS) {
741 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
742 "ibt_query_ud_channel() failed, ret=%d", ret);
743 goto setup_data_ud_channel_fail;
744 }
745
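/*
 * Record the actual queue sizes the hca gave us. The rwqe low water
 * mark is set to a quarter of the channel's max rwqes, and the rwqe
 * bucket size is capped at EIB_DATA_RWQE_BKT.
 */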
746 chan->ch_qpn = query_attr.ud_qpn;
747 chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq;
748 chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq;
749 chan->ch_lwm_rwqes = chan->ch_max_rwqes >> 2;
750 chan->ch_rwqe_bktsz = (chan->ch_max_rwqes < EIB_DATA_RWQE_BKT) ?
751 chan->ch_max_rwqes : EIB_DATA_RWQE_BKT;
752 chan->ch_ip_hdr_align = EIB_IP_HDR_ALIGN;
753 chan->ch_alloc_mp = B_TRUE;
754 chan->ch_tear_down = B_FALSE;
755
756 return (EIB_E_SUCCESS);
757
758 setup_data_ud_channel_fail:
759 eib_rb_data_setup_ud_channel(ss, vnic);
760 return (EIB_E_FAILURE);
761 }
762
763 static void
764 eib_data_setup_lso(eib_wqe_t *swqe, mblk_t *mp, uint32_t mss,
765 eib_ether_hdr_t *evh)
766 {
767 ibt_wr_lso_t *lso;
768 mblk_t *nmp;
769 uint8_t *dst;
770 uintptr_t ip_start;
771 uintptr_t tcp_start;
772 uint_t pending;
773 uint_t mblen;
774 uint_t eth_hdr_len;
775 uint_t ip_hdr_len;
776 uint_t tcp_hdr_len;
777
778 /*
779 * When the swqe was grabbed, it would've had its wr_opcode and
780 * wr.ud.udwr_dest set to default values. Since we're now going
781 * to use LSO, we need to change these.
782 */
783 swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND_LSO;
784 lso = &(swqe->qe_wr.send.wr.ud_lso);
785 lso->lso_ud_dest = swqe->qe_dest;
786 lso->lso_mss = mss;
787
788 /*
789 * Details of the ethernet header in the mp are already known to us
790 */
791 eth_hdr_len = (evh->eh_tagless) ? (sizeof (struct ether_header)) :
792 (sizeof (struct ether_vlan_header));
793
794 /*
795 * Calculate the LSO header size and set it in the UD LSO structure.
796 * Note that the only assumption we make is that each of the Ethernet,
797 * IP and TCP headers will be contained in a single mblk fragment;
798 * together, the headers may span multiple mblk fragments. Note also
799 * that since the EoIB encapsulation header is not part of the message
800 * block we receive, we'll need to account space for inserting it later.
801 */
802 nmp = mp;
803 ip_start = (uintptr_t)(nmp->b_rptr) + eth_hdr_len;
804 if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
805 ip_start = (uintptr_t)nmp->b_cont->b_rptr
806 + (ip_start - (uintptr_t)(nmp->b_wptr));
807 nmp = nmp->b_cont;
808 }
809 ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
810
811 tcp_start = ip_start + ip_hdr_len;
812 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
813 tcp_start = (uintptr_t)nmp->b_cont->b_rptr
814 + (tcp_start - (uintptr_t)(nmp->b_wptr));
815 nmp = nmp->b_cont;
816 }
817 tcp_hdr_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
818
819 /*
820 * Since the passed mp fragment never contains the EoIB encapsulation
821 * header, we always have to copy the lso header. Sigh.
822 */
823 lso->lso_hdr = swqe->qe_payload_hdr;
824 lso->lso_hdr_sz = EIB_ENCAP_HDR_SZ + eth_hdr_len +
825 ip_hdr_len + tcp_hdr_len;
826
827 /*
828 * We already have the EoIB encapsulation header written at the
829 * start of wqe->qe_payload_hdr during swqe acquisition. Only
830 * copy the remaining headers.
831 */
832 dst = lso->lso_hdr + EIB_ENCAP_HDR_SZ;
833 pending = lso->lso_hdr_sz - EIB_ENCAP_HDR_SZ;
834
835 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
836 mblen = MBLKL(nmp);
837 if (pending > mblen) {
838 bcopy(nmp->b_rptr, dst, mblen);
839 dst += mblen;
840 pending -= mblen;
841 } else {
842 bcopy(nmp->b_rptr, dst, pending);
843 break;
844 }
845 }
846 }
847
848 static int
849 eib_data_prepare_sgl(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp)
850 {
851 eib_t *ss = vnic->vn_ss;
852 eib_stats_t *stats = vnic->vn_ss->ei_stats;
853 ibt_iov_t iov_arr[EIB_MAX_SGL];
854 ibt_iov_attr_t iov_attr;
855 ibt_wr_ds_t *sgl;
856 ibt_status_t ret;
857 mblk_t *nmp;
858 mblk_t *data_mp;
859 uchar_t *bufp;
860 size_t blksize;
861 size_t skip;
862 size_t avail;
863 uint_t lsohdr_sz;
864 uint_t pktsz;
865 ptrdiff_t frag_len;
866 uint_t pending_hdr;
867 uint_t nblks;
868 uint_t i;
869
870 /*
871 * Let's skip ahead to the TCP data if this is LSO. Note that while
872 * the lso header size in the swqe includes the EoIB encapsulation
873 * header size, that encapsulation header itself won't be found in
874 * the mblk.
875 */
876 lsohdr_sz = (swqe->qe_wr.send.wr_opcode == IBT_WRC_SEND) ? 0 :
877 swqe->qe_wr.send.wr.ud_lso.lso_hdr_sz;
878
879 data_mp = mp;
880 pending_hdr = 0;
881 if (lsohdr_sz) {
882 pending_hdr = lsohdr_sz - EIB_ENCAP_HDR_SZ;
883 for (nmp = mp; nmp; nmp = nmp->b_cont) {
884 frag_len =
885 (uintptr_t)nmp->b_wptr - (uintptr_t)nmp->b_rptr;
886 if (frag_len > pending_hdr)
887 break;
888 pending_hdr -= frag_len;
889 }
890 data_mp = nmp; /* start of data past lso header */
891 ASSERT(data_mp != NULL);
892 }
893
894 /*
895 * If this is an LSO packet, we want pktsz to hold the size of the
896 * data following the eoib/ethernet/tcp/ip headers. If this is a
897 * non-LSO packet, we want pktsz to refer to the size of the entire
898 * packet with all the headers, and nblks to hold the number of
899 * mappings we'll need to iov map this (for reserved lkey request).
900 */
901 if (lsohdr_sz == 0) {
902 nblks = 1;
903 pktsz = EIB_ENCAP_HDR_SZ;
904 } else {
905 nblks = 0;
906 pktsz = 0;
907 }
908 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
909 pktsz += MBLKL(nmp);
910 nblks++;
911 }
912 pktsz -= pending_hdr;
913
914 EIB_UPDATE_COUNTER(&stats->st_obytes, pktsz);
915 EIB_INCR_COUNTER(&stats->st_opkts);
916
917 /*
918 * We only do ibt_map_mem_iov() if the pktsz is above the tx copy
919 * threshold and if the number of mp fragments is less than the
920 * maximum acceptable.
921 */
922 if ((ss->ei_caps->cp_resv_lkey_capab) && (pktsz > EIB_TX_COPY_THRESH) &&
923 (nblks < ss->ei_caps->cp_hiwm_sgl)) {
924
925 iov_attr.iov_as = NULL;
926 iov_attr.iov = iov_arr;
927 iov_attr.iov_buf = NULL;
928 iov_attr.iov_list_len = nblks;
929 iov_attr.iov_wr_nds = ss->ei_caps->cp_max_sgl;
930 iov_attr.iov_lso_hdr_sz = lsohdr_sz;
931 iov_attr.iov_flags = IBT_IOV_SLEEP;
932
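/*
 * Build the iov array: for non-LSO sends the EoIB encapsulation
 * header goes in as the first entry, followed by the mblk fragments;
 * for LSO sends the first data fragment skips whatever part of the
 * headers is being sent separately in the LSO header.
 */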
933 i = 0;
934 if (lsohdr_sz == 0) {
935 iov_arr[i].iov_addr = (caddr_t)swqe->qe_payload_hdr;
936 iov_arr[i].iov_len = EIB_ENCAP_HDR_SZ;
937 i++;
938 }
939 for (nmp = data_mp; i < nblks; i++, nmp = nmp->b_cont) {
940 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
941 iov_arr[i].iov_len = MBLKL(nmp);
942 if (nmp == data_mp) {
943 iov_arr[i].iov_addr += pending_hdr;
944 iov_arr[i].iov_len -= pending_hdr;
945 }
946 }
947 swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_MAPPED;
948 swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;
949
950 ret = ibt_map_mem_iov(ss->ei_hca_hdl, &iov_attr,
951 &swqe->qe_wr, &swqe->qe_iov_hdl);
952 if (ret != IBT_SUCCESS) {
953 EIB_DPRINTF_WARN(ss->ei_instance,
954 "eib_data_prepare_sgl: "
955 "ibt_map_mem_iov(nblks=0x%lx) failed, ret=%d ",
956 "attempting to use copy path", nblks, ret);
957 goto prepare_sgl_copy_path;
958 }
959
960 return (EIB_E_SUCCESS);
961 }
962
963 prepare_sgl_copy_path:
964 if (pktsz <= swqe->qe_bufsz) {
965 swqe->qe_wr.send.wr_nds = 1;
966 swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl;
967 swqe->qe_sgl.ds_len = pktsz;
968
969 /*
970 * Even though this is the copy path for transfers less than
971 * qe_bufsz, it could still be an LSO packet. If so, we only
972 * have to write the data following all the headers into the
973 * work request buffer, since we'll be sending the lso header
974 * itself separately. If this is not an LSO send (but pkt size
975 * greater than mtu, say for a jumbo frame), then we need
976 * to write all the headers including EoIB encapsulation,
977 * into the work request buffer.
978 */
979 bufp = (uchar_t *)(uintptr_t)swqe->qe_sgl.ds_va;
980 if (lsohdr_sz == 0) {
981 *(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
982 bufp += EIB_ENCAP_HDR_SZ;
983 }
984 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
985 blksize = MBLKL(nmp) - pending_hdr;
986 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
987 bufp += blksize;
988 pending_hdr = 0;
989 }
990
991 /*
992 * If the ethernet frame we're going to send is less than
993 * ETHERMIN, pad up the buffer to ETHERMIN (with zeros)
994 */
995 if ((pktsz + lsohdr_sz) < (ETHERMIN + EIB_ENCAP_HDR_SZ)) {
996 bzero(bufp, (ETHERMIN + EIB_ENCAP_HDR_SZ) -
997 (pktsz + lsohdr_sz));
998 swqe->qe_sgl.ds_len = ETHERMIN + EIB_ENCAP_HDR_SZ;
999 }
1000 return (EIB_E_SUCCESS);
1001 }
1002
1003 /*
1004 * Copy path for transfers greater than swqe->qe_bufsz
1005 */
1006 swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;
1007 if (eib_rsrc_grab_lsobufs(ss, pktsz, swqe->qe_wr.send.wr_sgl,
1008 &(swqe->qe_wr.send.wr_nds)) != EIB_E_SUCCESS) {
1009 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_prepare_sgl: "
1010 "eib_rsrc_grab_lsobufs() failed");
1011 return (EIB_E_FAILURE);
1012 }
1013 swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_LSO;
1014
1015 /*
1016 * Copy the larger-than-qe_bufsz packet into a set of fixed-sized,
1017 * pre-mapped LSO buffers. Note that we might need to skip part of
1018 * the LSO header in the first fragment as before.
1019 */
1020 nmp = data_mp;
1021 skip = pending_hdr;
1022 for (i = 0; i < swqe->qe_wr.send.wr_nds; i++) {
1023 sgl = swqe->qe_wr.send.wr_sgl + i;
1024 bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
1025 avail = EIB_LSO_BUFSZ;
1026
1027 /*
1028 * If this is a non-LSO packet (perhaps a jumbo frame?)
1029 * we may still need to prefix the EoIB header in the
1030 * wr buffer.
1031 */
1032 if ((i == 0) && (lsohdr_sz == 0)) {
1033 *(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
1034 bufp += EIB_ENCAP_HDR_SZ;
1035 avail -= EIB_ENCAP_HDR_SZ;
1036 }
1037
1038 while (nmp && avail) {
1039 blksize = MBLKL(nmp) - skip;
1040 if (blksize > avail) {
1041 bcopy(nmp->b_rptr + skip, bufp, avail);
1042 skip += avail;
1043 avail = 0;
1044 } else {
1045 bcopy(nmp->b_rptr + skip, bufp, blksize);
1046 skip = 0;
1047 bufp += blksize;
1048 avail -= blksize;
1049 nmp = nmp->b_cont;
1050 }
1051 }
1052 }
1053
1054 return (EIB_E_SUCCESS);
1055 }
1056
1057 /*ARGSUSED*/
1058 static int
1059 eib_data_is_mcast_pkt_ok(eib_vnic_t *vnic, uint8_t *macaddr, uint64_t *brdcst,
1060 uint64_t *multicst)
1061 {
1062 /*
1063 * If the dmac is the broadcast address, let it through. Otherwise, either
1064 * we should be in promiscuous mode or the dmac should be in our list of
1065 * joined multicast addresses. Currently we only update the stat
1066 * counters and always let things through.
1067 */
1068 if (bcmp(macaddr, eib_broadcast_mac, ETHERADDRL) == 0)
1069 EIB_INCR_COUNTER(brdcst);
1070 else
1071 EIB_INCR_COUNTER(multicst);
1072
1073 return (1);
1074 }
1075
1076 static void
1077 eib_data_rx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
1078 {
1079 eib_vnic_t *vnic = arg;
1080 eib_chan_t *chan = vnic->vn_data_chan;
1081 eib_t *ss = vnic->vn_ss;
1082
1083 if (cq_hdl != chan->ch_rcv_cq_hdl) {
1084 EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_rx_comp_intr: "
1085 "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
1086 "ignoring completion", cq_hdl, chan->ch_cq_hdl);
1087 return;
1088 }
1089
1090 ASSERT(vnic->vn_data_rx_si_hdl != NULL);
1091
1092 (void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl, NULL);
1093 }
1094
1095 static void
1096 eib_data_tx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
1097 {
1098 eib_vnic_t *vnic = arg;
1099 eib_chan_t *chan = vnic->vn_data_chan;
1100 eib_t *ss = vnic->vn_ss;
1101
1102 if (cq_hdl != chan->ch_cq_hdl) {
1103 EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_tx_comp_intr: "
1104 "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
1105 "ignoring completion", cq_hdl, chan->ch_cq_hdl);
1106 return;
1107 }
1108
1109 ASSERT(vnic->vn_data_tx_si_hdl != NULL);
1110
1111 (void) ddi_intr_trigger_softint(vnic->vn_data_tx_si_hdl, NULL);
1112 }
1113
1114 static mblk_t *
1115 eib_data_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
1116 {
1117 eib_t *ss = vnic->vn_ss;
1118 eib_chan_t *chan = vnic->vn_data_chan;
1119 eib_login_data_t *ld = &vnic->vn_login_data;
1120 eib_stats_t *stats = ss->ei_stats;
1121 eib_ether_hdr_t evh;
1122 mblk_t *mp;
1123 boolean_t allocd_mp = B_FALSE;
1124 uint_t ec_hdr;
1125 uint_t ec_sign;
1126 uint_t ec_ver;
1127 uint_t ec_tu_cs;
1128 uint_t ec_ip_cs;
1129
1130 /*
1131 * Before we process this mblk and send it up to network layer, see
1132 * if we're running low on rwqes in the wqe pool. If so, allocate a
1133 * new mblk, copy the received data into it and send it up (and return
1134 * the current rwqe back to the pool immediately by calling freemsg()
1135 * on the original mblk).
1136 */
1137 if (!eib_rsrc_rxpool_low(wqe)) {
1138 mp = wqe->qe_mp;
1139 } else {
1140 if ((mp = allocb(wc->wc_bytes_xfer, BPRI_HI)) != NULL) {
1141 bcopy(wqe->qe_mp->b_rptr, mp->b_rptr,
1142 wc->wc_bytes_xfer);
1143 freemsg(wqe->qe_mp);
1144 allocd_mp = B_TRUE;
1145 } else {
1146 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1147 "wqe level below watermark, dropping rx pkt");
1148 EIB_INCR_COUNTER(&stats->st_norcvbuf);
1149 freemsg(wqe->qe_mp);
1150 return (NULL);
1151 }
1152 }
1153
1154 /*
1155 * Adjust write pointer depending on how much data came in. Note that
1156 * since the nw layer will expect us to hand over the mp with the
1157 * ethernet header starting at mp->b_rptr, update the b_rptr as well.
1158 */
1159 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
1160
1161 /*
1162 * We have a problem if this really happens!
1163 */
1164 if (mp->b_next != NULL) {
1165 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1166 "received packet's b_next not NULL, possible dup from cq");
1167 mp->b_next = NULL;
1168 }
1169
1170 /*
1171 * Drop loopback packets?
1172 */
1173 if ((wc->wc_slid == ss->ei_props->ep_blid) &&
1174 (wc->wc_qpn == chan->ch_qpn)) {
1175 goto data_rx_comp_fail;
1176 }
1177
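/*
 * Skip the GRH space that precedes the EoIB payload in every UD
 * receive buffer before looking at the encapsulation header.
 */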
1178 mp->b_rptr += EIB_GRH_SZ;
1179
1180 /*
1181 * Since the recv buffer has been aligned for the IP header to start on
1182 * a word boundary, it is safe to say that the EoIB and ethernet
1183 * headers won't start on a word boundary.
1184 */
1185 bcopy(mp->b_rptr, &ec_hdr, EIB_ENCAP_HDR_SZ);
1186
1187 /*
1188 * Check EoIB signature and version
1189 */
1190 ec_hdr = ntohl(ec_hdr);
1191
1192 ec_sign = (ec_hdr >> EIB_ENCAP_SIGN_SHIFT) & EIB_ENCAP_SIGN_MASK;
1193 if (ec_sign != EIB_EH_SIGNATURE) {
1194 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1195 "EoIB encapsulation header signature (0x%lx) unknown",
1196 ec_sign);
1197 goto data_rx_comp_fail;
1198 }
1199
1200 ec_ver = (ec_hdr >> EIB_ENCAP_VER_SHIFT) & EIB_ENCAP_VER_MASK;
1201 if (ec_ver != EIB_EH_VERSION) {
1202 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1203 "EoIB encapsulation header version (0x%lx) unknown",
1204 ec_ver);
1205 goto data_rx_comp_fail;
1206 }
1207
1208 /*
1209 * Check TCP/UDP and IP checksum
1210 */
1211 ec_tu_cs = (ec_hdr >> EIB_ENCAP_TCPCHK_SHIFT) & EIB_ENCAP_TCPCHK_MASK;
1212 ec_ip_cs = (ec_hdr >> EIB_ENCAP_IPCHK_SHIFT) & EIB_ENCAP_IPCHK_MASK;
1213
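/*
 * If the encapsulation header reports both the tcp/udp and ip
 * checksums as good, mark the mblk so the stack can skip software
 * checksum verification.
 */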
1214 if ((ec_tu_cs == EIB_EH_UDPCSUM_OK || ec_tu_cs == EIB_EH_TCPCSUM_OK) &&
1215 (ec_ip_cs == EIB_EH_IPCSUM_OK)) {
1216 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
1217 } else if (ec_tu_cs == EIB_EH_CSUM_BAD || ec_ip_cs == EIB_EH_CSUM_BAD) {
1218 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1219 "EoIB encapsulation header tcp/udp checksum (0x%lx) or"
1220 "ip checksum (0x%lx) is bad", ec_tu_cs, ec_ip_cs);
1221 }
1222
1223 /*
1224 * Update the message block's b_rptr to the start of ethernet header
1225 * and parse the header information
1226 */
1227 mp->b_rptr += EIB_ENCAP_HDR_SZ;
1228 eib_data_parse_ether_hdr(mp, &evh);
1229
1230 /*
1231 * If the incoming packet is vlan-tagged, but the tag doesn't match
1232 * this vnic's vlan, drop it.
1233 */
1234 if ((evh.eh_tagless == 0) && (evh.eh_vlan != ld->ld_assigned_vlan)) {
1235 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1236 "received packet's vlan unknown, expected=0x%x, got=0x%x",
1237 ld->ld_assigned_vlan, evh.eh_vlan);
1238 goto data_rx_comp_fail;
1239 }
1240
1241 /*
1242 * Final checks to see if the unicast destination is indeed correct
1243 * and to see if the multicast address is ok for us.
1244 */
1245 if (EIB_UNICAST_MAC(evh.eh_dmac)) {
1246 if (bcmp(evh.eh_dmac, ld->ld_assigned_mac, ETHERADDRL) != 0) {
1247 uint8_t *exp;
1248 uint8_t *got;
1249
1250 exp = ld->ld_assigned_mac;
1251 got = evh.eh_dmac;
1252
1253 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1254 "received packet's macaddr mismatch, "
1255 "expected=%x:%x:%x:%x:%x:%x, got=%x:%x:%x:%x:%x:%x",
1256 exp[0], exp[1], exp[2], exp[3], exp[4], exp[5],
1257 got[0], got[1], got[2], got[3], got[4], got[5]);
1258
1259 goto data_rx_comp_fail;
1260 }
1261 } else {
1262 if (!eib_data_is_mcast_pkt_ok(vnic, evh.eh_dmac,
1263 &stats->st_brdcstrcv, &stats->st_multircv)) {
1264 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1265 "multicast packet not ok");
1266 goto data_rx_comp_fail;
1267 }
1268 }
1269
1270 /*
1271 * Strip ethernet FCS if present in the packet. ConnectX-2 doesn't
1272 * support ethernet FCS, so this shouldn't happen anyway.
1273 */
1274 if ((ec_hdr >> EIB_ENCAP_FCS_B_SHIFT) & 0x1) {
1275 EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
1276 "ethernet FCS present (ec_hdr=0%lx), ignoring",
1277 ec_hdr);
1278
1279 mp->b_wptr -= ETHERFCSL;
1280 }
1281
1282 /*
1283 * If this is the same mp as was in the original rwqe (i.e. we didn't
1284 * do any allocb()), then mark the rwqe flag so we know that its mblk
1285 * is with the network layer.
1286 */
1287 if (!allocd_mp) {
1288 wqe->qe_info |= EIB_WQE_FLG_WITH_NW;
1289 }
1290
1291 return (mp);
1292
1293 data_rx_comp_fail:
1294 freemsg(mp);
1295 return (NULL);
1296 }
1297
1298 static void
1299 eib_data_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, eib_chan_t *chan)
1300 {
1301 eib_t *ss = vnic->vn_ss;
1302 ibt_status_t ret;
1303
1304 if (wqe->qe_mp) {
1305 if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_MAPPED) {
1306 ret = ibt_unmap_mem_iov(ss->ei_hca_hdl,
1307 wqe->qe_iov_hdl);
1308 if (ret != IBT_SUCCESS) {
1309 EIB_DPRINTF_WARN(ss->ei_instance,
1310 "eib_data_tx_comp: "
1311 "ibt_unmap_mem_iov() failed, ret=%d", ret);
1312 }
1313 wqe->qe_iov_hdl = NULL;
1314 } else if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_LSO) {
1315 eib_rsrc_return_lsobufs(ss, wqe->qe_big_sgl,
1316 wqe->qe_wr.send.wr_nds);
1317 }
1318 freemsg(wqe->qe_mp);
1319 wqe->qe_mp = NULL;
1320 }
1321
1322 eib_rsrc_return_swqe(ss, wqe, chan);
1323 }
1324
1325 static void
1326 eib_data_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
1327 {
1328 eib_t *ss = vnic->vn_ss;
1329
1330 /*
1331 * Currently, all we do is report
1332 */
1333 switch (wc->wc_status) {
1334 case IBT_WC_WR_FLUSHED_ERR:
1335 break;
1336
1337 case IBT_WC_LOCAL_CHAN_OP_ERR:
1338 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
1339 "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ",
1340 wqe->qe_info);
1341 break;
1342
1343 case IBT_WC_LOCAL_PROTECT_ERR:
1344 EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
1345 "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ",
1346 wqe->qe_info);
1347 break;
1348 }
1349
1350 /*
1351 * When a wc indicates error, we do not attempt to repost the
1352 * rwqe but simply return it to the wqe pool. Also for rwqes,
1353 * attempting to free the mblk in the wqe invokes the
1354 * eib_data_rx_recycle() callback. For tx wqes, error handling
1355 * is the same as successful completion handling. We still
1356 * have to unmap iov/free lsobufs/free mblk and then return the
1357 * swqe to the pool.
1358 */
1359 if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) {
1360 ASSERT(wqe->qe_mp != NULL);
1361 freemsg(wqe->qe_mp);
1362 } else {
1363 eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
1364 }
1365 }
1366
1367 /*ARGSUSED*/
1368 static void
1369 eib_rb_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
1370 {
1371 eib_chan_t *chan = vnic->vn_data_chan;
1372 ibt_status_t ret;
1373
1374 if (chan == NULL)
1375 return;
1376
1377 /*
1378 * Reset any completion handlers we may have set up
1379 */
1380 if (chan->ch_rcv_cq_hdl) {
1381 ibt_set_cq_handler(chan->ch_rcv_cq_hdl, NULL, NULL);
1382 }
1383 if (chan->ch_cq_hdl) {
1384 ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL);
1385 }
1386
1387 /*
1388 * Remove any softints that were added
1389 */
1390 if (vnic->vn_data_rx_si_hdl) {
1391 (void) ddi_intr_remove_softint(vnic->vn_data_rx_si_hdl);
1392 vnic->vn_data_rx_si_hdl = NULL;
1393 }
1394 if (vnic->vn_data_tx_si_hdl) {
1395 (void) ddi_intr_remove_softint(vnic->vn_data_tx_si_hdl);
1396 vnic->vn_data_tx_si_hdl = NULL;
1397 }
1398
1399 /*
1400 * Release any work completion buffers we may have allocated
1401 */
1402 if (chan->ch_rcv_wc && chan->ch_rcv_cq_sz) {
1403 kmem_free(chan->ch_rcv_wc,
1404 sizeof (ibt_wc_t) * chan->ch_rcv_cq_sz);
1405 }
1406 chan->ch_rcv_cq_sz = 0;
1407 chan->ch_rcv_wc = NULL;
1408
1409 if (chan->ch_wc && chan->ch_cq_sz) {
1410 kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz);
1411 }
1412 chan->ch_cq_sz = 0;
1413 chan->ch_wc = NULL;
1414
1415 /*
1416 * Free any completion queues we may have allocated
1417 */
1418 if (chan->ch_rcv_cq_hdl) {
1419 ret = ibt_free_cq(chan->ch_rcv_cq_hdl);
1420 if (ret != IBT_SUCCESS) {
1421 EIB_DPRINTF_WARN(ss->ei_instance,
1422 "eib_rb_data_setup_cqs: "
1423 "ibt_free_cq(rcv_cq) failed, ret=%d", ret);
1424 }
1425 chan->ch_rcv_cq_hdl = NULL;
1426 }
1427 if (chan->ch_cq_hdl) {
1428 ret = ibt_free_cq(chan->ch_cq_hdl);
1429 if (ret != IBT_SUCCESS) {
1430 EIB_DPRINTF_WARN(ss->ei_instance,
1431 "eib_rb_data_setup_cqs: "
1432 "ibt_free_cq(snd_cq) failed, ret=%d", ret);
1433 }
1434 chan->ch_cq_hdl = NULL;
1435 }
1436 }
1437
1438 /*ARGSUSED*/
1439 static void
1440 eib_rb_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
1441 {
1442 eib_chan_t *chan = vnic->vn_data_chan;
1443 ibt_status_t ret;
1444
1445 if (chan == NULL)
1446 return;
1447
1448 if (chan->ch_chan) {
1449 /*
1450 * We're trying to tear down this UD channel. Make sure that
1451 * we don't attempt to refill (repost) at any point from now on.
1452 */
1453 chan->ch_tear_down = B_TRUE;
1454 if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) {
1455 EIB_DPRINTF_WARN(ss->ei_instance,
1456 "eib_rb_data_setup_ud_channel: "
1457 "ibt_flush_channel() failed, ret=%d", ret);
1458 }
1459
1460 /*
1461 * Wait until all posted tx wqes on this channel are back with
1462 * the wqe pool.
1463 */
1464 mutex_enter(&chan->ch_tx_lock);
1465 while (chan->ch_tx_posted > 0)
1466 cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock);
1467 mutex_exit(&chan->ch_tx_lock);
1468
1469 /*
1470 * Wait until all posted rx wqes on this channel are back with
1471 * the wqe pool.
1472 */
1473 mutex_enter(&chan->ch_rx_lock);
1474 while (chan->ch_rx_posted > 0)
1475 cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock);
1476 mutex_exit(&chan->ch_rx_lock);
1477
1478 /*
1479 * Now we're ready to free this channel
1480 */
1481 if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) {
1482 EIB_DPRINTF_WARN(ss->ei_instance,
1483 "eib_rb_data_setup_ud_channel: "
1484 "ibt_free_channel() failed, ret=%d", ret);
1485 }
1486
1487 chan->ch_alloc_mp = B_FALSE;
1488 chan->ch_ip_hdr_align = 0;
1489 chan->ch_rwqe_bktsz = 0;
1490 chan->ch_lwm_rwqes = 0;
1491 chan->ch_max_rwqes = 0;
1492 chan->ch_max_swqes = 0;
1493 chan->ch_qpn = 0;
1494 chan->ch_chan = NULL;
1495 }
1496 }
1497