xref: /titanic_51/usr/src/uts/common/io/ib/adapters/hermon/hermon_wr.c (revision 17a2b317610f531d565bf4e940433aab2d9e6985)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * hermon_wr.c
28  *    Hermon Work Request Processing Routines
29  *
30  *    Implements all the routines necessary to provide the PostSend(),
31  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
32  *    necessary to implement the Hermon WRID tracking mechanism.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/conf.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/modctl.h>
40 #include <sys/avl.h>
41 
42 #include <sys/ib/adapters/hermon/hermon.h>
43 
44 static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
45 static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
46 static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
47     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
48 static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
49     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
50 static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
51 static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
52     ibt_recv_wr_t *wr, uint64_t *desc);
53 static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
54     ibt_recv_wr_t *wr, uint64_t *desc);
55 static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
56     uint_t send_or_recv);
57 static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
58 static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
59     hermon_workq_avl_t *wqavl);
60 
61 static	ibt_wr_ds_t	null_sgl = { 0, 0x00000100, 0 };
62 
63 /*
64  * Add ability to try to debug RDMA_READ/RDMA_WRITE failures.
65  *
66  *      0x1 - print rkey used during post_send
67  *      0x2 - print sgls used during post_send
68  *      0x4 - print FMR comings and goings
69  */
70 int hermon_rdma_debug = 0x0;
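
/*
 * Example (a debugging sketch, not part of the driver itself): the
 * flags above can be set on a live system with mdb, e.g.
 *
 *	# echo "hermon_rdma_debug/W 3" | mdb -kw
 *
 * or persistently in /etc/system:
 *
 *	set hermon:hermon_rdma_debug = 0x3
 */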
71 
72 static int
73 hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
74     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
75 {
76 	hermon_hw_snd_wqe_ud_t		*ud;
77 	hermon_workq_hdr_t		*wq;
78 	hermon_ahhdl_t			ah;
79 	ibt_wr_rfci_send_t		*rfci;
80 	ibt_wr_init_send_t		*is;
81 	ibt_ud_dest_t			*dest;
82 	uint64_t			*desc;
83 	uint32_t			desc_sz;
84 	uint32_t			signaled_dbd, solicited;
85 	uint32_t			head, tail, next_tail, qsize_msk;
86 	uint32_t			hdrmwqes;
87 	uint32_t			nopcode, fence, immed_data = 0;
88 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
89 	ibt_wr_ds_t			*sgl;
90 	int				nds;
91 	int				i, j, last_ds, num_ds, status;
92 	uint32_t			*wqe_start;
93 	int				sectperwqe;
94 	uint_t				posted_cnt = 0;
95 	int				total_len, strong_order, fc_bits, cksum;
96 
97 
98 	/* initialize the FMA retry loop */
99 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
100 
101 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
102 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
103 
104 	/* Sync with the WRID list state; a membar is used in lieu of a lock */
105 	membar_consumer();
106 
107 	/* Save away some initial QP state */
108 	wq = qp->qp_sq_wqhdr;
109 	qsize_msk = wq->wq_mask;
110 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
111 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
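
	/*
	 * Note: a send WQE is (1 << qp_sq_log_wqesz) bytes, so
	 * "sectperwqe" is the WQE size in 32-bit words.  The headroom
	 * stamping loops below step through it 16 words (64 bytes) at
	 * a time, touching the first dword of every 64-byte section
	 * after the first.
	 */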
112 
113 	tail	  = wq->wq_tail;
114 	head	  = wq->wq_head;
115 	status	  = DDI_SUCCESS;
116 
117 post_next:
118 	/*
119 	 * Check for "queue full" condition.  If the queue
120 	 * is already full, then no more WQEs can be posted.
121 	 * So break out, ring a doorbell (if necessary) and
122 	 * return an error
123 	 */
124 	if (wq->wq_full != 0) {
125 		status = IBT_QP_FULL;
126 		goto done;
127 	}
128 
129 	next_tail = (tail + 1) & qsize_msk;
130 	if (((tail + hdrmwqes) & qsize_msk) == head) {
131 		wq->wq_full = 1;
132 	}
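
	/*
	 * Note that "full" is declared while "hdrmwqes" entries still
	 * separate the tail from the head.  That headroom is kept as a
	 * run of descriptors stamped invalid (0xFFFFFFFF in the first
	 * dword of each section) ahead of the valid WQEs, so hardware
	 * prefetching past the tail never finds a stale descriptor
	 * that looks owned.
	 */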
133 
134 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
135 
136 	nds = wr->wr_nds;
137 	sgl = wr->wr_sgl;
138 	num_ds = 0;
139 	strong_order = 0;
140 	fc_bits = 0;
141 	cksum = 0;
142 
143 	/*
144 	 * Build a Send or Send_LSO WQE
145 	 */
146 	switch (wr->wr_opcode) {
147 	case IBT_WRC_SEND_LSO:
148 		if (wr->wr_trans != IBT_UD_SRV) {
149 			status = IBT_QP_SRV_TYPE_INVALID;
150 			goto done;
151 		}
152 		nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
153 		if (wr->wr_flags & IBT_WR_SEND_CKSUM)
154 			cksum = 0x30;
155 		if (wr->wr.ud_lso.lso_hdr_sz > 60) {
156 			nopcode |= (1 << 6);	/* ReRead bit must be set */
157 		}
158 		dest = wr->wr.ud_lso.lso_ud_dest;
159 		ah = (hermon_ahhdl_t)dest->ud_ah;
160 		if (ah == NULL) {
161 			status = IBT_AH_HDL_INVALID;
162 			goto done;
163 		}
164 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
165 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
166 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
167 		    sizeof (hermon_hw_snd_wqe_ud_t));
168 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
169 
170 		total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
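		/*
		 * Worked example: lso_hdr_sz == 54 gives
		 * (4 + 0xf + 54) & ~0xf == 64, i.e. one dword of LSO
		 * info plus 54 header bytes, rounded up to the next
		 * 16-byte boundary.
		 */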
171 		if ((uintptr_t)ds + total_len + (nds * 16) >
172 		    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
173 			status = IBT_QP_SGL_LEN_INVALID;
174 			goto done;
175 		}
176 		old_ds = ds;
177 		bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
178 		    wr->wr.ud_lso.lso_hdr_sz);
179 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
180 		i = 0;
181 		break;
182 
183 	case IBT_WRC_SEND:
184 		nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
185 		if (qp->qp_serv_type == HERMON_QP_UD) {
186 			if (wr->wr_trans != IBT_UD_SRV) {
187 				status = IBT_QP_SRV_TYPE_INVALID;
188 				goto done;
189 			}
190 			if (wr->wr_flags & IBT_WR_SEND_CKSUM)
191 				cksum = 0x30;
192 			dest = wr->wr.ud.udwr_dest;
193 		} else if (qp->qp_serv_type == HERMON_QP_RFCI) {
194 			if (wr->wr_trans != IBT_RFCI_SRV) {
195 				status = IBT_QP_SRV_TYPE_INVALID;
196 				goto done;
197 			}
198 			rfci = &wr->wr.fc.rfci_send;
199 			if ((wr->wr_flags & IBT_WR_SEND_FC_CRC) != 0) {
200 				nopcode |= (rfci->rfci_eof << 16);
201 				fc_bits = 0x40;	/* set FCRC */
202 			}
203 			dest = rfci->rfci_dest;
204 		} else {
205 			status = IBT_QP_OP_TYPE_INVALID;
206 			goto done;
207 		}
208 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
209 			/* "|=" changes 0xa to 0xb without touching FCEOF */
210 			nopcode |= HERMON_WQE_SEND_NOPCODE_SENDI;
211 			immed_data = wr->wr.ud.udwr_immed;
212 		}
213 		ah = (hermon_ahhdl_t)dest->ud_ah;
214 		if (ah == NULL) {
215 			status = IBT_AH_HDL_INVALID;
216 			goto done;
217 		}
218 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
219 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
220 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
221 		    sizeof (hermon_hw_snd_wqe_ud_t));
222 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
223 		i = 0;
224 		break;
225 
226 	case IBT_WRC_INIT_SEND_FCMD:
227 		if (qp->qp_serv_type != HERMON_QP_FCMND) {
228 			status = IBT_QP_OP_TYPE_INVALID;
229 			goto done;
230 		}
231 		if (wr->wr_trans != IBT_FCMD_SRV) {
232 			status = IBT_QP_SRV_TYPE_INVALID;
233 			goto done;
234 		}
235 		nopcode = HERMON_WQE_FCP_OPCODE_INIT_AND_SEND;
236 		is = wr->wr.fc.fc_is;
237 		dest = is->is_ctl.fc_dest;
238 		ah = (hermon_ahhdl_t)dest->ud_ah;
239 		if (ah == NULL) {
240 			status = IBT_AH_HDL_INVALID;
241 			goto done;
242 		}
243 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
244 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
245 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
246 		    sizeof (hermon_hw_snd_wqe_ud_t));
247 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
248 		old_ds = ds;
249 		/* move ds beyond the FCP-3 Init Segment */
250 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + 0x10);
251 		i = 0;
252 		break;
253 
254 	case IBT_WRC_FAST_REG_PMR:
255 	{
256 		hermon_hw_snd_wqe_frwr_t	*frwr;
257 
258 		if (qp->qp_serv_type != HERMON_QP_FCMND) {
259 			status = IBT_QP_OP_TYPE_INVALID;
260 			goto done;
261 		}
262 		if (wr->wr_trans != IBT_FCMD_SRV) {
263 			status = IBT_QP_SRV_TYPE_INVALID;
264 			goto done;
265 		}
266 		nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
267 		frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
268 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
269 		HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.fc.reg_pmr);
270 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
271 		    sizeof (hermon_hw_snd_wqe_frwr_t));
272 		nds = 0;
273 		strong_order = 0x80;
274 		break;
275 	}
276 
277 #if 0
278 	/* firmware does not support this */
279 	case IBT_WRC_LOCAL_INVALIDATE:
280 	{
281 		hermon_hw_snd_wqe_local_inv_t	*li;
282 
283 		if (qp->qp_serv_type != HERMON_QP_FCMND) {
284 			status = IBT_QP_OP_TYPE_INVALID;
285 			goto done;
286 		}
287 		if (wr->wr_trans != IBT_FCMD_SRV) {
288 			status = IBT_QP_SRV_TYPE_INVALID;
289 			goto done;
290 		}
291 		nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
292 		li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
293 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
294 		HERMON_WQE_BUILD_LI(qp, li, wr->wr.fc.li);
295 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
296 		    sizeof (hermon_hw_snd_wqe_local_inv_t));
297 		nds = 0;
298 		strong_order = 0x80;
299 		break;
300 	}
301 #endif
302 	default:
303 		status = IBT_QP_OP_TYPE_INVALID;
304 		goto done;
305 	}
306 
307 	if (nds > qp->qp_sq_sgl) {
308 		status = IBT_QP_SGL_LEN_INVALID;
309 		goto done;
310 	}
311 	for (last_ds = num_ds, j = i; j < nds; j++) {
312 		if (sgl[j].ds_len != 0)
313 			last_ds++;	/* real last ds of wqe to fill */
314 	}
315 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
316 	for (j = nds; --j >= i; ) {
317 		if (sgl[j].ds_len == 0) {
318 			continue;
319 		}
320 
321 		/*
322 		 * Fill in the Data Segment(s) for the current WQE, using the
323 		 * information contained in the scatter-gather list of the
324 		 * work request.
325 		 */
326 		last_ds--;
327 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
328 	}
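
	/*
	 * The loop above fills the data segments in reverse order
	 * (highest index first).  The likely intent, as in other
	 * ConnectX drivers, is that the stamped dword at the front of
	 * each 64-byte section is overwritten last, so a prefetching
	 * HCA never sees a partially built scatter list.
	 */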
329 
330 	membar_producer();
331 
332 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
333 		HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
334 		    wr->wr.ud_lso.lso_hdr_sz);
335 	} else if (wr->wr_opcode == IBT_WRC_INIT_SEND_FCMD) {
336 		/* This sits in the STAMP, so must be set after setting SGL */
337 		HERMON_WQE_BUILD_FCP3_INIT(old_ds, is->is_ctl.fc_frame_ctrl,
338 		    is->is_cs_priority, is->is_tx_seq_id, is->is_fc_mtu,
339 		    is->is_dest_id, is->is_op, is->is_rem_exch,
340 		    is->is_exch_qp_idx);
341 
342 		/* The following will be used in HERMON_WQE_SET_CTRL_SEGMENT */
343 		/* SIT bit in FCP-3 ctrl segment */
344 		desc_sz |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_SIT) ? 0x80 : 0;
345 		/* LS bit in FCP-3 ctrl segment */
346 		fc_bits |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_LAST_SEQ) ?
347 		    0x10000 : 0;
348 		fc_bits |= ((is->is_ctl.fc_routing_ctrl & 0xF) << 20) |
349 		    (is->is_ctl.fc_seq_id << 24);
350 		immed_data = is->is_ctl.fc_parameter;
351 	}
352 
353 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
354 
355 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
356 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;
357 
358 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;
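
	/*
	 * These literals are the ctrl-segment flag encodings consumed
	 * by HERMON_WQE_SET_CTRL_SEGMENT: 0xC requests a CQE for this
	 * WQE (signaled completion) and 0x2 is the solicited-event
	 * bit, which lines up with the ConnectX control-segment
	 * flag layout.
	 */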
359 
360 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
361 	    solicited, signaled_dbd, cksum, qp, strong_order, fc_bits);
362 
363 	wq->wq_wrid[tail] = wr->wr_id;
364 
365 	tail = next_tail;
366 
367 	/* Update some of the state in the QP */
368 	wq->wq_tail = tail;
369 
370 	membar_producer();
371 
372 	/* Now set the ownership bit and opcode (first dword). */
373 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
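
	/*
	 * Ordering is critical here: the membar_producer() above
	 * guarantees the rest of the WQE is visible before this write,
	 * so the HCA, which checks the ownership bit, can never
	 * execute a half-built descriptor.
	 */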
374 
375 	posted_cnt++;
376 	if (--num_wr > 0) {
377 		/* do the invalidate of the headroom */
378 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
379 		    (tail + hdrmwqes) & qsize_msk);
380 		for (i = 16; i < sectperwqe; i += 16) {
381 			wqe_start[i] = 0xFFFFFFFF;
382 		}
383 
384 		wr++;
385 		goto post_next;
386 	}
387 done:
388 	if (posted_cnt != 0) {
389 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
390 
391 		membar_producer();
392 
393 		/* the FMA retry loop starts for the Hermon doorbell register. */
394 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
395 		    fm_status, fm_test_num);
396 
397 		HERMON_UAR_DOORBELL(state, uarhdl,
398 		    (uint64_t *)(void *)&state->hs_uar->send,
399 		    (uint64_t)qp->qp_ring);
400 
401 		/* the FMA retry loop ends. */
402 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
403 		    fm_status, fm_test_num);
404 
405 		/* do the invalidate of the headroom */
406 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
407 		    (tail + hdrmwqes) & qsize_msk);
408 		for (i = 16; i < sectperwqe; i += 16) {
409 			wqe_start[i] = 0xFFFFFFFF;
410 		}
411 	}
412 	if (num_posted != NULL)
413 		*num_posted = posted_cnt;
414 
415 	mutex_exit(&qp->qp_sq_lock);
416 
417 	return (status);
418 
419 pio_error:
420 	mutex_exit(&qp->qp_sq_lock);
421 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
422 	return (ibc_get_ci_failure(0));
423 }
424 
425 static int
426 hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
427     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
428 {
429 	uint64_t			*desc;
430 	hermon_workq_hdr_t		*wq;
431 	uint32_t			desc_sz;
432 	uint32_t			signaled_dbd, solicited;
433 	uint32_t			head, tail, next_tail, qsize_msk;
434 	uint32_t			hdrmwqes;
435 	int				status;
436 	uint32_t			nopcode, fence, immed_data = 0;
437 	hermon_hw_snd_wqe_remaddr_t	*rc;
438 	hermon_hw_snd_wqe_atomic_t	*at;
439 	hermon_hw_snd_wqe_bind_t	*bn;
440 	hermon_hw_snd_wqe_frwr_t	*frwr;
441 	hermon_hw_snd_wqe_local_inv_t	*li;
442 	hermon_hw_wqe_sgl_t		*ds;
443 	ibt_wr_ds_t			*sgl;
444 	int				nds;
445 	int				i, last_ds, num_ds;
446 	uint32_t			*wqe_start;
447 	int				sectperwqe;
448 	uint_t				posted_cnt = 0;
449 	int				strong_order;
450 	int				print_rdma;
451 	int				rlen;
452 	uint32_t			rkey;
453 	uint64_t			raddr;
454 
455 	/* initialize the FMA retry loop */
456 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
457 
458 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
459 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
460 
461 	/* Save away some initial QP state */
462 	wq = qp->qp_sq_wqhdr;
463 	qsize_msk = wq->wq_mask;
464 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
465 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
466 
467 	tail	  = wq->wq_tail;
468 	head	  = wq->wq_head;
469 	status	  = DDI_SUCCESS;
470 
471 post_next:
472 	print_rdma = 0;
473 	rlen = 0;
474 	strong_order = 0;
475 
476 	/*
477 	 * Check for "queue full" condition.  If the queue
478 	 * is already full, then no more WQEs can be posted.
479 	 * So break out, ring a doorbell (if necessary) and
480 	 * return an error
481 	 */
482 	if (wq->wq_full != 0) {
483 		status = IBT_QP_FULL;
484 		goto done;
485 	}
486 	next_tail = (tail + 1) & qsize_msk;
487 	if (((tail + hdrmwqes) & qsize_msk) == head) {
488 		wq->wq_full = 1;
489 	}
490 
491 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
492 
493 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
494 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
495 	nds = wr->wr_nds;
496 	sgl = wr->wr_sgl;
497 	num_ds = 0;
498 	if (wr->wr_trans != IBT_RC_SRV) {
499 		status = IBT_QP_SRV_TYPE_INVALID;
500 		goto done;
501 	}
502 
503 	/*
504 	 * Validate the operation type.  For RC requests, we allow "Send",
505 	 * "RDMA Read", "RDMA Write", various "Atomic" operations, memory
506 	 * window "Bind", "Fast Register PMR", and "Local Invalidate"
507 	 */
508 	switch (wr->wr_opcode) {
509 	default:
510 		status = IBT_QP_OP_TYPE_INVALID;
511 		goto done;
512 
513 	case IBT_WRC_SEND:
514 		if (wr->wr_flags & IBT_WR_SEND_REMOTE_INVAL) {
515 			nopcode = HERMON_WQE_SEND_NOPCODE_SND_INV;
516 			immed_data = wr->wr.rc.rcwr.send_inval;
517 		} else if (wr->wr_flags & IBT_WR_SEND_IMMED) {
518 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
519 			immed_data = wr->wr.rc.rcwr.send_immed;
520 		} else {
521 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
522 		}
523 		break;
524 
525 	/*
526 	 * If this is an RDMA Read or RDMA Write request, then fill
527 	 * in the "Remote Address" header fields.
528 	 */
529 	case IBT_WRC_RDMAW:
530 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
531 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
532 			immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
533 		} else {
534 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
535 		}
536 		/* FALLTHROUGH */
537 	case IBT_WRC_RDMAR:
538 		if (wr->wr_opcode == IBT_WRC_RDMAR)
539 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
540 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
541 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
542 
543 		/*
544 		 * Build the Remote Address Segment for the WQE, using
545 		 * the information from the RC work request.
546 		 */
547 		HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
548 
549 		if (hermon_rdma_debug) {
550 			print_rdma = hermon_rdma_debug;
551 			rkey = wr->wr.rc.rcwr.rdma.rdma_rkey;
552 			raddr = wr->wr.rc.rcwr.rdma.rdma_raddr;
553 		}
554 
555 		/* Update "ds" for filling in Data Segments (below) */
556 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
557 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
558 		break;
559 
560 	/*
561 	 * If this is one of the Atomic type operations (i.e
562 	 * Compare-Swap or Fetch-Add), then fill in both the "Remote
563 	 * Address" header fields and the "Atomic" header fields.
564 	 */
565 	case IBT_WRC_CSWAP:
566 		nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
567 		/* FALLTHROUGH */
568 	case IBT_WRC_FADD:
569 		if (wr->wr_opcode == IBT_WRC_FADD)
570 			nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
571 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
572 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
573 		at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
574 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
575 
576 		/*
577 		 * Build the Remote Address and Atomic Segments for
578 		 * the WQE, using the information from the RC Atomic
579 		 * work request.
580 		 */
581 		HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
582 		HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
583 
584 		/* Update "ds" for filling in Data Segments (below) */
585 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
586 		    sizeof (hermon_hw_snd_wqe_atomic_t));
587 
588 		/*
589 		 * Update "nds" and "sgl" because Atomic requests have
590 		 * only a single Data Segment.
591 		 */
592 		nds = 1;
593 		sgl = wr->wr_sgl;
594 		break;
595 
596 	/*
597 	 * If this is memory window Bind operation, then we call the
598 	 * hermon_wr_bind_check() routine to validate the request and
599 	 * to generate the updated RKey.  If this is successful, then
600 	 * we fill in the WQE's "Bind" header fields.
601 	 */
602 	case IBT_WRC_BIND:
603 		nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
604 		status = hermon_wr_bind_check(state, wr);
605 		if (status != DDI_SUCCESS)
606 			goto done;
607 
608 		bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
609 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
610 
611 		/*
612 		 * Build the Bind Memory Window Segments for the WQE,
613 		 * using the information from the RC Bind memory
614 		 * window work request.
615 		 */
616 		HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
617 
618 		/*
619 		 * Update the "ds" pointer.  Even though the "bind"
620 		 * operation requires no SGLs, this is necessary to
621 		 * facilitate the correct descriptor size calculations
622 		 * (below).
623 		 */
624 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
625 		    sizeof (hermon_hw_snd_wqe_bind_t));
626 		nds = 0;
627 		break;
628 
629 	case IBT_WRC_FAST_REG_PMR:
630 		nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
631 		frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
632 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
633 		HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.rc.rcwr.reg_pmr);
634 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
635 		    sizeof (hermon_hw_snd_wqe_frwr_t));
636 		nds = 0;
637 		strong_order = 0x80;
638 		break;
639 
640 	case IBT_WRC_LOCAL_INVALIDATE:
641 		nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
642 		li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
643 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
644 		HERMON_WQE_BUILD_LI(qp, li, wr->wr.rc.rcwr.li);
645 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
646 		    sizeof (hermon_hw_snd_wqe_local_inv_t));
647 		nds = 0;
648 		strong_order = 0x80;
649 		break;
650 	}
651 
652 	/*
653 	 * Now fill in the Data Segments (SGL) for the Send WQE based
654 	 * on the values set up above (i.e. "sgl", "nds", and the "ds"
655 	 * pointer). Start by checking for a valid number of SGL entries
656 	 */
657 	if (nds > qp->qp_sq_sgl) {
658 		status = IBT_QP_SGL_LEN_INVALID;
659 		goto done;
660 	}
661 
662 	for (last_ds = num_ds, i = 0; i < nds; i++) {
663 		if (sgl[i].ds_len != 0)
664 			last_ds++;	/* real last ds of wqe to fill */
665 	}
666 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
667 	for (i = nds; --i >= 0; ) {
668 		if (sgl[i].ds_len == 0) {
669 			continue;
670 		}
671 		rlen += sgl[i].ds_len;
672 		if (print_rdma & 0x2)
673 			IBTF_DPRINTF_L2("rdma", "post: [%d]: laddr %llx  "
674 			    "llen %x", i, sgl[i].ds_va, sgl[i].ds_len);
675 
676 		/*
677 		 * Fill in the Data Segment(s) for the current WQE, using the
678 		 * information contained in the scatter-gather list of the
679 		 * work request.
680 		 */
681 		last_ds--;
682 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
683 	}
684 	/* ensure RDMA READ does not exceed HCA limit */
685 	if ((wr->wr_opcode == IBT_WRC_RDMAR) && (desc_sz >
686 	    state->hs_ibtfinfo.hca_attr->hca_conn_rdma_read_sgl_sz + 2)) {
687 		status = IBT_QP_SGL_LEN_INVALID;
688 		goto done;
689 	}
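
	/*
	 * The "+ 2" presumably allows for the two non-data 16-byte
	 * chunks counted in "desc_sz" (the control segment and the
	 * remote-address segment) on top of the advertised per-read
	 * SGL limit.
	 */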
690 
691 	if (print_rdma & 0x1) {
692 		IBTF_DPRINTF_L2("rdma", "post: indx %x  rkey %x  raddr %llx  "
693 		    "total len %x", tail, rkey, raddr, rlen);
694 	}
695 
696 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
697 
698 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
699 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;
700 
701 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;
702 
703 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
704 	    signaled_dbd, 0, qp, strong_order, 0);
705 
706 	wq->wq_wrid[tail] = wr->wr_id;
707 
708 	tail = next_tail;
709 
710 	/* Update some of the state in the QP */
711 	wq->wq_tail = tail;
712 
713 	membar_producer();
714 
715 	/* Now set the ownership bit and opcode (first dword). */
716 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
717 
718 	posted_cnt++;
719 	if (--num_wr > 0) {
720 		/* do the invalidate of the headroom */
721 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
722 		    (tail + hdrmwqes) & qsize_msk);
723 		for (i = 16; i < sectperwqe; i += 16) {
724 			wqe_start[i] = 0xFFFFFFFF;
725 		}
726 
727 		wr++;
728 		goto post_next;
729 	}
730 done:
731 
732 	if (posted_cnt != 0) {
733 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
734 
735 		membar_producer();
736 
737 		/* the FMA retry loop starts for the Hermon doorbell register. */
738 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
739 		    fm_status, fm_test_num);
740 
741 		/* Ring the doorbell */
742 		HERMON_UAR_DOORBELL(state, uarhdl,
743 		    (uint64_t *)(void *)&state->hs_uar->send,
744 		    (uint64_t)qp->qp_ring);
745 
746 		/* the FMA retry loop ends. */
747 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
748 		    fm_status, fm_test_num);
749 
750 		/* do the invalidate of the headroom */
751 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
752 		    (tail + hdrmwqes) & qsize_msk);
753 		for (i = 16; i < sectperwqe; i += 16) {
754 			wqe_start[i] = 0xFFFFFFFF;
755 		}
756 	}
757 	/*
758 	 * Update the "num_posted" return value (if necessary).
759 	 * Then drop the lock and return the status.
760 	 */
761 	if (num_posted != NULL) {
762 		*num_posted = posted_cnt;
763 	}
764 
765 	mutex_exit(&qp->qp_sq_lock);
766 	return (status);
767 
768 pio_error:
769 	mutex_exit(&qp->qp_sq_lock);
770 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
771 	return (ibc_get_ci_failure(0));
772 }
773 
774 /*
775  * hermon_post_send()
776  *    Context: Can be called from interrupt or base context.
777  */
778 int
779 hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
780     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
781 {
782 	ibt_send_wr_t 			*curr_wr;
783 	hermon_workq_hdr_t		*wq;
784 	hermon_ahhdl_t			ah;
785 	uint64_t			*desc, *prev;
786 	uint32_t			desc_sz;
787 	uint32_t			signaled_dbd, solicited;
788 	uint32_t			head, tail, next_tail, qsize_msk;
789 	uint32_t			hdrmwqes;
790 	uint_t				currindx, wrindx, numremain;
791 	uint_t				chainlen;
792 	uint_t				posted_cnt, maxstat;
793 	uint_t				total_posted;
794 	int				status;
795 	uint32_t			nopcode, fence, immed_data = 0;
796 	uint32_t			prev_nopcode;
797 	uint_t				qp_state;
798 
799 	/* initialize the FMA retry loop */
800 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
801 
802 	/*
803 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
804 	 * clients to post to QP memory that is accessible directly by the
805 	 * user.  If the QP memory is user accessible, then return an error.
806 	 */
807 	if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
808 		return (IBT_QP_HDL_INVALID);
809 	}
810 
811 	mutex_enter(&qp->qp_sq_lock);
812 
813 	/*
814 	 * Check QP state.  Cannot post Send requests from the "Reset",
815 	 * "Init", or "RTR" states
816 	 */
817 	qp_state = qp->qp_state_for_post_send;
818 	if ((qp_state == HERMON_QP_RESET) ||
819 	    (qp_state == HERMON_QP_INIT) ||
820 	    (qp_state == HERMON_QP_RTR)) {
821 		mutex_exit(&qp->qp_sq_lock);
822 		return (IBT_QP_STATE_INVALID);
823 	}
824 
825 	if (qp->qp_is_special)
826 		goto post_many;
827 
828 	/* Use these optimized functions most of the time */
829 	if (qp->qp_type == IBT_UD_RQP) {
830 		return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
831 	}
832 
833 	if (qp->qp_serv_type == HERMON_QP_RC) {
834 		return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
835 	}
836 
837 	if (qp->qp_serv_type == HERMON_QP_UC)
838 		goto post_many;
839 
840 	mutex_exit(&qp->qp_sq_lock);
841 	return (IBT_QP_SRV_TYPE_INVALID);
842 
843 post_many:
844 	/* general loop for non-optimized posting */
845 
846 	/* Save away some initial QP state */
847 	wq = qp->qp_sq_wqhdr;
848 	qsize_msk = wq->wq_mask;
849 	tail	  = wq->wq_tail;
850 	head	  = wq->wq_head;
851 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
852 
853 	/* Initialize posted_cnt */
854 	posted_cnt = 0;
855 	total_posted = 0;
856 
857 	/*
858 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
859 	 * request and build a Send WQE.  NOTE:  Because we are potentially
860 	 * building a chain of WQEs to post, we want to build them all first,
861 	 * and set the valid (HW Ownership) bit on all but the first.
862 	 * However, we do not want to validate the first one until the
863 	 * entire chain of WQEs has been built.  Then, at the end,
864 	 * we set the valid bit in the first, flush if needed, and as a last
865 	 * step ring the appropriate doorbell.  NOTE: the doorbell ring may
866 	 * NOT be needed if the HCA is already processing, but the doorbell
867 	 * ring will be done regardless. NOTE ALSO:  It is possible for
868 	 * more Work Requests to be posted than the HW will support at one
869 	 * shot.  If this happens, we need to be able to post and ring
870 	 * several chains here until the entire request is complete.
871 	 * NOTE ALSO:  the term "chain" is used to differentiate this from
872 	 * the Work Request List passed in, and because that is the
873 	 * terminology from previous generations of HCA - but the WQEs are
874 	 * not, in fact, chained together for Hermon.
875 	 */
876 
877 	wrindx = 0;
878 	numremain = num_wr;
879 	status	  = DDI_SUCCESS;
880 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
881 		/*
882 		 * For the first WQE on a new chain we need "prev" to point
883 		 * to the current descriptor.
884 		 */
885 		prev = HERMON_QP_SQ_ENTRY(qp, tail);
886 
887 		/*
888 		 * Break the request up into lists that are less than or
889 		 * equal to the maximum number of WQEs that can be posted
890 		 * per doorbell ring - 256 currently
891 		 */
892 		chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
893 		    HERMON_QP_MAXDESC_PER_DB : numremain;
894 		numremain -= chainlen;
895 
896 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
897 			/*
898 			 * Check for "queue full" condition.  If the queue
899 			 * is already full, then no more WQEs can be posted.
900 			 * So break out, ring a doorbell (if necessary) and
901 			 * return an error
902 			 */
903 			if (wq->wq_full != 0) {
904 				status = IBT_QP_FULL;
905 				break;
906 			}
907 
908 			/*
909 			 * Increment the "tail index". Check for "queue
910 			 * full" condition incl. headroom.  If we detect that
911 			 * the current work request is going to fill the work
912 			 * queue, then we mark this condition and continue.
913 			 * A ">=" check is not needed, because going
914 			 * one-by-one we have to hit it exactly sooner or later.
915 			 */
916 
917 			next_tail = (tail + 1) & qsize_msk;
918 			if (((tail + hdrmwqes) & qsize_msk) == head) {
919 				wq->wq_full = 1;
920 			}
921 
922 			/*
923 			 * Get the address of the location where the next
924 			 * Send WQE should be built
925 			 */
926 			desc = HERMON_QP_SQ_ENTRY(qp, tail);
927 			/*
928 			 * Call hermon_wqe_send_build() to build the WQE
929 			 * at the given address.  This routine uses the
930 			 * information in the ibt_send_wr_t list (wr[]) and
931 			 * returns the size of the WQE in "desc_sz".
932 			 */
933 			status = hermon_wqe_send_build(state, qp,
934 			    &wr[wrindx], desc, &desc_sz);
935 			if (status != DDI_SUCCESS) {
936 				break;
937 			}
938 
939 			/*
940 			 * Now, build the Ctrl Segment based on
941 			 * what was just done
942 			 */
943 			curr_wr = &wr[wrindx];
944 
945 			switch (curr_wr->wr_opcode) {
946 			case IBT_WRC_RDMAW:
947 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
948 					nopcode =
949 					    HERMON_WQE_SEND_NOPCODE_RDMAWI;
950 					immed_data =
951 					    hermon_wr_get_immediate(curr_wr);
952 				} else {
953 					nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
954 				}
955 				break;
956 
957 			case IBT_WRC_SEND:
958 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
959 					nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
960 					immed_data =
961 					    hermon_wr_get_immediate(curr_wr);
962 				} else {
963 					nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
964 				}
965 				break;
966 
967 			case IBT_WRC_SEND_LSO:
968 				nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
969 				break;
970 
971 			case IBT_WRC_RDMAR:
972 				nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
973 				break;
974 
975 			case IBT_WRC_CSWAP:
976 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
977 				break;
978 
979 			case IBT_WRC_FADD:
980 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
981 				break;
982 
983 			case IBT_WRC_BIND:
984 				nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
985 				break;
986 			}
987 
988 			fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
989 
990 			/*
991 			 * now, build up the control segment, leaving the
992 			 * owner bit as it is
993 			 */
994 
995 			if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
996 			    (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
997 				signaled_dbd = 0xC;
998 			} else {
999 				signaled_dbd = 0;
1000 			}
1001 			if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
1002 				solicited = 0x2;
1003 			else
1004 				solicited = 0;
1005 
1006 			if (qp->qp_is_special) {
1007 				/* Ensure correctness, set the ReRead bit */
1008 				nopcode |= (1 << 6);
1009 				ah = (hermon_ahhdl_t)
1010 				    curr_wr->wr.ud.udwr_dest->ud_ah;
1011 				mutex_enter(&ah->ah_lock);
1012 				maxstat = ah->ah_udav->max_stat_rate;
1013 				HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
1014 				    signaled_dbd, maxstat, ah->ah_udav->rlid,
1015 				    qp, ah->ah_udav->sl);
1016 				mutex_exit(&ah->ah_lock);
1017 			} else {
1018 				HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
1019 				    fence, immed_data, solicited,
1020 				    signaled_dbd, 0, qp, 0, 0);
1021 			}
1022 			wq->wq_wrid[tail] = curr_wr->wr_id;
1023 
1024 			/*
1025 			 * If this is not the first descriptor on the current
1026 			 * chain, then set the ownership bit.
1027 			 */
1028 			if (currindx != 0) {		/* not the first */
1029 				membar_producer();
1030 				HERMON_SET_SEND_WQE_OWNER(qp,
1031 				    (uint32_t *)desc, nopcode);
1032 			} else
1033 				prev_nopcode = nopcode;
1034 
1035 			/*
1036 			 * Update the current "tail index" and increment
1037 			 * "posted_cnt"
1038 			 */
1039 			tail = next_tail;
1040 			posted_cnt++;
1041 		}
1042 
1043 		/*
1044 		 * If we reach here and there are one or more WQEs which have
1045 		 * been successfully built as a chain, we have to finish up
1046 		 * and prepare them for writing to the HW
1047 		 * The steps are:
1048 		 * 	1. do the headroom fixup
1049 		 *	2. add in the size of the headroom for the sync
1050 		 *	3. write the owner bit for the first WQE
1051 		 *	4. sync them
1052 		 *	5. fix up the structures
1053 		 *	6. hit the doorbell in UAR
1054 		 */
1055 		if (posted_cnt != 0) {
1056 			ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
1057 
1058 			/* do the invalidate of the headroom */
1059 
1060 			hermon_wqe_headroom(tail, qp);
1061 
1062 			/* Update some of the state in the QP */
1063 			wq->wq_tail = tail;
1064 			total_posted += posted_cnt;
1065 			posted_cnt = 0;
1066 
1067 			membar_producer();
1068 
1069 			/*
1070 			 * Now set the ownership bit of the first
1071 			 * one in the chain
1072 			 */
1073 			HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
1074 			    prev_nopcode);
1075 
1076 			/* the FMA retry loop starts for the Hermon doorbell. */
1077 			hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
1078 			    fm_status, fm_test);
1079 
1080 			HERMON_UAR_DOORBELL(state, uarhdl,
1081 			    (uint64_t *)(void *)&state->hs_uar->send,
1082 			    (uint64_t)qp->qp_ring);
1083 
1084 			/* the FMA retry loop ends. */
1085 			hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
1086 			    fm_status, fm_test);
1087 		}
1088 	}
1089 
1090 	/*
1091 	 * Update the "num_posted" return value (if necessary).
1092 	 * Then drop the lock and return the status.
1093 	 */
1094 	if (num_posted != NULL) {
1095 		*num_posted = total_posted;
1096 	}
1097 	mutex_exit(&qp->qp_sq_lock);
1098 	return (status);
1099 
1100 pio_error:
1101 	mutex_exit(&qp->qp_sq_lock);
1102 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1103 	return (ibc_get_ci_failure(0));
1104 }
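
/*
 * Caller sketch (hypothetical, for illustration only):  a single
 * signaled RC Send with one data segment arrives here, through the
 * IBTF/IBC post-send entry point, looking roughly like this:
 *
 *	ibt_wr_ds_t	sgl;
 *	ibt_send_wr_t	wr;
 *	uint_t		posted;
 *	int		status;
 *
 *	sgl.ds_va    = buf_va;			(registered buffer VA)
 *	sgl.ds_key   = mr_lkey;			(lkey from registration)
 *	sgl.ds_len   = xfer_len;
 *	wr.wr_id     = my_wrid;			(returned in the CQE)
 *	wr.wr_flags  = IBT_WR_SEND_SIGNAL;
 *	wr.wr_trans  = IBT_RC_SRV;
 *	wr.wr_opcode = IBT_WRC_SEND;
 *	wr.wr_nds    = 1;
 *	wr.wr_sgl    = &sgl;
 *	status = hermon_post_send(state, qphdl, &wr, 1, &posted);
 *
 * "buf_va", "mr_lkey", "xfer_len", and "my_wrid" are stand-ins for
 * values the client obtained elsewhere (registration, allocation).
 */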
1105 
1106 
1107 /*
1108  * hermon_post_recv()
1109  *    Context: Can be called from interrupt or base context.
1110  */
1111 int
1112 hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
1113     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1114 {
1115 	uint64_t			*desc;
1116 	hermon_workq_hdr_t		*wq;
1117 	uint32_t			head, tail, next_tail, qsize_msk;
1118 	uint_t				wrindx;
1119 	uint_t				posted_cnt;
1120 	int				status;
1121 
1122 	/*
1123 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
1124 	 * clients to post to QP memory that is accessible directly by the
1125 	 * user.  If the QP memory is user accessible, then return an error.
1126 	 */
1127 	if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
1128 		return (IBT_QP_HDL_INVALID);
1129 	}
1130 
1131 	/* Initialize posted_cnt */
1132 	posted_cnt = 0;
1133 
1134 	mutex_enter(&qp->qp_lock);
1135 
1136 	/*
1137 	 * Check if QP is associated with an SRQ
1138 	 */
1139 	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
1140 		mutex_exit(&qp->qp_lock);
1141 		return (IBT_SRQ_IN_USE);
1142 	}
1143 
1144 	/*
1145 	 * Check QP state.  Cannot post Recv requests from the "Reset" state
1146 	 */
1147 	if (qp->qp_state == HERMON_QP_RESET) {
1148 		mutex_exit(&qp->qp_lock);
1149 		return (IBT_QP_STATE_INVALID);
1150 	}
1151 
1152 	/* Check that work request transport type is valid */
1153 	if ((qp->qp_type != IBT_UD_RQP) &&
1154 	    (qp->qp_serv_type != HERMON_QP_RC) &&
1155 	    (qp->qp_serv_type != HERMON_QP_UC)) {
1156 		mutex_exit(&qp->qp_lock);
1157 		return (IBT_QP_SRV_TYPE_INVALID);
1158 	}
1159 
1160 	/*
1161 	 * A membar_consumer() would normally be needed here to order
1162 	 * reads of the WRID list, but the mutex_enter() above already
1163 	 * has the same effect.
1164 	 */
1165 
1166 	/* Save away some initial QP state */
1167 	wq = qp->qp_rq_wqhdr;
1168 	qsize_msk = wq->wq_mask;
1169 	tail	  = wq->wq_tail;
1170 	head	  = wq->wq_head;
1171 
1172 	wrindx = 0;
1173 	status	  = DDI_SUCCESS;
1174 
1175 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1176 		if (wq->wq_full != 0) {
1177 			status = IBT_QP_FULL;
1178 			break;
1179 		}
1180 		next_tail = (tail + 1) & qsize_msk;
1181 		if (next_tail == head) {
1182 			wq->wq_full = 1;
1183 		}
1184 		desc = HERMON_QP_RQ_ENTRY(qp, tail);
1185 		status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
1186 		if (status != DDI_SUCCESS) {
1187 			break;
1188 		}
1189 
1190 		wq->wq_wrid[tail] = wr[wrindx].wr_id;
1191 		qp->qp_rq_wqecntr++;
1192 
1193 		tail = next_tail;
1194 		posted_cnt++;
1195 	}
1196 
1197 	if (posted_cnt != 0) {
1198 
1199 		wq->wq_tail = tail;
1200 
1201 		membar_producer();	/* ensure wrids are visible */
1202 
1203 		/* Update the doorbell record w/ wqecntr */
1204 		HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
1205 		    qp->qp_rq_wqecntr & 0xFFFF);
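
		/*
		 * Unlike the send side, the receive queue is advertised
		 * to the HCA through a doorbell record in memory (the
		 * low 16 bits of the WQE counter) rather than through a
		 * UAR register write; the HCA reads the record when it
		 * needs more receive WQEs.
		 */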
1206 	}
1207 
1208 	if (num_posted != NULL) {
1209 		*num_posted = posted_cnt;
1210 	}
1211 
1212 
1213 	mutex_exit(&qp->qp_lock);
1214 	return (status);
1215 }
1216 
1217 /*
1218  * hermon_post_srq()
1219  *    Context: Can be called from interrupt or base context.
1220  */
1221 int
1222 hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
1223     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1224 {
1225 	uint64_t			*desc;
1226 	hermon_workq_hdr_t		*wq;
1227 	uint_t				indx, wrindx;
1228 	uint_t				posted_cnt;
1229 	int				status;
1230 
1231 	mutex_enter(&srq->srq_lock);
1232 
1233 	/*
1234 	 * Check for user-mappable SRQ memory.  Note:  We do not allow kernel
1235 	 * clients to post to SRQ memory that is accessible directly by the
1236 	 * user.  If the SRQ memory is user accessible, then return an error.
1237 	 */
1238 	if (srq->srq_is_umap) {
1239 		mutex_exit(&srq->srq_lock);
1240 		return (IBT_SRQ_HDL_INVALID);
1241 	}
1242 
1243 	/*
1244 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
1245 	 * Check SRQ state.  Cannot post Recv requests when SRQ is in error
1246 	if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
1247 		mutex_exit(&srq->srq_lock);
1248 		return (IBT_QP_STATE_INVALID);
1249 	}
1250 
1251 	status = DDI_SUCCESS;
1252 	posted_cnt = 0;
1253 	wq = srq->srq_wq_wqhdr;
1254 	indx = wq->wq_head;
1255 
1256 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1257 
1258 		if (indx == wq->wq_tail) {
1259 			status = IBT_QP_FULL;
1260 			break;
1261 		}
1262 		desc = HERMON_SRQ_WQE_ADDR(srq, indx);
1263 
1264 		wq->wq_wrid[indx] = wr[wrindx].wr_id;
1265 
1266 		status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
1267 		if (status != DDI_SUCCESS) {
1268 			break;
1269 		}
1270 
1271 		posted_cnt++;
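
		/*
		 * SRQ descriptors are linked on a free list rather than
		 * used as a simple ring: the second big-endian 16-bit
		 * word of each WQE holds the index of the next free
		 * entry, which becomes the new head.
		 */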
1272 		indx = htons(((uint16_t *)desc)[1]);
1273 		wq->wq_head = indx;
1274 	}
1275 
1276 	if (posted_cnt != 0) {
1277 
1278 		srq->srq_wq_wqecntr += posted_cnt;
1279 
1280 		membar_producer();	/* ensure wrids are visible */
1281 
1282 		/* Ring the doorbell w/ wqecntr */
1283 		HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
1284 		    srq->srq_wq_wqecntr & 0xFFFF);
1285 	}
1286 
1287 	if (num_posted != NULL) {
1288 		*num_posted = posted_cnt;
1289 	}
1290 
1291 	mutex_exit(&srq->srq_lock);
1292 	return (status);
1293 }
1294 
1295 
1296 /*
1297  * hermon_wqe_send_build()
1298  *    Context: Can be called from interrupt or base context.
1299  */
1300 static int
1301 hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
1302     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1303 {
1304 	hermon_hw_snd_wqe_ud_t		*ud;
1305 	hermon_hw_snd_wqe_remaddr_t	*rc;
1306 	hermon_hw_snd_wqe_atomic_t	*at;
1307 	hermon_hw_snd_wqe_remaddr_t	*uc;
1308 	hermon_hw_snd_wqe_bind_t	*bn;
1309 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
1310 	ibt_ud_dest_t			*dest;
1311 	ibt_wr_ds_t			*sgl;
1312 	hermon_ahhdl_t			ah;
1313 	uint32_t			nds;
1314 	int				i, j, last_ds, num_ds, status;
1315 	int				tmpsize;
1316 
1317 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1318 
1319 	/* Initialize the information for the Data Segments */
1320 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1321 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
1322 	nds = wr->wr_nds;
1323 	sgl = wr->wr_sgl;
1324 	num_ds = 0;
1325 	i = 0;
1326 
1327 	/*
1328 	 * Building a Send WQE depends first and foremost on the transport
1329 	 * type of the Work Request (i.e. UD, RC, or UC)
1330 	 */
1331 	switch (wr->wr_trans) {
1332 	case IBT_UD_SRV:
1333 		/* Ensure that work request transport type matches QP type */
1334 		if (qp->qp_serv_type != HERMON_QP_UD) {
1335 			return (IBT_QP_SRV_TYPE_INVALID);
1336 		}
1337 
1338 		/*
1339 		 * Validate the operation type.  For UD requests, only the
1340 		 * "Send" and "Send LSO" operations are valid.
1341 		 */
1342 		if (wr->wr_opcode != IBT_WRC_SEND &&
1343 		    wr->wr_opcode != IBT_WRC_SEND_LSO) {
1344 			return (IBT_QP_OP_TYPE_INVALID);
1345 		}
1346 
1347 		/*
1348 		 * If this is a Special QP (QP0 or QP1), then we need to
1349 		 * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
1350 		 * and return whatever status it returns
1351 		 */
1352 		if (qp->qp_is_special) {
1353 			if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1354 				return (IBT_QP_OP_TYPE_INVALID);
1355 			}
1356 			status = hermon_wqe_mlx_build(state, qp,
1357 			    wr, desc, size);
1358 			return (status);
1359 		}
1360 
1361 		/*
1362 		 * Otherwise, if this is a normal UD Send request, then fill
1363 		 * all the fields in the Hermon UD header for the WQE.  Note:
1364 		 * to do this we'll need to extract some information from the
1365 		 * Address Handle passed with the work request.
1366 		 */
1367 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1368 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
1369 		if (wr->wr_opcode == IBT_WRC_SEND) {
1370 			dest = wr->wr.ud.udwr_dest;
1371 		} else {
1372 			dest = wr->wr.ud_lso.lso_ud_dest;
1373 		}
1374 		ah = (hermon_ahhdl_t)dest->ud_ah;
1375 		if (ah == NULL) {
1376 			return (IBT_AH_HDL_INVALID);
1377 		}
1378 
1379 		/*
1380 		 * Build the Unreliable Datagram Segment for the WQE, using
1381 		 * the information from the address handle and the work
1382 		 * request.
1383 		 */
1384 		/* mutex_enter(&ah->ah_lock); */
1385 		if (wr->wr_opcode == IBT_WRC_SEND) {
1386 			HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
1387 		} else {	/* IBT_WRC_SEND_LSO */
1388 			HERMON_WQE_BUILD_UD(qp, ud, ah,
1389 			    wr->wr.ud_lso.lso_ud_dest);
1390 		}
1391 		/* mutex_exit(&ah->ah_lock); */
1392 
1393 		/* Update "ds" for filling in Data Segments (below) */
1394 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
1395 		    sizeof (hermon_hw_snd_wqe_ud_t));
1396 
1397 		if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1398 			int total_len;
1399 
1400 			total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
1401 			if ((uintptr_t)ds + total_len + (nds * 16) >
1402 			    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
1403 				return (IBT_QP_SGL_LEN_INVALID);
1404 
1405 			bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
1406 			    wr->wr.ud_lso.lso_hdr_sz);
1407 			old_ds = ds;
1408 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
1409 			for (; i < nds; i++) {
1410 				if (sgl[i].ds_len == 0)
1411 					continue;
1412 				HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
1413 				    &sgl[i]);
1414 				num_ds++;
1415 				i++;
1416 				break;
1417 			}
1418 			membar_producer();
1419 			HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
1420 			    wr->wr.ud_lso.lso_hdr_sz);
1421 		}
1422 
1423 		break;
1424 
1425 	case IBT_RC_SRV:
1426 		/* Ensure that work request transport type matches QP type */
1427 		if (qp->qp_serv_type != HERMON_QP_RC) {
1428 			return (IBT_QP_SRV_TYPE_INVALID);
1429 		}
1430 
1431 		/*
1432 		 * Validate the operation type.  For RC requests, we allow
1433 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1434 		 * operations, and memory window "Bind"
1435 		 */
1436 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1437 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1438 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1439 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1440 		    (wr->wr_opcode != IBT_WRC_FADD) &&
1441 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1442 			return (IBT_QP_OP_TYPE_INVALID);
1443 		}
1444 
1445 		/*
1446 		 * If this is a Send request, then all we need to do is break
1447 		 * out here and begin the Data Segment processing below
1448 		 */
1449 		if (wr->wr_opcode == IBT_WRC_SEND) {
1450 			break;
1451 		}
1452 
1453 		/*
1454 		 * If this is an RDMA Read or RDMA Write request, then fill
1455 		 * in the "Remote Address" header fields.
1456 		 */
1457 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1458 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1459 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1460 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1461 
1462 			/*
1463 			 * Build the Remote Address Segment for the WQE, using
1464 			 * the information from the RC work request.
1465 			 */
1466 			HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1467 
1468 			/* Update "ds" for filling in Data Segments (below) */
1469 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
1470 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1471 			break;
1472 		}
1473 
1474 		/*
1475 		 * If this is one of the Atomic type operations (i.e
1476 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1477 		 * Address" header fields and the "Atomic" header fields.
1478 		 */
1479 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1480 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1481 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1482 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1483 			at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1484 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1485 
1486 			/*
1487 			 * Build the Remote Address and Atomic Segments for
1488 			 * the WQE, using the information from the RC Atomic
1489 			 * work request.
1490 			 */
1491 			HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1492 			HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1493 
1494 			/* Update "ds" for filling in Data Segments (below) */
1495 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
1496 			    sizeof (hermon_hw_snd_wqe_atomic_t));
1497 
1498 			/*
1499 			 * Update "nds" and "sgl" because Atomic requests have
1500 			 * only a single Data Segment (and they are encoded
1501 			 * somewhat differently in the work request).
1502 			 */
1503 			nds = 1;
1504 			sgl = wr->wr_sgl;
1505 			break;
1506 		}
1507 
1508 		/*
1509 		 * If this is memory window Bind operation, then we call the
1510 		 * hermon_wr_bind_check() routine to validate the request and
1511 		 * to generate the updated RKey.  If this is successful, then
1512 		 * we fill in the WQE's "Bind" header fields.
1513 		 */
1514 		if (wr->wr_opcode == IBT_WRC_BIND) {
1515 			status = hermon_wr_bind_check(state, wr);
1516 			if (status != DDI_SUCCESS) {
1517 				return (status);
1518 			}
1519 
1520 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1521 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1522 
1523 			/*
1524 			 * Build the Bind Memory Window Segments for the WQE,
1525 			 * using the information from the RC Bind memory
1526 			 * window work request.
1527 			 */
1528 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1529 
1530 			/*
1531 			 * Update the "ds" pointer.  Even though the "bind"
1532 			 * operation requires no SGLs, this is necessary to
1533 			 * facilitate the correct descriptor size calculations
1534 			 * (below).
1535 			 */
1536 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1537 			    sizeof (hermon_hw_snd_wqe_bind_t));
1538 			nds = 0;
1539 		}
1540 		break;
1541 
1542 	case IBT_UC_SRV:
1543 		/* Ensure that work request transport type matches QP type */
1544 		if (qp->qp_serv_type != HERMON_QP_UC) {
1545 			return (IBT_QP_SRV_TYPE_INVALID);
1546 		}
1547 
1548 		/*
1549 		 * Validate the operation type.  For UC requests, we only
1550 		 * allow "Send", "RDMA Write", and memory window "Bind".
1551 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1552 		 * operations
1553 		 */
1554 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1555 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1556 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1557 			return (IBT_QP_OP_TYPE_INVALID);
1558 		}
1559 
1560 		/*
1561 		 * If this is a Send request, then all we need to do is break
1562 		 * out here and begin the Data Segment processing below
1563 		 */
1564 		if (wr->wr_opcode == IBT_WRC_SEND) {
1565 			break;
1566 		}
1567 
1568 		/*
1569 		 * If this is an RDMA Write request, then fill in the "Remote
1570 		 * Address" header fields.
1571 		 */
1572 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1573 			uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1574 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1575 
1576 			/*
1577 			 * Build the Remote Address Segment for the WQE, using
1578 			 * the information from the UC work request.
1579 			 */
1580 			HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1581 
1582 			/* Update "ds" for filling in Data Segments (below) */
1583 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
1584 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1585 			break;
1586 		}
1587 
1588 		/*
1589 		 * If this is memory window Bind operation, then we call the
1590 		 * hermon_wr_bind_check() routine to validate the request and
1591 		 * to generate the updated RKey.  If this is successful, then
1592 		 * we fill in the WQE's "Bind" header fields.
1593 		 */
1594 		if (wr->wr_opcode == IBT_WRC_BIND) {
1595 			status = hermon_wr_bind_check(state, wr);
1596 			if (status != DDI_SUCCESS) {
1597 				return (status);
1598 			}
1599 
1600 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1601 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1602 
1603 			/*
1604 			 * Build the Bind Memory Window Segments for the WQE,
1605 			 * using the information from the UC Bind memory
1606 			 * window work request.
1607 			 */
1608 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1609 
1610 			/*
1611 			 * Update the "ds" pointer.  Even though the "bind"
1612 			 * operation requires no SGLs, this is necessary to
1613 			 * facilitate the correct descriptor size calculations
1614 			 * (below).
1615 			 */
1616 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1617 			    sizeof (hermon_hw_snd_wqe_bind_t));
1618 			nds = 0;
1619 		}
1620 		break;
1621 
1622 	default:
1623 		return (IBT_QP_SRV_TYPE_INVALID);
1624 	}
1625 
1626 	/*
1627 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1628 	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1629 	 * Start by checking for a valid number of SGL entries
1630 	 */
1631 	if (nds > qp->qp_sq_sgl) {
1632 		return (IBT_QP_SGL_LEN_INVALID);
1633 	}
1634 
1635 	/*
1636 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1637 	 * segments.  Note: We skip any SGL with zero size because Hermon
1638 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1639 	 * the encoding for zero means a 2GB transfer.
1640 	 */
1641 	for (last_ds = num_ds, j = i; j < nds; j++) {
1642 		if (sgl[j].ds_len != 0)
1643 			last_ds++;	/* real last ds of wqe to fill */
1644 	}
1645 
1646 	/*
1647 	 * Return the size of the descriptor (in 16-byte chunks).
1648 	 * For Hermon, we want them (for now) to be on stride-size
1649 	 * boundaries, which was implicit in Tavor/Arbel.
1650 	 *
1651 	 */
1652 	tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
1653 
1654 	*size = tmpsize >> 0x4;
1655 
1656 	for (j = nds; --j >= i; ) {
1657 		if (sgl[j].ds_len == 0) {
1658 			continue;
1659 		}
1660 
1661 		/*
1662 		 * Fill in the Data Segment(s) for the current WQE, using the
1663 		 * information contained in the scatter-gather list of the
1664 		 * work request.
1665 		 */
1666 		last_ds--;
1667 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
1668 	}
1669 
1670 	return (DDI_SUCCESS);
1671 }
1672 
1673 
1674 
1675 /*
1676  * hermon_wqe_mlx_build()
1677  *    Context: Can be called from interrupt or base context.
1678  */
1679 static int
1680 hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
1681     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1682 {
1683 	hermon_ahhdl_t		ah;
1684 	hermon_hw_udav_t	*udav;
1685 	ib_lrh_hdr_t		*lrh;
1686 	ib_grh_t		*grh;
1687 	ib_bth_hdr_t		*bth;
1688 	ib_deth_hdr_t		*deth;
1689 	hermon_hw_wqe_sgl_t	*ds;
1690 	ibt_wr_ds_t		*sgl;
1691 	uint8_t			*mgmtclass, *hpoint, *hcount;
1692 	uint32_t		nds, offset, pktlen;
1693 	uint32_t		desc_sz;
1694 	int			i, num_ds;
1695 	int			tmpsize;
1696 
1697 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1698 
1699 	/* Initialize the information for the Data Segments */
1700 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1701 	    sizeof (hermon_hw_mlx_wqe_nextctrl_t));
1702 
1703 	/*
1704 	 * Pull the address handle from the work request. The UDAV will
1705 	 * be used to answer some questions about the request.
1706 	 */
1707 	ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1708 	if (ah == NULL) {
1709 		return (IBT_AH_HDL_INVALID);
1710 	}
1711 	mutex_enter(&ah->ah_lock);
1712 	udav = ah->ah_udav;
1713 
1714 	/*
1715 	 * If the request is for QP1 and the destination LID is equal to
1716 	 * the Permissive LID, then return an error.  This combination is
1717 	 * not allowed
1718 	 */
1719 	if ((udav->rlid == IB_LID_PERMISSIVE) &&
1720 	    (qp->qp_is_special == HERMON_QP_GSI)) {
1721 		mutex_exit(&ah->ah_lock);
1722 		return (IBT_AH_HDL_INVALID);
1723 	}
1724 
1725 	/*
1726 	 * Calculate the size of the packet headers, including the GRH
1727 	 * (if necessary)
1728 	 */
1729 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1730 	    sizeof (ib_deth_hdr_t);
1731 	if (udav->grh) {
1732 		desc_sz += sizeof (ib_grh_t);
1733 	}
1734 
1735 	/*
1736 	 * Begin to build the first "inline" data segment for the packet
1737 	 * headers.  Note:  By specifying "inline" we can build the contents
1738 	 * of the MAD packet headers directly into the work queue (as part
1739 	 * of the descriptor).  This has the advantage of both speeding things up
1740 	 * and of not requiring the driver to allocate/register any additional
1741 	 * memory for the packet headers.
1742 	 */
1743 	HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1744 	desc_sz += 4;
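
	/*
	 * The "+ 4" accounts for the dword of inline-segment header
	 * (inline flag plus byte count) written by
	 * HERMON_WQE_BUILD_INLINE above; the packet headers are built
	 * immediately after it, starting at &ds[0] + 4.
	 */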
1745 
1746 	/*
1747 	 * Build Local Route Header (LRH)
1748 	 *    We start here by building the LRH into a temporary location.
1749 	 *    When we have finished we copy the LRH data into the descriptor.
1750 	 *
1751 	 *    Notice that the VL values are hardcoded.  This is not a problem
1752 	 *    because VL15 is decided later based on the value in the MLX
1753 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1754 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1755 	 *    values.  This rule does not hold for loopback packets however
1756 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1757 	 *    that non-QP0 MADs are set up with VL hardcoded to zero below.
1758 	 *
1759 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1760 	 *    (0xFFFF).  This is also not a problem because if the Destination
1761 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1762 	 *    transport "next/ctrl" header will be set to zero and the hardware
1763 	 *    will pull the LID from the value in the port.
1764 	 */
1765 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1766 	pktlen = (desc_sz + 0x100) >> 2;
1767 	HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1768 
1769 	/*
1770 	 * Build Global Route Header (GRH)
1771 	 *    This is only built if necessary as defined by the "grh" bit in
1772 	 *    the address vector.  Note:  We also calculate the offset to the
1773 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1774 	 */
1775 	if (udav->grh) {
1776 		/*
1777 		 * If the request is for QP0, then return an error.  The
1778 		 * combination of global routing (GRH) and QP0 is not allowed.
1779 		 */
1780 		if (qp->qp_is_special == HERMON_QP_SMI) {
1781 			mutex_exit(&ah->ah_lock);
1782 			return (IBT_AH_HDL_INVALID);
1783 		}
1784 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1785 		HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1786 
1787 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1788 	} else {
1789 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1790 	}
1791 	mutex_exit(&ah->ah_lock);
1792 
1793 
1794 	/*
1795 	 * Build Base Transport Header (BTH)
1796 	 *    Notice that the M, PadCnt, and TVer fields are all set
1797 	 *    to zero implicitly.  This is true for all Management Datagram
1798 	 *    (MAD) packets, whether GSI or SMI.
1799 	 */
1800 	HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1801 
1802 	/*
1803 	 * Build Datagram Extended Transport Header (DETH)
1804 	 */
1805 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1806 	HERMON_WQE_BUILD_MLX_DETH(deth, qp);
1807 
1808 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1809 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1810 	ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1811 	nds = wr->wr_nds;
1812 	sgl = wr->wr_sgl;
1813 	num_ds = 0;
1814 
1815 	/*
1816 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1817 	 * values set up above (i.e. the "sgl", "nds", and "ds" pointers).
1818 	 * Start by checking for a valid number of SGL entries.
1819 	 */
1820 	if (nds > qp->qp_sq_sgl) {
1821 		return (IBT_QP_SGL_LEN_INVALID);
1822 	}
1823 
1824 	/*
1825 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1826 	 * segments.  Note: We skip any SGL with zero size because Hermon
1827 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1828 	 * the encoding for zero means a 2GB transfer.  Because of this special
1829 	 * encoding in the hardware, we mask the requested length with
1830 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1831 	 * zero.)
1832 	 */
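	/*
	 * A sketch of the masking described above (hypothetical value; the
	 * actual store happens inside HERMON_WQE_BUILD_DATA_SEG_SEND):
	 *
	 *	byte_cnt = sgl[i].ds_len & HERMON_WQE_SGL_BYTE_CNT_MASK;
	 *
	 * e.g. a 2GB request (ds_len == 0x80000000) is stored as 0, which
	 * the hardware interprets as a 2GB transfer.
	 */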
1833 	mgmtclass = hpoint = hcount = NULL;
1834 	offset = 0;
1835 	for (i = 0; i < nds; i++) {
1836 		if (sgl[i].ds_len == 0) {
1837 			continue;
1838 		}
1839 
1840 		/*
1841 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1842 		 * the information contained in the scatter-gather list of
1843 		 * the work request.
1844 		 */
1845 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
1846 
1847 		/*
1848 		 * Search through the contents of all MADs posted to QP0 to
1849 		 * initialize pointers to the places where Directed Route "hop
1850 		 * pointer", "hop count", and "mgmtclass" would be.  Hermon
1851 		 * needs these updated (i.e. incremented or decremented, as
1852 		 * necessary) by software.
1853 		 */
1854 		if (qp->qp_is_special == HERMON_QP_SMI) {
1855 
1856 			HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1857 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1858 
1859 			HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1860 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1861 
1862 			HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1863 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1864 
1865 			offset += sgl[i].ds_len;
1866 		}
1867 		num_ds++;
1868 	}
1869 
1870 	/*
1871 	 * Hermon's Directed Route MADs need to have the "hop pointer"
1872 	 * incremented/decremented (as necessary) depending on whether it is
1873 	 * currently less than or greater than the "hop count" (i.e. whether
1874 	 * the MAD is a request or a response.)
1875 	 */
1876 	if (qp->qp_is_special == HERMON_QP_SMI) {
1877 		HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1878 		    *hpoint, *hcount);
1879 	}
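	/*
	 * A sketch of the rule the macro above applies (the actual pointer
	 * and count manipulation lives inside the macro):
	 *
	 *	if (hop_pointer < hop_count)
	 *		hop_pointer++;		outbound request
	 *	else
	 *		hop_pointer--;		returning response
	 */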
1880 
1881 	/*
1882 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1883 	 * just like the packet headers above, but it is only four bytes, set
1884 	 * to zero (to indicate that we wish the hardware to generate the ICRC).
1885 	 */
1886 	HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1887 	num_ds++;
1888 
1889 	/*
1890 	 * Return the size of the descriptor (in 16-byte chunks).
1891 	 * For Hermon, we want them (for now) to be on stride size
1892 	 * boundaries, which was implicit in Tavor/Arbel
1893 	 */
1894 	tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
1895 
1896 	*size = tmpsize >> 0x04;
1897 
1898 	return (DDI_SUCCESS);
1899 }
1900 
1901 
1902 
1903 /*
1904  * hermon_wqe_recv_build()
1905  *    Context: Can be called from interrupt or base context.
1906  */
1907 /* ARGSUSED */
1908 static int
1909 hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
1910     ibt_recv_wr_t *wr, uint64_t *desc)
1911 {
1912 	hermon_hw_wqe_sgl_t	*ds;
1913 	int			i, num_ds;
1914 
1915 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1916 
1917 	/*
1918 	 * Fill in the Data Segments (SGL) for the Recv WQE.  We don't need
1919 	 * to reserve space for a ctrl segment (there is none on the Hermon
1920 	 * receive queue), but per the PRM we will need to terminate the
1921 	 * list with an invalid (null) scatter pointer.
1922 	 */
1923 	ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
1924 	num_ds = 0;
1925 
1926 	/* Check for valid number of SGL entries */
1927 	if (wr->wr_nds > qp->qp_rq_sgl) {
1928 		return (IBT_QP_SGL_LEN_INVALID);
1929 	}
1930 
1931 	/*
1932 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1933 	 * segments.  Note: We skip any SGL with zero size because Hermon
1934 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1935 	 * the encoding for zero means a 2GB transfer.  Because of this special
1936 	 * encoding in the hardware, we mask the requested length with
1937 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1938 	 * zero.)
1939 	 */
1940 	for (i = 0; i < wr->wr_nds; i++) {
1941 		if (wr->wr_sgl[i].ds_len == 0) {
1942 			continue;
1943 		}
1944 
1945 		/*
1946 		 * Fill in the Data Segment(s) for the receive WQE, using the
1947 		 * information contained in the scatter-gather list of the
1948 		 * work request.
1949 		 */
1950 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1951 		num_ds++;
1952 	}
1953 
1954 	/* put in the null sgl pointer as well, if needed */
1955 	if (num_ds < qp->qp_rq_sgl) {
1956 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1957 	}
1958 
1959 	return (DDI_SUCCESS);
1960 }
1961 
1962 
1963 
1964 /*
1965  * hermon_wqe_srq_build()
1966  *    Context: Can be called from interrupt or base context.
1967  */
1968 /* ARGSUSED */
1969 static int
1970 hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
1971     ibt_recv_wr_t *wr, uint64_t *desc)
1972 {
1973 	hermon_hw_wqe_sgl_t	*ds;
1974 	int			i, num_ds;
1975 
1976 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1977 
1978 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1979 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1980 	    sizeof (hermon_hw_srq_wqe_next_t));
1981 	num_ds = 0;
1982 
1983 	/* Check for valid number of SGL entries */
1984 	if (wr->wr_nds > srq->srq_wq_sgl) {
1985 		return (IBT_QP_SGL_LEN_INVALID);
1986 	}
1987 
1988 	/*
1989 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1990 	 * segments.  Note: We skip any SGL with zero size because Hermon
1991 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1992 	 * the encoding for zero means a 2GB transfer.  Because of this special
1993 	 * encoding in the hardware, we mask the requested length with
1994 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1995 	 * zero.)
1996 	 */
1997 	for (i = 0; i < wr->wr_nds; i++) {
1998 		if (wr->wr_sgl[i].ds_len == 0) {
1999 			continue;
2000 		}
2001 
2002 		/*
2003 		 * Fill in the Data Segment(s) for the receive WQE, using the
2004 		 * information contained in the scatter-gather list of the
2005 		 * work request.
2006 		 */
2007 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
2008 		num_ds++;
2009 	}
2010 
2011 	/*
2012 	 * put in the null sgl pointer as well, if needed
2013 	 */
2014 	if (num_ds < srq->srq_wq_sgl) {
2015 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
2016 	}
2017 
2018 	return (DDI_SUCCESS);
2019 }
2020 
2021 
2022 /*
2023  * hermon_wr_get_immediate()
2024  *    Context: Can be called from interrupt or base context.
2025  */
2026 static uint32_t
2027 hermon_wr_get_immediate(ibt_send_wr_t *wr)
2028 {
2029 	/*
2030 	 * This routine extracts the "immediate data" from the appropriate
2031 	 * location in the IBTF work request.  Because of the way the
2032 	 * work request structure is defined, the location for this data
2033 	 * depends on the actual work request operation type.
2034 	 */
2035 
2036 	/* For RDMA Write, test if RC or UC */
2037 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
2038 		if (wr->wr_trans == IBT_RC_SRV) {
2039 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
2040 		} else {  /* IBT_UC_SRV */
2041 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
2042 		}
2043 	}
2044 
2045 	/* For Send, test if RC, UD, or UC */
2046 	if (wr->wr_opcode == IBT_WRC_SEND) {
2047 		if (wr->wr_trans == IBT_RC_SRV) {
2048 			return (wr->wr.rc.rcwr.send_immed);
2049 		} else if (wr->wr_trans == IBT_UD_SRV) {
2050 			return (wr->wr.ud.udwr_immed);
2051 		} else {  /* IBT_UC_SRV */
2052 			return (wr->wr.uc.ucwr.send_immed);
2053 		}
2054 	}
2055 
2056 	/*
2057 	 * If any other type of request, then immediate is undefined
2058 	 */
2059 	return (0);
2060 }
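
/*
 * Hypothetical caller sketch for hermon_wr_get_immediate(): a post-send
 * path would typically fetch the immediate data only when the work
 * request flags indicate it is present, e.g.
 *
 *	if (wr->wr_flags & IBT_WR_SEND_IMMED)
 *		immed_data = hermon_wr_get_immediate(wr);
 */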
2061 
2062 /*
2063  * hermon_wqe_headroom()
2064  *	Context: Can be called from interrupt or base context; currently
2065  *	it is only called from base context.
2066  * Routine that fills in the headroom for the Send Queue.
2067  */
2068 
2069 static void
2070 hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
2071 {
2072 	uint32_t	*wqe_start, *wqe_top, *wqe_base, qsize;
2073 	int		hdrmwqes, wqesizebytes, sectperwqe;
2074 	uint32_t	invalue;
2075 	int		i, j;
2076 
2077 	qsize	 = qp->qp_sq_bufsz;
2078 	wqesizebytes = 1 << qp->qp_sq_log_wqesz;
2079 	sectperwqe = wqesizebytes >> 6; 	/* 64 bytes/section */
2080 	hdrmwqes = qp->qp_sq_hdrmwqes;
2081 	wqe_base  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
2082 	wqe_top	  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
2083 	wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
2084 
2085 	for (i = 0; i < hdrmwqes; i++)	{
2086 		for (j = 0; j < sectperwqe; j++) {
2087 			if (j == 0) {		/* 1st section of wqe */
2088 				/* preserve ownership bit */
2089 				invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
2090 				    wqe_start) | 0x7FFFFFFF;
2091 			} else {
2092 				/* or just invalidate it */
2093 				invalue = 0xFFFFFFFF;
2094 			}
2095 			ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
2096 			wqe_start += 16;	/* move 64 bytes */
2097 		}
2098 		if (wqe_start == wqe_top)	/* hit the end of the queue */
2099 			wqe_start = wqe_base;	/* wrap to start */
2100 	}
2101 }
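
/*
 * A sketch of what the loop above writes for a single headroom WQE of,
 * say, 128 bytes (two 64-byte sections); only the first dword of each
 * 64-byte section is touched:
 *
 *	section 0, dword 0:	(old value) | 0x7FFFFFFF  ownership bit kept
 *	section 1, dword 0:	0xFFFFFFFF		  invalidated
 *	all other dwords:	left unchanged
 */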
2102 
2103 /*
2104  * hermon_wr_bind_check()
2105  *    Context: Can be called from interrupt or base context.
2106  */
2107 /* ARGSUSED */
2108 static int
2109 hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
2110 {
2111 	ibt_bind_flags_t	bind_flags;
2112 	uint64_t		vaddr, len;
2113 	uint64_t		reg_start_addr, reg_end_addr;
2114 	hermon_mwhdl_t		mw;
2115 	hermon_mrhdl_t		mr;
2116 	hermon_rsrc_t		*mpt;
2117 	uint32_t		new_rkey;
2118 
2119 	/* Check for a valid Memory Window handle in the WR */
2120 	mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2121 	if (mw == NULL) {
2122 		return (IBT_MW_HDL_INVALID);
2123 	}
2124 
2125 	/* Check for a valid Memory Region handle in the WR */
2126 	mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2127 	if (mr == NULL) {
2128 		return (IBT_MR_HDL_INVALID);
2129 	}
2130 
2131 	mutex_enter(&mr->mr_lock);
2132 	mutex_enter(&mw->mr_lock);
2133 
2134 	/*
2135 	 * Check here to see if the memory region has already been partially
2136 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
2137 	 * If so, this is an error, return failure.
2138 	 */
2139 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2140 		mutex_exit(&mr->mr_lock);
2141 		mutex_exit(&mw->mr_lock);
2142 		return (IBT_MR_HDL_INVALID);
2143 	}
2144 
2145 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2146 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2147 		mutex_exit(&mr->mr_lock);
2148 		mutex_exit(&mw->mr_lock);
2149 		return (IBT_MR_RKEY_INVALID);
2150 	}
2151 
2152 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2153 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2154 		mutex_exit(&mr->mr_lock);
2155 		mutex_exit(&mw->mr_lock);
2156 		return (IBT_MR_LKEY_INVALID);
2157 	}
2158 
2159 	/*
2160 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2161 	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2162 	 */
2163 	len = wr->wr.rc.rcwr.bind->bind_len;
2164 	if (len != 0) {
2165 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2166 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2167 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2168 		    (mr->mr_bindinfo.bi_len - 1);
2169 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2170 			mutex_exit(&mr->mr_lock);
2171 			mutex_exit(&mw->mr_lock);
2172 			return (IBT_MR_VA_INVALID);
2173 		}
2174 		vaddr = (vaddr + len) - 1;
2175 		if (vaddr > reg_end_addr) {
2176 			mutex_exit(&mr->mr_lock);
2177 			mutex_exit(&mw->mr_lock);
2178 			return (IBT_MR_LEN_INVALID);
2179 		}
2180 	}
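	/*
	 * Worked example with hypothetical numbers: for a region registered
	 * at bi_addr = 0x1000 with bi_len = 0x1000, the valid range is
	 * [0x1000, 0x1FFF].  A bind with bind_va = 0x1800 and
	 * bind_len = 0x1000 passes the start check above but fails the
	 * length check, since 0x1800 + 0x1000 - 1 = 0x27FF > 0x1FFF.
	 */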
2181 
2182 	/*
2183 	 * Validate the bind access flags.  Remote Write and Atomic access for
2184 	 * the Memory Window require that Local Write access be set in the
2185 	 * corresponding Memory Region.
2186 	 */
2187 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2188 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2189 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2190 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2191 		mutex_exit(&mr->mr_lock);
2192 		mutex_exit(&mw->mr_lock);
2193 		return (IBT_MR_ACCESS_REQ_INVALID);
2194 	}
2195 
2196 	/* Calculate the new RKey for the Memory Window */
2197 	mpt = mw->mr_mptrsrcp;
2198 	new_rkey = hermon_mr_keycalc(mpt->hr_indx);
2199 	new_rkey = hermon_mr_key_swap(new_rkey);
2200 
2201 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2202 	mw->mr_rkey = new_rkey;
2203 
2204 	mutex_exit(&mr->mr_lock);
2205 	mutex_exit(&mw->mr_lock);
2206 	return (DDI_SUCCESS);
2207 }
2208 
2209 
2210 /*
2211  * hermon_wrid_from_reset_handling()
2212  *    Context: Can be called from interrupt or base context.
2213  */
2214 /* ARGSUSED */
2215 int
2216 hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2217 {
2218 	hermon_workq_hdr_t	*swq, *rwq;
2219 
2220 	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
2221 		return (DDI_SUCCESS);
2222 
2223 #ifdef __lock_lint
2224 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2225 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2226 #else
2227 	/* grab the cq lock(s) to modify the wqavl tree */
2228 	if (qp->qp_rq_cqhdl)
2229 		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2230 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
2231 	    qp->qp_sq_cqhdl != NULL)
2232 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2233 #endif
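	/*
	 * Lock-ordering note: the receive CQ lock is always taken first and
	 * the send CQ lock second, and only when the send CQ is distinct;
	 * the exits at the bottom of this routine release in reverse order.
	 */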
2234 
2235 	/* Chain the newly allocated work queue header to the CQ's list */
2236 	if (qp->qp_sq_cqhdl)
2237 		hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2238 
2239 	swq = qp->qp_sq_wqhdr;
2240 	swq->wq_head = 0;
2241 	swq->wq_tail = 0;
2242 	swq->wq_full = 0;
2243 
2244 	/*
2245 	 * Now we repeat all the above operations for the receive work queue,
2246 	 * or shared receive work queue.
2247 	 *
2248 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2249 	 */
2250 
2251 #ifdef __lock_lint
2252 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2253 #else
2254 	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
2255 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2256 	} else {
2257 		rwq = qp->qp_rq_wqhdr;
2258 		rwq->wq_head = 0;
2259 		rwq->wq_tail = 0;
2260 		rwq->wq_full = 0;
2261 		qp->qp_rq_wqecntr = 0;
2262 	}
2263 #endif
2264 	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2265 
2266 #ifdef __lock_lint
2267 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2268 #else
2269 	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
2270 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2271 	}
2272 #endif
2273 
2274 #ifdef __lock_lint
2275 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2276 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2277 #else
2278 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
2279 	    qp->qp_sq_cqhdl != NULL)
2280 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2281 	if (qp->qp_rq_cqhdl)
2282 		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2283 #endif
2284 	return (DDI_SUCCESS);
2285 }
2286 
2287 
2288 /*
2289  * hermon_wrid_to_reset_handling()
2290  *    Context: Can be called from interrupt or base context.
2291  */
2292 int
2293 hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2294 {
2295 	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
2296 		return (DDI_SUCCESS);
2297 
2298 	/*
2299 	 * If there are unpolled entries in these CQs, they are
2300 	 * polled/flushed.
2301 	 * Grab the CQ lock(s) before manipulating the lists.
2302 	 */
2303 #ifdef __lock_lint
2304 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2305 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2306 #else
2307 	/* grab the cq lock(s) to modify the wqavl tree */
2308 	if (qp->qp_rq_cqhdl)
2309 		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2310 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
2311 	    qp->qp_sq_cqhdl != NULL)
2312 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2313 #endif
2314 
2315 #ifdef __lock_lint
2316 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2317 #else
2318 	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
2319 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2320 	}
2321 #endif
2322 	/*
2323 	 * Flush the entries on the CQ for this QP's QPN.
2324 	 */
2325 	hermon_cq_entries_flush(state, qp);
2326 
2327 #ifdef __lock_lint
2328 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2329 #else
2330 	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
2331 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2332 	}
2333 #endif
2334 
2335 	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2336 	if (qp->qp_sq_cqhdl != NULL)
2337 		hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2338 
2339 #ifdef __lock_lint
2340 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2341 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2342 #else
2343 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
2344 	    qp->qp_sq_cqhdl != NULL)
2345 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2346 	if (qp->qp_rq_cqhdl)
2347 		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2348 #endif
2349 
2350 	return (IBT_SUCCESS);
2351 }
2352 
2353 
2354 /*
2355  * hermon_wrid_get_entry()
2356  *    Context: Can be called from interrupt or base context.
2357  */
2358 uint64_t
2359 hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
2360 {
2361 	hermon_workq_avl_t	*wqa;
2362 	hermon_workq_hdr_t	*wq;
2363 	uint64_t		wrid;
2364 	uint_t			send_or_recv, qpnum;
2365 	uint32_t		indx;
2366 
2367 	/*
2368 	 * Determine whether this CQE is a send or receive completion.
2369 	 */
2370 	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);
2371 
2372 	/* Find the work queue for this QP number (send or receive side) */
2373 	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
2374 	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
2375 	wq = wqa->wqa_wq;
2376 
2377 	/*
2378 	 * Regardless of whether the completion is the result of a "success"
2379 	 * or a "failure", we lock the list of "containers" and attempt to
2380 	 * search for the first matching completion (i.e. the first WR
2381 	 * with a matching WQE addr and size).  Once we find it, we pull out
2382 	 * the "wrid" field and return it (see below).  XXX Note: One possible
2383 	 * future enhancement would be to enable this routine to skip over
2384 	 * any "unsignaled" completions to go directly to the next "signaled"
2385 	 * entry on success.
2386 	 */
2387 	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
2388 	wrid = wq->wq_wrid[indx];
2389 	if (wqa->wqa_srq_en) {
2390 		struct hermon_sw_srq_s	*srq;
2391 		uint64_t		*desc;
2392 
2393 		/* put wqe back on the srq free list */
2394 		srq = wqa->wqa_srq;
2395 		mutex_enter(&srq->srq_lock);
2396 		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
2397 		((uint16_t *)desc)[1] = htons(indx);
2398 		wq->wq_tail = indx;
2399 		mutex_exit(&srq->srq_lock);
2400 	} else {
2401 		wq->wq_head = (indx + 1) & wq->wq_mask;
2402 		wq->wq_full = 0;
2403 	}
2404 
2405 	return (wrid);
2406 }
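
/*
 * A minimal sketch of the WRID lookup performed above, assuming a work
 * queue of 256 entries (wq_mask == 0xFF; hypothetical numbers):
 *
 *	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFF;
 *	wrid = wq->wq_wrid[indx];
 *
 * Because the queue size is a power of two, the mask turns the WQE
 * counter carried in the CQE into a valid array index even after the
 * counter has wrapped.
 */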
2407 
2408 
2409 int
2410 hermon_wrid_workq_compare(const void *p1, const void *p2)
2411 {
2412 	hermon_workq_compare_t	*cmpp;
2413 	hermon_workq_avl_t	*curr;
2414 
2415 	cmpp = (hermon_workq_compare_t *)p1;
2416 	curr = (hermon_workq_avl_t *)p2;
2417 
2418 	if (cmpp->cmp_qpn < curr->wqa_qpn)
2419 		return (-1);
2420 	else if (cmpp->cmp_qpn > curr->wqa_qpn)
2421 		return (+1);
2422 	else if (cmpp->cmp_type < curr->wqa_type)
2423 		return (-1);
2424 	else if (cmpp->cmp_type > curr->wqa_type)
2425 		return (+1);
2426 	else
2427 		return (0);
2428 }
2429 
2430 
2431 /*
2432  * hermon_wrid_wqavl_find()
2433  *    Context: Can be called from interrupt or base context.
2434  */
2435 static hermon_workq_avl_t *
2436 hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2437 {
2438 	hermon_workq_avl_t	*curr;
2439 	hermon_workq_compare_t	cmp;
2440 
2441 	/*
2442 	 * Look up a send or recv work queue entry with a matching QP number
2443 	 * and work queue type in the CQ's AVL tree of work queue entries.
2444 	 * avl_find() returns the matching entry, or NULL if this CQ has no
2445 	 * such work queue attached.
2446 	 */
2447 	cmp.cmp_qpn = qpn;
2448 	cmp.cmp_type = wq_type;
2449 #ifdef __lock_lint
2450 	hermon_wrid_workq_compare(NULL, NULL);
2451 #endif
2452 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
2453 
2454 	return (curr);
2455 }
2456 
2457 
2458 /*
2459  * hermon_wrid_wqhdr_create()
2460  *    Context: Can be called from base context.
2461  */
2462 /* ARGSUSED */
2463 hermon_workq_hdr_t *
2464 hermon_wrid_wqhdr_create(int bufsz)
2465 {
2466 	hermon_workq_hdr_t	*wqhdr;
2467 
2468 	/*
2469 	 * Allocate space for the wqhdr, and an array to record all the wrids.
2470 	 */
2471 	wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
2472 	if (wqhdr == NULL) {
2473 		return (NULL);
2474 	}
2475 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
2476 	wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
2477 	if (wqhdr->wq_wrid == NULL) {
2478 		kmem_free(wqhdr, sizeof (*wqhdr));
2479 		return (NULL);
2480 	}
2481 	wqhdr->wq_size = bufsz;
2482 	wqhdr->wq_mask = bufsz - 1;
2483 
2484 	return (wqhdr);
2485 }
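
/*
 * Note that the wq_mask computation above assumes "bufsz" is a power of
 * two (work queue sizes in this driver are allocated that way).  Sketch
 * with a hypothetical size:
 *
 *	bufsz == 256  ->  wq_mask == 0xFF, so (indx & wq_mask) wraps any
 *	index into [0, 255]
 */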
2486 
2487 void
2488 hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
2489 {
2490 	kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
2491 	kmem_free(wqhdr, sizeof (*wqhdr));
2492 }
2493 
2494 
2495 /*
2496  * hermon_cq_workq_add()
2497  *    Context: Can be called from interrupt or base context.
2498  */
2499 static void
2500 hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2501 {
2502 	hermon_workq_compare_t	cmp;
2503 	avl_index_t		where;
2504 
2505 	cmp.cmp_qpn = wqavl->wqa_qpn;
2506 	cmp.cmp_type = wqavl->wqa_type;
2507 #ifdef __lock_lint
2508 	hermon_wrid_workq_compare(NULL, NULL);
2509 #endif
2510 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
2511 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
2512 }
2513 
2514 
2515 /*
2516  * hermon_cq_workq_remove()
2517  *    Context: Can be called from interrupt or base context.
2518  */
2519 static void
2520 hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2521 {
2522 #ifdef __lock_lint
2523 	hermon_wrid_workq_compare(NULL, NULL);
2524 #endif
2525 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
2526 }
2527