xref: /titanic_51/usr/src/lib/udapl/udapl_tavor/tavor/dapl_tavor_hw.c (revision 9e39c5ba00a55fa05777cc94b148296af305e135)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * This file may contain confidential information of
29  * Mellanox Technologies, Ltd. and should not be distributed in source
30  * form without approval from Sun Legal.
31  */
32 
33 #include "dapl.h"
34 #include "dapl_tavor_hw.h"
35 #include "dapl_tavor_wr.h"
36 #include "dapl_tavor_ibtf_impl.h"
37 
38 /*
39  * Function signatures
40  */
41 extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
42     uint_t, uint_t, dapls_tavor_wrid_entry_t *);
43 extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
44 extern DAPL_OS_LOCK g_tavor_uar_lock;
45 
46 #ifndef	_LP64
47 extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
48 #endif
49 
50 static int dapli_tavor_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
51     uint64_t *, uint_t *);
52 static void dapli_tavor_wqe_send_linknext(ibt_send_wr_t *, uint64_t *,
53     boolean_t, uint32_t, uint_t, uint64_t *, tavor_sw_wqe_dbinfo_t *);
54 static DAT_RETURN dapli_tavor_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
55     uint64_t *, uint_t *);
56 static void dapli_tavor_wqe_recv_linknext(uint64_t *, boolean_t, uint32_t,
57     uint_t, uint64_t *);
58 static int dapli_tavor_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
59     ibt_wc_t *);
60 static int dapli_tavor_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
61     ibt_wc_t *);
62 
63 /* exported to other HCAs */
64 extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
65     uint32_t, uint_t);
66 extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);
67 
68 /*
69  * Note: The 64-bit doorbells need to be written atomically.
70  * In 32-bit libraries we must use a special assembly routine because
71  * compiler-generated code splits the store into two word-sized writes.
72  */
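/*
 * Illustration (not part of the driver): on a 32-bit target a plain
 * 64-bit store such as
 *
 *	((tavor_hw_uar_t *)uar)->cq = HTOBE_64(doorbell);
 *
 * is typically compiled into two separate 32-bit stores, so the
 * device could observe a torn, half-written doorbell.  Hence the
 * lock-protected pair of ordered 32-bit writes on i386 and the
 * atomic assembly routine on other 32-bit targets below.
 */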
73 
74 #if defined(_LP64) || defined(__lint)
75 /* use a macro to ensure inlining on S10 amd64 compiler */
76 #define	dapli_tavor_cq_doorbell(ia_uar, cq_cmd, cqn, cq_param) \
77 	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64( \
78 	    ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) | \
79 	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param)
80 #else
81 
82 /*
83  * dapli_tavor_cq_doorbell()
84  * Takes the specified cq cmd and cq number and rings the cq doorbell
85  */
86 static void
87 dapli_tavor_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
88     uint32_t cq_param)
89 {
90 	uint64_t doorbell;
91 
92 	/* Build the doorbell from the parameters */
93 	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
94 	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
95 
96 	/* Write the doorbell to UAR */
97 #ifdef _LP64
98 	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
99 	/* 32 bit version */
100 #elif defined(i386)
101 	dapl_os_lock(&g_tavor_uar_lock);
102 	/*
103 	 * For 32 bit intel we assign the doorbell in the order
104 	 * prescribed by the Tavor PRM, lower to upper addresses
105 	 */
106 	((tavor_hw_uar32_t *)ia_uar)->cq[0] =
107 	    (uint32_t)HTOBE_32(doorbell >> 32);
108 	((tavor_hw_uar32_t *)ia_uar)->cq[1] =
109 	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
110 	dapl_os_unlock(&g_tavor_uar_lock);
111 #else
112 	dapls_atomic_assign_64(HTOBE_64(doorbell),
113 	    &((tavor_hw_uar_t *)ia_uar)->cq);
114 #endif
115 }
116 #pragma inline(dapli_tavor_cq_doorbell)
117 
118 #endif	/* _LP64 */
119 
120 #if defined(_LP64) || defined(__lint)
121 #define	dapli_tavor_qp_send_doorbell(ia_uar, nda, nds, qpn, fence, nopcode) \
122 	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64( \
123 	    (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) << \
124 	    TAVOR_QPSNDDB_NDA_SHIFT) | \
125 	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) | \
126 	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) | \
127 	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds)
128 #else
129 
130 /*
131  * dapli_tavor_qp_send_doorbell()
132  * Takes the specified next descriptor information, qp number, opcode and
133  * rings the send doorbell
134  */
135 static void
136 dapli_tavor_qp_send_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
137     uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode)
138 {
139 	uint64_t doorbell;
140 
141 	/* Build the doorbell from the parameters */
142 	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
143 	    TAVOR_QPSNDDB_NDA_SHIFT) |
144 	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
145 	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
146 	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
147 
148 	/* Write the doorbell to UAR */
149 #ifdef _LP64
150 	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
151 #else
152 #if defined(i386)
153 	dapl_os_lock(&g_tavor_uar_lock);
154 	/*
155 	 * For 32 bit intel we assign the doorbell in the order
156 	 * prescribed by the Tavor PRM, lower to upper addresses
157 	 */
158 	((tavor_hw_uar32_t *)ia_uar)->send[0] =
159 	    (uint32_t)HTOBE_32(doorbell >> 32);
160 	((tavor_hw_uar32_t *)ia_uar)->send[1] =
161 	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
162 	dapl_os_unlock(&g_tavor_uar_lock);
163 #else
164 	dapls_atomic_assign_64(HTOBE_64(doorbell),
165 	    &((tavor_hw_uar_t *)ia_uar)->send);
166 #endif
167 #endif
168 }
169 #pragma inline(dapli_tavor_qp_send_doorbell)
170 #endif	/* _LP64 */
171 
172 #if defined(_LP64) || defined(__lint)
173 
174 #define	dapli_tavor_qp_recv_doorbell(ia_uar, nda, nds, qpn, credits) \
175 	((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64( \
176 	    (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) << \
177 	    TAVOR_QPRCVDB_NDA_SHIFT) | \
178 	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) | \
179 	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits)
180 #else
181 
182 /*
183  * dapli_tavor_qp_recv_doorbell()
184  * Takes the specified next descriptor information, qp number and
185  * rings the recv doorbell
186  */
187 static void
188 dapli_tavor_qp_recv_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
189     uint32_t nds, uint32_t qpn, uint32_t credits)
190 {
191 	uint64_t doorbell;
192 
193 	/* Build the doorbell from the parameters */
194 	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
195 	    TAVOR_QPRCVDB_NDA_SHIFT) |
196 	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
197 	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
198 
199 	/* Write the doorbell to UAR */
200 #ifdef _LP64
201 	((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(doorbell);
202 #else
203 #if defined(i386)
204 	dapl_os_lock(&g_tavor_uar_lock);
205 	/*
206 	 * For 32 bit intel we assign the doorbell in the order
207 	 * prescribed by the Tavor PRM, lower to upper addresses
208 	 */
209 	((tavor_hw_uar32_t *)ia_uar)->recv[0] =
210 	    (uint32_t)HTOBE_32(doorbell >> 32);
211 	((tavor_hw_uar32_t *)ia_uar)->recv[1] =
212 	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
213 	dapl_os_unlock(&g_tavor_uar_lock);
214 #else
215 	dapls_atomic_assign_64(HTOBE_64(doorbell),
216 	    &((tavor_hw_uar_t *)ia_uar)->recv);
217 #endif
218 #endif
219 }
220 #pragma inline(dapli_tavor_qp_recv_doorbell)
221 #endif	/* _LP64 */
222 
223 
224 /*
225  * dapls_tavor_max_inline()
226  * Return the max inline value that should be used.
227  * Env variable DAPL_MAX_INLINE can override the default.
228  * If it's not set (or set to -1), default behavior is used.
229  * If it's zero or negative (except -1), inlining is disabled.
230  */
231 int
232 dapls_tavor_max_inline(void)
233 {
234 	static int max_inline_env = -2;
235 
236 	/* Check the env exactly once, otherwise return previous value. */
237 	if (max_inline_env != -2)
238 		return (max_inline_env);
239 
240 	max_inline_env = dapl_os_get_env_val("DAPL_MAX_INLINE", -1);
241 	if (max_inline_env != -1)
242 		if (max_inline_env <= 0)
243 			max_inline_env = 0;	/* no inlining */
244 	return (max_inline_env);
245 }
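/*
 * Usage sketch (behavior inferred from the checks above and the use in
 * dapls_ib_max_request_iov() below): DAPL_MAX_INLINE=0 disables
 * inlining, DAPL_MAX_INLINE=64 caps inline data at 64 bytes, and
 * leaving it unset (or -1) selects the default heuristic.
 */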
246 
247 /*
248  * dapls_ib_max_request_iov(), a.k.a. max send SGL size.
249  * The send queue's scatter/gather list is used for "inline" data.
250  *
251  * By default, compute reasonable send queue size based on #iovs, #wqes,
252  * max_iovs, and max inline byte count.  If the #wqes is large, then we
253  * limit how much the SGL (space for inline data) can take.  The heuristic
254  * is to increase the memory for the send queue to a maximum of 32KB:
255  *
256  *	< 128 wqes	increase to at most 256 minus header
257  *	< 256 wqes	increase to at most 128 minus header
258  *	>= 256 wqes	use SGL unaltered
259  *
260  * If the env is supplied (max_inline >= 0), use it without checking.
261  */
262 int
263 dapls_ib_max_request_iov(int iovs, int wqes, int max_iovs,
264     int max_inline_bytes)
265 {
266 	int ret_iovs;
267 
268 	if (max_inline_bytes > 0) {
269 		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
270 	} else if (wqes < 128) {
271 		max_inline_bytes = 256 - TAVOR_INLINE_HEADER_SIZE_MAX;
272 		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
273 	} else if (wqes < 256) {
274 		max_inline_bytes = 128 - TAVOR_INLINE_HEADER_SIZE_MAX;
275 		ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
276 	} else {
277 		ret_iovs = iovs;
278 	}
279 
280 	if (ret_iovs > max_iovs)	/* do not exceed max */
281 		ret_iovs = max_iovs;
282 	if (iovs > ret_iovs)		/* never decrease iovs */
283 		ret_iovs = iovs;
284 	return (ret_iovs);
285 }
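/*
 * Worked example (header size hypothetical): with 100 WQEs, a 16-byte
 * tavor_hw_wqe_sgl_t and a header of, say, 32 bytes, max_inline_bytes
 * becomes 256 - 32 = 224, so ret_iovs = 224 / 16 = 14; it is then
 * clamped to max_iovs and raised back to iovs if that would have
 * shrunk the caller's request.
 */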
286 
287 /*
288  * dapli_tavor_wqe_send_build()
289  * Constructs a WQE for a given ibt_send_wr_t
290  */
291 static int
292 dapli_tavor_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
293     uint64_t *addr, uint_t *size)
294 {
295 	tavor_hw_snd_wqe_remaddr_t	*rc;
296 	tavor_hw_snd_wqe_bind_t		*bn;
297 	tavor_hw_wqe_sgl_t		*ds;
298 	ibt_wr_ds_t			*sgl;
299 	uint32_t			nds;
300 	uint32_t			len, total_len;
301 	uint32_t			tavor_num_mpt_mask;
302 	uint32_t			new_rkey;
303 	uint32_t			old_rkey;
304 	int				i, num_ds;
305 	int				max_inline_bytes = -1;
306 
307 	nds = wr->wr_nds;
308 	sgl = wr->wr_sgl;
309 	num_ds = 0;
310 
311 	/*
312 	 * RC is the only supported transport in uDAPL.  For RC requests,
313 	 * we allow "Send", "RDMA Read", "RDMA Write" and "Bind".
314 	 */
315 	switch (wr->wr_opcode) {
316 	case IBT_WRC_SEND:
317 		/*
318 		 * If this is a Send request, then all we need is
319 		 * the Data Segment processing below.
320 		 * Initialize the information for the Data Segments
321 		 */
322 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
323 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
324 		if (qp->qp_sq_inline != 0)
325 			max_inline_bytes =
326 			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
327 		break;
328 	case IBT_WRC_RDMAW:
329 		if (qp->qp_sq_inline != 0)
330 			max_inline_bytes =
331 			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
332 		/* FALLTHROUGH */
333 	case IBT_WRC_RDMAR:
334 		if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
335 			qp->qp_sq_inline = 0;
336 		/*
337 		 * If this is an RDMA Read or RDMA Write request, then fill
338 		 * in the "Remote Address" header fields.
339 		 */
340 		rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
341 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
342 
343 		/*
344 		 * Build the Remote Address Segment for the WQE, using
345 		 * the information from the RC work request.
346 		 */
347 		TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);
348 
349 		/* Update "ds" for filling in Data Segments (below) */
350 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
351 		    sizeof (tavor_hw_snd_wqe_remaddr_t));
352 		break;
353 	case IBT_WRC_BIND:
354 		/*
355 	 * Generate a new R_key:
356 	 * Increment the upper "unconstrained" bits while keeping the
357 	 * lower "constrained" bits the same, since they represent
358 	 * the MPT index.
359 		 */
360 		old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
361 		tavor_num_mpt_mask = (uint32_t)(1 << qp->qp_num_mpt_shift) - 1;
362 		new_rkey = (old_rkey >> qp->qp_num_mpt_shift);
363 		new_rkey++;
364 		new_rkey = ((new_rkey << qp->qp_num_mpt_shift) |
365 		    (old_rkey & tavor_num_mpt_mask));
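		/*
		 * Illustration (values hypothetical): with
		 * qp_num_mpt_shift == 16, old_rkey 0x00120034 gives
		 * mask 0xffff; the upper bits 0x0012 increment to
		 * 0x0013, yielding new_rkey 0x00130034 -- the same
		 * MPT index with a new key generation.
		 */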
366 
367 		wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
368 
369 		bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
370 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
371 
372 		/*
373 		 * Build the Bind Memory Window Segments for the WQE,
374 		 * using the information from the RC Bind memory
375 		 * window work request.
376 		 */
377 		TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);
378 
379 		/*
380 		 * Update the "ds" pointer.  Even though the "bind"
381 		 * operation requires no SGLs, this is necessary to
382 		 * facilitate the correct descriptor size calculations
383 		 * (below).
384 		 */
385 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
386 		    sizeof (tavor_hw_snd_wqe_bind_t));
387 		break;
388 	default:
389 		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
390 		    "dapli_tavor_wqe_send_build: invalid wr_opcode=%d\n",
391 		    wr->wr_opcode);
392 		return (DAT_INTERNAL_ERROR);
393 	}
394 
395 	/*
396 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
397 	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
398 	 * Start by checking for a valid number of SGL entries.
399 	 */
400 	if (nds > qp->qp_sq_sgl) {
401 		return (DAT_INVALID_PARAMETER);
402 	}
403 
404 	/*
405 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
406 	 * segments.  Note: We skip any SGL with zero size because Tavor
407 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
408 	 * the encoding for zero means a 2GB transfer.  Because of this special
409 	 * encoding in the hardware, we mask the requested length with
410 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
411 	 * zero.)
412 	 */
413 
414 	if (max_inline_bytes != -1) {		/* compute total_len */
415 		total_len = 0;
416 		for (i = 0; i < nds; i++)
417 			total_len += sgl[i].ds_len;
418 		if (total_len > max_inline_bytes)
419 			max_inline_bytes = -1;	/* too big, do not "inline" */
420 	}
421 	if (max_inline_bytes != -1) {		/* do "inline" */
422 		uint8_t *dst = (uint8_t *)((uint32_t *)ds + 1);
423 		*(uint32_t *)ds =
424 		    HTOBE_32(total_len | TAVOR_WQE_SGL_INLINE_MASK);
425 		for (i = 0; i < nds; i++) {
426 			if ((len = sgl[i].ds_len) == 0) {
427 				continue;
428 			}
429 			(void) dapl_os_memcpy(dst,
430 			    (void *)(uintptr_t)sgl[i].ds_va, len);
431 			dst += len;
432 		}
433 		/* Return the size of descriptor (in 16-byte chunks) */
434 		*size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
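		/*
		 * e.g. a 70-byte descriptor rounds up to
		 * (70 + 15) >> 4 = 5 sixteen-byte chunks.
		 */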
435 	} else {
436 		for (i = 0; i < nds; i++) {
437 			if (sgl[i].ds_len == 0) {
438 				continue;
439 			}
440 
441 			/*
442 			 * Fill in the Data Segment(s) for the current WQE,
443 			 * using the information contained in the
444 			 * scatter-gather list of the work request.
445 			 */
446 			TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
447 			num_ds++;
448 		}
449 
450 		/* Return the size of descriptor (in 16-byte chunks) */
451 		*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
452 	}
453 
454 	return (DAT_SUCCESS);
455 }
456 
457 /*
458  * dapli_tavor_wqe_send_linknext()
459  * Takes a WQE and links it to the prev WQE chain
460  */
461 static void
462 dapli_tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, uint64_t *curr_addr,
463     boolean_t ns, uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
464     tavor_sw_wqe_dbinfo_t *dbinfo)
465 {
466 	uint64_t	next, ctrl;
467 	uint32_t	nopcode, fence;
468 
469 	next = 0;
470 	ctrl = 0;
471 
472 	/* Set the "c" (i.e. "signaled") bit appropriately */
473 	if (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
474 		ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
475 	}
476 
477 	/* Set the "s" (i.e. "solicited") bit appropriately */
478 	if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
479 		ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
480 	}
481 	/* Set the "e" (i.e. "event") bit if notification is needed */
482 	if (!ns) {
483 		ctrl = ctrl | TAVOR_WQE_RCV_EVENT_MASK;
484 	}
485 
486 	/*
487 	 * The "i" bit is unused since uDAPL doesn't support
488 	 * immediate data.
489 	 */
490 
491 	/* initialize the ctrl and next fields of the current descriptor */
492 	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);
493 
494 	/*
495 	 * Calculate the "next" field of the prev descriptor.  This amounts
496 	 * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
497 	 * fields (see tavor_hw.h for more).
498 	 */
499 
500 	/*
501 	 * Determine the value for the Tavor WQE "nopcode" field
502 	 * by using the IBTF opcode from the work request
503 	 */
504 	switch (curr_wr->wr_opcode) {
505 	case IBT_WRC_RDMAW:
506 		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
507 		break;
508 
509 	case IBT_WRC_SEND:
510 		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
511 		break;
512 
513 	case IBT_WRC_RDMAR:
514 		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
515 		break;
516 
517 	case IBT_WRC_BIND:
518 		nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
519 		break;
520 	default:
521 		/* Unsupported opcodes in UDAPL */
522 		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
523 		    "dapli_tavor_wqe_send_linknext: invalid wr_opcode=%d\n",
524 		    curr_wr->wr_opcode);
525 		return;
526 	}
527 
528 	next  = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
529 	next  = next | ((uint64_t)nopcode << 32);
530 	fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
531 	if (fence) {
532 		next = next | TAVOR_WQE_SEND_FENCE_MASK;
533 	}
534 	next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
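	/*
	 * At this point "next" carries the next-WQE address and nopcode
	 * in its upper word, plus the fence flag and the descriptor size
	 * (NDS) in its low-order bits, per the Tavor PRM next/ctrl layout.
	 */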
535 
536 	/*
537 	 * A send queue doorbell will be rung for the next
538 	 * WQE on the chain, set the current WQE's "dbd" bit.
539 	 * Note: We also update the "dbinfo" structure here to pass
540 	 * back information about what should (later) be included
541 	 * in the send queue doorbell.
542 	 */
543 	next = next | TAVOR_WQE_DBD_MASK;
544 	dbinfo->db_nopcode = nopcode;
545 	dbinfo->db_fence   = fence;
546 
547 	/*
548 	 * Send queue doorbell will be rung for the next WQE on
549 	 * the chain, update the prev WQE's "next" field and return.
550 	 */
551 	if (prev_addr != NULL) {
552 		TAVOR_WQE_LINKFIRST(prev_addr, next);
553 	}
554 }
555 
556 
557 /*
558  * dapli_tavor_wqe_recv_build()
559  * Builds the recv WQE for a given ibt_recv_wr_t
560  */
561 static DAT_RETURN
562 dapli_tavor_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
563     uint64_t *addr, uint_t *size)
564 {
565 	tavor_hw_wqe_sgl_t	*ds;
566 	int			i;
567 	int			num_ds;
568 
569 	/* Fill in the Data Segments (SGL) for the Recv WQE */
570 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
571 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
572 	num_ds = 0;
573 
574 	/* Check for valid number of SGL entries */
575 	if (wr->wr_nds > qp->qp_rq_sgl) {
576 		return (DAT_INVALID_PARAMETER);
577 	}
578 
579 	/*
580 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
581 	 * segments.  Note: We skip any SGL with zero size because Tavor
582 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
583 	 * the encoding for zero means a 2GB transfer.  Because of this special
584 	 * encoding in the hardware, we mask the requested length with
585 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
586 	 * zero.)
587 	 */
588 	for (i = 0; i < wr->wr_nds; i++) {
589 		if (wr->wr_sgl[i].ds_len == 0) {
590 			continue;
591 		}
592 
593 		/*
594 		 * Fill in the Data Segment(s) for the receive WQE, using the
595 		 * information contained in the scatter-gather list of the
596 		 * work request.
597 		 */
598 		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
599 		num_ds++;
600 	}
601 
602 	/* Return the size of descriptor (in 16-byte chunks) */
603 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
604 
605 	return (DAT_SUCCESS);
606 }
607 
608 
609 /*
610  * dapli_tavor_wqe_recv_linknext()
611  * Links a recv WQE to the prev chain
612  */
613 static void
614 dapli_tavor_wqe_recv_linknext(uint64_t *curr_addr, boolean_t ns,
615     uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr)
616 {
617 	uint64_t	next;
618 	uint64_t	ctrl = 0;
619 
620 	/*
621 	 * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
622 	 * at a time).  If there is no next descriptor (i.e. if the current
623 	 * descriptor is the last WQE on the chain), then set "next" field
624 	 * to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
625 	 * requires the "dbd" bit to be set to one for all Recv WQEs.
626 	 * In either case, we must add a single bit in the "reserved" field
627 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
628 	 * workaround for a known Tavor errata that can cause Recv WQEs with
629 	 * zero in the NDA field to behave improperly.
630 	 *
631 	 * If notification suppression is not desired then we set
632 	 * the "E" bit in the ctrl field.
633 	 */
634 
635 	next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
636 	if (!ns) { /* notification needed - so set the "E" bit */
637 		ctrl = TAVOR_WQE_RCV_EVENT_MASK;
638 	}
639 
640 	/* update the WQE */
641 	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);
642 
643 	if (prev_addr != NULL) {
644 		/*
645 		 * Calculate the "next" field of the descriptor.  This amounts
646 		 * to setting up the "next_wqe_addr", "dbd", and "nds" fields
647 		 * (see tavor_hw.h for more).
648 		 */
649 		next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
650 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
651 		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
652 
653 		/*
654 		 * If this WQE is supposed to be linked to the previous
655 		 * descriptor, then we need to update the previous
656 		 * WQE's "next" field while leaving this WQE's
657 		 * "ctrl" fields untouched.
658 		 */
659 		TAVOR_WQE_LINKFIRST(prev_addr, next);
660 	}
661 }
662 
663 /*
664  * dapli_tavor_wqe_srq_build()
665  * Builds the recv WQE for a given ibt_recv_wr_t
666  */
667 static DAT_RETURN
668 dapli_tavor_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
669     uint64_t *addr)
670 {
671 	tavor_hw_wqe_sgl_t	*ds;
672 	ibt_wr_ds_t		end_sgl;
673 	int			i;
674 	int			num_ds;
675 
676 	/* Fill in the Data Segments (SGL) for the Recv WQE */
677 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
678 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
679 	num_ds = 0;
680 
681 	/* Check for valid number of SGL entries */
682 	if (wr->wr_nds > srq->srq_wq_sgl) {
683 		return (DAT_INVALID_PARAMETER);
684 	}
685 
686 	/*
687 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
688 	 * segments.  Note: We skip any SGL with zero size because Tavor
689 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
690 	 * the encoding for zero means a 2GB transfer.  Because of this special
691 	 * encoding in the hardware, we mask the requested length with
692 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
693 	 * zero.)
694 	 */
695 	for (i = 0; i < wr->wr_nds; i++) {
696 		if (wr->wr_sgl[i].ds_len == 0) {
697 			continue;
698 		}
699 
700 		/*
701 		 * Fill in the Data Segment(s) for the receive WQE, using the
702 		 * information contained in the scatter-gather list of the
703 		 * work request.
704 		 */
705 		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
706 		num_ds++;
707 	}
708 
709 	/*
710 	 * For SRQ, if the number of data segments is less than the maximum
711 	 * specified at alloc, then we have to fill in a special "key" entry in
712 	 * the sgl entry after the last valid one in this post request.  We do
713 	 * that here.
714 	 */
715 	if (num_ds < srq->srq_wq_sgl) {
716 		end_sgl.ds_va  = (ib_vaddr_t)0;
717 		end_sgl.ds_len = (ib_msglen_t)0;
718 		end_sgl.ds_key = (ibt_lkey_t)1;
719 		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
720 	}
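	/*
	 * (Sketch of the intent: the zero-length segment with L_Key 1,
	 * presumably an invalid key, acts as an end-of-list sentinel so
	 * that the hardware stops scanning the SGL there.)
	 */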
721 
722 	return (DAT_SUCCESS);
723 }
724 
725 /*
726  * dapli_tavor_wqe_srq_linknext()
727  * Links a srq recv WQE to the prev chain
728  */
729 static void
730 dapli_tavor_wqe_srq_linknext(uint64_t *curr_addr, boolean_t ns,
731     uint32_t curr_desc, uint64_t *prev_addr)
732 {
733 	uint64_t	next;
734 	uint64_t	ctrl = 0;
735 
736 	/*
737 	 * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
738 	 * at a time).  If there is no next descriptor (i.e. if the current
739 	 * descriptor is the last WQE on the chain), then set "next" field
740 	 * to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
741 	 * requires the "dbd" bit to be set to one for all Recv WQEs.
742 	 * In either case, we must add a single bit in the "reserved" field
743 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
744 	 * workaround for a known Tavor errata that can cause Recv WQEs with
745 	 * zero in the NDA field to behave improperly.
746 	 *
747 	 * If notification suppression is not desired then we set
748 	 * the "E" bit in the ctrl field.
749 	 */
750 
751 	next = TAVOR_RCV_WQE_NDA0_WA_MASK;
752 	if (!ns) { /* notification needed - so set the "E" bit */
753 		ctrl = TAVOR_WQE_RCV_EVENT_MASK;
754 	}
755 
756 	/* update the WQE */
757 	TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);
758 
759 	if (prev_addr != NULL) {
760 		/*
761 		 * Calculate the "next" field of the descriptor.  This amounts
762 		 * to setting up the "next_wqe_addr", "dbd", and "nds" fields
763 		 * (see tavor_hw.h for more).
764 		 */
765 		next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
766 		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
767 
768 		/*
769 		 * If this WQE is supposed to be linked to the previous
770 		 * descriptor, then we need to update the previous
771 		 * WQE's "next" field while leaving this WQE's
772 		 * "ctrl" fields untouched.
773 		 */
774 		TAVOR_WQE_LINKFIRST(prev_addr, next);
775 	}
776 }
777 
778 /*
779  * dapli_tavor_cq_peek()
780  * Peeks into a given CQ to check if there are any events that can be
781  * polled. It returns the number of CQEs that can be polled.
782  */
783 static void
784 dapli_tavor_cq_peek(ib_cq_handle_t cq, int *num_cqe)
785 {
786 	tavor_hw_cqe_t		*cqe;
787 	uint32_t		imm_eth_pkey_cred;
788 	uint32_t		cons_indx;
789 	uint32_t		wrap_around_mask;
790 	uint32_t		polled_cnt;
791 	uint_t			doorbell_cnt;
792 	uint_t			opcode;
793 
794 	/* Get the consumer index */
795 	cons_indx = cq->cq_consindx;
796 
797 	/*
798 	 * Calculate the wrap around mask.  Note: This operation only works
799 	 * because all Tavor completion queues have power-of-2 sizes
800 	 */
801 	wrap_around_mask = (cq->cq_size - 1);
802 
803 	/* Calculate the pointer to the first CQ entry */
804 	cqe = &cq->cq_addr[cons_indx];
805 
806 	/*
807 	 * Count entries in the CQ until we find an entry owned by
808 	 * the hardware.
809 	 */
810 	polled_cnt = 0;
811 	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
812 		opcode = TAVOR_CQE_OPCODE_GET(cqe);
813 		/* Error CQEs map to multiple work completions */
814 		if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
815 		    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
816 			imm_eth_pkey_cred =
817 			    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
818 			doorbell_cnt =
819 			    imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
820 			polled_cnt += (doorbell_cnt + 1);
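			/*
			 * e.g. an error CQE whose doorbell count is 3
			 * stands for 3 + 1 = 4 work completions.
			 */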
821 		} else {
822 			polled_cnt++;
823 		}
824 		/* Increment the consumer index */
825 		cons_indx = (cons_indx + 1) & wrap_around_mask;
826 
827 		/* Update the pointer to the next CQ entry */
828 		cqe = &cq->cq_addr[cons_indx];
829 	}
830 
831 	*num_cqe = polled_cnt;
832 }
833 
834 /*
835  * dapli_tavor_cq_poll()
836  * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
837  * array that is passed in.
838  */
839 static DAT_RETURN
840 dapli_tavor_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
841     uint_t *num_polled)
842 {
843 	tavor_hw_cqe_t		*cqe;
844 	uint32_t		cons_indx;
845 	uint32_t		wrap_around_mask;
846 	uint32_t		polled_cnt;
847 	uint32_t		num_to_increment;
848 	DAT_RETURN		dat_status;
849 	int			status;
850 
851 	/* Get the consumer index */
852 	cons_indx = cq->cq_consindx;
853 
854 	/*
855 	 * Calculate the wrap around mask.  Note: This operation only works
856 	 * because all Tavor completion queues have power-of-2 sizes
857 	 */
858 	wrap_around_mask = (cq->cq_size - 1);
859 
860 	/* Calculate the pointer to the first CQ entry */
861 	cqe = &cq->cq_addr[cons_indx];
862 
863 	/*
864 	 * Keep pulling entries from the CQ until we find an entry owned by
865 	 * the hardware.  As long as the CQEs are owned by SW, process
866 	 * each entry by calling dapli_tavor_cq_cqe_consume() and updating the
867 	 * CQ consumer index.  Note:  We only update the consumer index if
868 	 * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
869 	 * Otherwise, it indicates that we are going to "recycle" the CQE
870 	 * (probably because it is an error CQE and corresponds to more than one
871 	 * completion).
872 	 */
873 	polled_cnt = 0;
874 	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
875 		status = dapli_tavor_cq_cqe_consume(cq, cqe,
876 		    &wc_p[polled_cnt++]);
877 		if (status == TAVOR_CQ_SYNC_AND_DB) {
878 			/* Reset entry to hardware ownership */
879 			TAVOR_CQE_OWNER_SET_HW(cqe);
880 
881 			/* Increment the consumer index */
882 			cons_indx = (cons_indx + 1) & wrap_around_mask;
883 
884 			/* Update the pointer to the next CQ entry */
885 			cqe = &cq->cq_addr[cons_indx];
886 		}
887 
888 		/*
889 		 * If we have run out of space to store work completions,
890 		 * then stop and return the ones we have pulled off the CQ.
891 		 */
892 		if (polled_cnt >= num_wc) {
893 			break;
894 		}
895 	}
896 
897 	dat_status = DAT_SUCCESS;
898 	/*
899 	 * Now we only ring the doorbell (to update the consumer index) if
900 	 * we've actually consumed a CQ entry.  If we have, for example,
901 	 * pulled from a CQE that we are still in the process of "recycling"
902 	 * for error purposes, then we would not update the consumer index.
903 	 */
904 	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
905 		/*
906 		 * Post doorbell to update the consumer index.  Doorbell
907 		 * value indicates number of entries consumed (minus 1)
908 		 */
909 		if (cons_indx > cq->cq_consindx) {
910 			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
911 		} else {
912 			num_to_increment = ((cons_indx + cq->cq_size) -
913 			    cq->cq_consindx) - 1;
914 		}
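		/*
		 * e.g. with cq_size 256, old index 250 and new index 4:
		 * (4 + 256 - 250) - 1 = 9 is posted for the ten entries
		 * consumed across the wrap.
		 */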
915 		cq->cq_consindx = cons_indx;
916 		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
917 		    cq->cq_num, num_to_increment);
918 	} else if (polled_cnt == 0) {
919 		/*
920 		 * If the CQ is empty, we can try to free up some of the WRID
921 		 * list containers.
922 		 */
923 		if (cq->cq_wrid_reap_head)	/* look before leaping */
924 			dapls_tavor_wrid_cq_reap(cq);
925 		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
926 	}
927 
928 	if (num_polled != NULL) {
929 		*num_polled = polled_cnt;
930 	}
931 
932 	return (dat_status);
933 }
934 
935 /*
936  * dapli_tavor_cq_poll_one()
937  * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
938  * that is passed in.  See above for more comments/details.
939  */
940 static DAT_RETURN
941 dapli_tavor_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
942 {
943 	tavor_hw_cqe_t		*cqe;
944 	uint32_t		cons_indx;
945 	DAT_RETURN		dat_status;
946 	int			status;
947 
948 	/* Get the consumer index */
949 	cons_indx = cq->cq_consindx;
950 
951 	/* Calculate the pointer to the first CQ entry */
952 	cqe = &cq->cq_addr[cons_indx];
953 
954 	/*
955 	 * Keep pulling entries from the CQ until we find an entry owned by
956 	 * the hardware.  As long as the CQEs are owned by SW, process
957 	 * each entry by calling dapli_tavor_cq_cqe_consume() and updating the
958 	 * CQ consumer index.  Note:  We only update the consumer index if
959 	 * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
960 	 * Otherwise, it indicates that we are going to "recycle" the CQE
961 	 * (probably because it is an error CQE and corresponds to more than one
962 	 * completion).
963 	 */
964 	if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
965 		status = dapli_tavor_cq_cqe_consume(cq, cqe, wc_p);
966 		if (status == TAVOR_CQ_SYNC_AND_DB) {
967 			/* Reset entry to hardware ownership */
968 			TAVOR_CQE_OWNER_SET_HW(cqe);
969 
970 			/* Increment the consumer index */
971 			cq->cq_consindx =
972 			    (cons_indx + 1) & (cq->cq_size - 1);
973 			dapli_tavor_cq_doorbell(cq->cq_iauar,
974 			    TAVOR_CQDB_INCR_CONSINDX,
975 			    cq->cq_num, 0);
976 		}
977 		dat_status = DAT_SUCCESS;
978 	} else {
979 		if (cq->cq_wrid_reap_head)	/* look before leaping */
980 			dapls_tavor_wrid_cq_reap(cq);
981 		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
982 	}
983 	return (dat_status);
984 }
985 
986 /*
987  * dapli_tavor_cq_cqe_consume()
988  * Converts a given CQE into a ibt_wc_t object
989  */
990 static int
991 dapli_tavor_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
992     ibt_wc_t *wc)
993 {
994 	uint_t		flags;
995 	uint_t		type;
996 	uint_t		opcode;
997 	int		status;
998 
999 	/*
1000 	 * Determine if this is an "error" CQE by examining "opcode".  If it
1001 	 * is an error CQE, then call dapli_tavor_cq_errcqe_consume() and return
1002 	 * whatever status it returns.  Otherwise, this is a successful
1003 	 * completion.
1004 	 */
1005 	opcode = TAVOR_CQE_OPCODE_GET(cqe);
1006 	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1007 	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1008 		status = dapli_tavor_cq_errcqe_consume(cqhdl, cqe, wc);
1009 		return (status);
1010 	}
1011 
1012 	/*
1013 	 * Fetch the Work Request ID using the information in the CQE.
1014 	 * See tavor_wr.c for more details.
1015 	 */
1016 	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
1017 	    TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
1018 	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
1019 
1020 	/*
1021 	 * Parse the CQE opcode to determine completion type.  This will set
1022 	 * not only the type of the completion, but also any flags that might
1023 	 * be associated with it (e.g. whether immediate data is present).
1024 	 */
1025 	flags = IBT_WC_NO_FLAGS;
1026 	if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {
1027 
1028 		/*
1029 		 * Send CQE
1030 		 *
1031 		 * The following opcodes will not be generated in uDAPL
1032 		 * case TAVOR_CQE_SND_RDMAWR_IMM:
1033 		 * case TAVOR_CQE_SND_SEND_IMM:
1034 		 * case TAVOR_CQE_SND_ATOMIC_CS:
1035 		 * case TAVOR_CQE_SND_ATOMIC_FA:
1036 		 */
1037 		switch (opcode) {
1038 		case TAVOR_CQE_SND_RDMAWR:
1039 			type = IBT_WRC_RDMAW;
1040 			break;
1041 
1042 		case TAVOR_CQE_SND_SEND:
1043 			type = IBT_WRC_SEND;
1044 			break;
1045 
1046 		case TAVOR_CQE_SND_RDMARD:
1047 			type = IBT_WRC_RDMAR;
1048 			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
1049 			break;
1050 
1051 		case TAVOR_CQE_SND_BIND_MW:
1052 			type = IBT_WRC_BIND;
1053 			break;
1054 
1055 		default:
1056 			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
1057 			return (TAVOR_CQ_SYNC_AND_DB);
1058 		}
1059 	} else {
1060 
1061 		/*
1062 		 * Receive CQE
1063 		 *
1064 		 * The following opcodes will not be generated in uDAPL
1065 		 *
1066 		 * case TAVOR_CQE_RCV_RECV_IMM:
1067 		 * case TAVOR_CQE_RCV_RECV_IMM2:
1068 		 * case TAVOR_CQE_RCV_RDMAWR_IMM:
1069 		 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
1070 		 */
1071 		switch (opcode & 0x1F) {
1072 		case TAVOR_CQE_RCV_RECV:
1073 			/* FALLTHROUGH */
1074 		case TAVOR_CQE_RCV_RECV2:
1075 			type = IBT_WRC_RECV;
1076 			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
1077 			break;
1078 		default:
1079 			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
1080 			return (TAVOR_CQ_SYNC_AND_DB);
1081 		}
1082 	}
1083 	wc->wc_type = type;
1084 	wc->wc_flags = flags;
1085 	/* If we got here, completion status must be success */
1086 	wc->wc_status = IBT_WC_SUCCESS;
1087 
1088 	return (TAVOR_CQ_SYNC_AND_DB);
1089 }
1090 
1091 
1092 /*
1093  * dapli_tavor_cq_errcqe_consume()
1094  */
1095 static int
1096 dapli_tavor_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
1097     ibt_wc_t *wc)
1098 {
1099 	dapls_tavor_wrid_entry_t	wre;
1100 	uint32_t		next_wqeaddr;
1101 	uint32_t		imm_eth_pkey_cred;
1102 	uint_t			nextwqesize, dbd;
1103 	uint_t			doorbell_cnt, status;
1104 	uint_t			opcode = TAVOR_CQE_OPCODE_GET(cqe);
1105 
1106 	dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
1107 	    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
1108 	    TAVOR_CQE_WQEADDRSZ_GET(cqe));
1109 
1110 	/*
1111 	 * Fetch the Work Request ID using the information in the CQE.
1112 	 * See tavor_wr.c for more details.
1113 	 */
1114 	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
1115 	    (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
1116 	    TAVOR_COMPLETION_RECV, 1, &wre);
1117 	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
1118 
1119 	/*
1120 	 * Parse the CQE opcode to determine completion type.  We know that
1121 	 * the CQE is an error completion, so we extract only the completion
1122 	 * status here.
1123 	 */
1124 	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
1125 	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1126 	switch (status) {
1127 	case TAVOR_CQE_LOC_LEN_ERR:
1128 		status = IBT_WC_LOCAL_LEN_ERR;
1129 		break;
1130 
1131 	case TAVOR_CQE_LOC_OP_ERR:
1132 		status = IBT_WC_LOCAL_CHAN_OP_ERR;
1133 		break;
1134 
1135 	case TAVOR_CQE_LOC_PROT_ERR:
1136 		status = IBT_WC_LOCAL_PROTECT_ERR;
1137 		break;
1138 
1139 	case TAVOR_CQE_WR_FLUSHED_ERR:
1140 		status = IBT_WC_WR_FLUSHED_ERR;
1141 		break;
1142 
1143 	case TAVOR_CQE_MW_BIND_ERR:
1144 		status = IBT_WC_MEM_WIN_BIND_ERR;
1145 		break;
1146 
1147 	case TAVOR_CQE_BAD_RESPONSE_ERR:
1148 		status = IBT_WC_BAD_RESPONSE_ERR;
1149 		break;
1150 
1151 	case TAVOR_CQE_LOCAL_ACCESS_ERR:
1152 		status = IBT_WC_LOCAL_ACCESS_ERR;
1153 		break;
1154 
1155 	case TAVOR_CQE_REM_INV_REQ_ERR:
1156 		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1157 		break;
1158 
1159 	case TAVOR_CQE_REM_ACC_ERR:
1160 		status = IBT_WC_REMOTE_ACCESS_ERR;
1161 		break;
1162 
1163 	case TAVOR_CQE_REM_OP_ERR:
1164 		status = IBT_WC_REMOTE_OP_ERR;
1165 		break;
1166 
1167 	case TAVOR_CQE_TRANS_TO_ERR:
1168 		status = IBT_WC_TRANS_TIMEOUT_ERR;
1169 		break;
1170 
1171 	case TAVOR_CQE_RNRNAK_TO_ERR:
1172 		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1173 		break;
1174 
1175 	/*
1176 	 * The following error codes are not supported in the Tavor driver
1177 	 * as they relate only to Reliable Datagram completion statuses:
1178 	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1179 	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1180 	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1181 	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
1182 	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
1183 	 *    case TAVOR_CQE_LOC_EEC_ERR:
1184 	 */
1185 
1186 	default:
1187 		status = IBT_WC_LOCAL_CHAN_OP_ERR;
1188 		break;
1189 	}
1190 	wc->wc_status = status;
1191 	wc->wc_type = 0;
1192 	/*
1193 	 * Now we do all the checking that's necessary to handle completion
1194 	 * queue entry "recycling"
1195 	 *
1196 	 * It is not necessary here to try to sync the WQE as we are only
1197 	 * attempting to read from the Work Queue (and hardware does not
1198 	 * write to it).
1199 	 */
1200 
1201 	/*
1202 	 * We can get doorbell info, WQE address, size for the next WQE
1203 	 * from the "wre" (which was filled in above in the call to the
1204 	 * dapls_tavor_wrid_get_entry() routine)
1205 	 */
1206 	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1207 	next_wqeaddr = wre.wr_wqeaddrsz;
1208 	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
1209 
1210 	/*
1211 	 * Get the doorbell count from the CQE.  This indicates how many
1212 	 * completions this one CQE represents.
1213 	 */
1214 	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1215 
1216 	/*
1217 	 * Determine if we're ready to consume this CQE yet or not.  If the
1218 	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1219 	 * is down to zero, then this is the last/only completion represented
1220 	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
1221 	 * current CQE needs to be recycled (see below).
1222 	 */
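	/*
	 * Walkthrough: a CQE arriving with doorbell count 2 (all of its
	 * WQEs doorbelled) is recycled twice (count 2 -> 1 -> 0) and
	 * consumed on the third pass, i.e. it yields 2 + 1 = 3 work
	 * completions, matching the accounting in dapli_tavor_cq_peek().
	 */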
1223 	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1224 		/*
1225 		 * Consume the CQE
1226 		 *    Return status to indicate that doorbell and sync may be
1227 		 *    necessary.
1228 		 */
1229 		return (TAVOR_CQ_SYNC_AND_DB);
1230 
1231 	} else {
1232 		/*
1233 		 * Recycle the CQE for use in the next PollCQ() call
1234 		 *    Decrement the doorbell count, modify the error status,
1235 		 *    and update the WQE address and size (to point to the
1236 		 *    next WQE on the chain).  Put these updated entries back
1237 		 *    into the CQE.
1238 		 *    Despite the fact that we have updated the CQE, it is not
1239 		 *    necessary for us to attempt to sync this entry just yet
1240 		 *    as we have not changed the "hardware's view" of the
1241 		 *    entry (i.e. we have not modified the "owner" bit, which
1242 		 *    is all that the Tavor hardware really cares about).
1243 		 */
1244 		doorbell_cnt = doorbell_cnt - dbd;
1245 		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cqe,
1246 		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1247 		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1248 		TAVOR_CQE_WQEADDRSZ_SET(cqe,
1249 		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1250 		dapl_dbg_log(DAPL_DBG_TYPE_EVD,
1251 		    "errcqe_consume: recycling cqe.eth=%x, wqe=%x\n",
1252 		    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
1253 		    TAVOR_CQE_WQEADDRSZ_GET(cqe));
1254 		return (TAVOR_CQ_RECYCLE_ENTRY);
1255 	}
1256 }
1257 
1258 /*
1259  * dapli_tavor_cq_notify()
1260  * This function is used for arming the CQ by ringing the CQ doorbell.
1261  */
1262 static DAT_RETURN
1263 dapli_tavor_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
1264 {
1265 	uint32_t	cqnum;
1266 
1267 	/*
1268 	 * Determine if we are trying to get the next completion or the next
1269 	 * "solicited" completion.  Then hit the appropriate doorbell.
1270 	 */
1271 	cqnum = cq->cq_num;
1272 	if (flags == IB_NOTIFY_ON_NEXT_COMP) {
1273 		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_CQ,
1274 		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);
1275 
1276 	} else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
1277 		dapli_tavor_cq_doorbell(cq->cq_iauar,
1278 		    TAVOR_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
1279 		    TAVOR_CQDB_DEFAULT_PARAM);
1280 
1281 	} else if (flags == IB_NOTIFY_ON_NEXT_NCOMP) {
1282 		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_NCQ,
1283 		    cqnum, param);
1284 	} else {
1285 		return (DAT_INVALID_PARAMETER);
1286 	}
1287 
1288 	return (DAT_SUCCESS);
1289 }
1290 
1291 /*
1292  * dapli_tavor_post_send()
1293  */
1294 static DAT_RETURN
1295 dapli_tavor_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
1296 {
1297 	tavor_sw_wqe_dbinfo_t		dbinfo;
1298 	dapls_tavor_wrid_list_hdr_t	*wridlist;
1299 	dapls_tavor_wrid_entry_t	*wre_last;
1300 	uint32_t			desc;
1301 	uint64_t			*wqe_addr;
1302 	uint32_t			desc_sz;
1303 	uint32_t			wqeaddrsz, signaled_dbd = 0;
1304 	uint32_t			head, tail, next_tail, qsize_msk;
1305 	int				status;
1306 	ib_qp_handle_t			qp;
1307 
1308 	if ((ep->qp_state == IBT_STATE_RESET) ||
1309 	    (ep->qp_state == IBT_STATE_INIT) ||
1310 	    (ep->qp_state == IBT_STATE_RTR)) {
1311 		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1312 		    "post_send: invalid qp_state %d\n", ep->qp_state);
1313 		return (DAT_INVALID_STATE);
1314 	}
1315 
1316 	qp = ep->qp_handle;
1317 
1318 	/* Grab the lock for the WRID list */
1319 	dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1320 	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;
1321 
1322 	/* Save away some initial QP state */
1323 	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
1324 	tail	  = qp->qp_sq_wqhdr->wq_tail;
1325 	head	  = qp->qp_sq_wqhdr->wq_head;
1326 
1327 	/*
1328 	 * Check for "queue full" condition.  If the queue is already full,
1329 	 * then no more WQEs can be posted, return an error
1330 	 */
1331 	if (qp->qp_sq_wqhdr->wq_full != 0) {
1332 		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1333 		return (DAT_INSUFFICIENT_RESOURCES);
1334 	}
1335 
1336 	/*
1337 	 * Increment the "tail index" and check for "queue full" condition.
1338 	 * If we detect that the current work request is going to fill the
1339 	 * work queue, then we mark this condition and continue.
1340 	 */
1341 	next_tail = (tail + 1) & qsize_msk;
1342 	if (next_tail == head) {
1343 		qp->qp_sq_wqhdr->wq_full = 1;
1344 	}
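	/*
	 * e.g. with wq_size 8 (qsize_msk 7), tail 6 and head 7: posting
	 * into slot 6 makes next_tail 7 == head, marking the queue full.
	 */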
1345 
1346 	/*
1347 	 * Get the user virtual address of the location where the next
1348 	 * Send WQE should be built
1349 	 */
1350 	wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);
1351 
1352 	/*
1353 	 * Call tavor_wqe_send_build() to build the WQE at the given address.
1354 	 * This routine uses the information in the ibt_send_wr_t and
1355 	 * returns the size of the WQE when it returns.
1356 	 */
1357 	status = dapli_tavor_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
1358 	if (status != DAT_SUCCESS) {
1359 		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1360 		return (status);
1361 	}
1362 
1363 	/*
1364 	 * Get the descriptor (IO address) corresponding to the location
1365 	 * where the Send WQE was built.
1366 	 */
1367 	desc = TAVOR_QP_SQ_DESC(qp, tail);
1368 
1369 	dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
1370 	    desc <= (qp->qp_sq_desc_addr +
1371 	    qp->qp_sq_numwqe*qp->qp_sq_wqesz));
1372 
1373 	/*
1374 	 * Add a WRID entry to the WRID list.  Need to calculate the
1375 	 * "wqeaddrsz" and "signaled_dbd" values to pass to
1376 	 * dapli_tavor_wrid_add_entry()
1377 	 */
1378 	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);
1379 
1380 	if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1381 		signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
1382 	}
1383 
1384 	dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
1385 	    signaled_dbd);
1386 
1387 	/*
1388 	 * Now link the wqe to the old chain (if there was one)
1389 	 */
1390 	dapli_tavor_wqe_send_linknext(wr, wqe_addr, ns, desc, desc_sz,
1391 	    qp->qp_sq_lastwqeaddr, &dbinfo);
1392 
1393 	/*
1394 	 * Now if the WRID tail entry is non-NULL, then this
1395 	 * represents the entry to which we are chaining the
1396 	 * new entries.  Since we are going to ring the
1397 	 * doorbell for this WQE, we want to set its "dbd" bit.
1398 	 *
1399 	 * On the other hand, if the tail is NULL, even though
1400 	 * we will have rung the doorbell for the previous WQE
1401 	 * (for the hardware's sake) it is irrelevant to our
1402 	 * purposes (for tracking WRIDs) because we know the
1403 	 * request must have already completed.
1404 	 */
1405 	wre_last = wridlist->wl_wre_old_tail;
1406 	if (wre_last != NULL) {
1407 		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1408 	}
1409 
1410 	/* Update some of the state in the QP */
1411 	qp->qp_sq_lastwqeaddr	 = wqe_addr;
1412 	qp->qp_sq_wqhdr->wq_tail = next_tail;
1413 
1414 	/* Ring the doorbell */
1415 	dapli_tavor_qp_send_doorbell(qp->qp_iauar, desc, desc_sz,
1416 	    qp->qp_num, dbinfo.db_fence, dbinfo.db_nopcode);
1417 
1418 	dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1419 
1420 	return (DAT_SUCCESS);
1421 }
1422 
1423 /*
1424  * dapli_tavor_post_recv()
1425  */
1426 static DAT_RETURN
1427 dapli_tavor_post_recv(DAPL_EP	*ep, ibt_recv_wr_t *wr, boolean_t ns)
1428 {
1429 	dapls_tavor_wrid_list_hdr_t	*wridlist;
1430 	dapls_tavor_wrid_entry_t	*wre_last;
1431 	ib_qp_handle_t			qp;
1432 	DAT_RETURN			status;
1433 	uint32_t			desc;
1434 	uint64_t			*wqe_addr;
1435 	uint32_t			desc_sz;
1436 	uint32_t			wqeaddrsz;
1437 	uint32_t			head, tail, next_tail, qsize_msk;
1438 
1439 	if (ep->qp_state == IBT_STATE_RESET) {
1440 		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1441 		    "post_recv: invalid qp_state %d\n", ep->qp_state);
1442 		return (DAT_INVALID_STATE);
1443 	}
1444 	qp = ep->qp_handle;
1445 
1446 	/* Grab the lock for the WRID list */
1447 	dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1448 	wridlist  = qp->qp_rq_wqhdr->wq_wrid_post;
1449 
1450 	/* Save away some initial QP state */
1451 	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
1452 	tail	  = qp->qp_rq_wqhdr->wq_tail;
1453 	head	  = qp->qp_rq_wqhdr->wq_head;
1454 
1455 	/*
1456 	 * For the ibt_recv_wr_t passed in, parse the request and build a
1457 	 * Recv WQE. Link the WQE with the previous WQE and ring the
1458 	 * doorbell.
1459 	 */
1460 
1461 	/*
1462 	 * Check for "queue full" condition.  If the queue is already full,
1463 	 * then no more WQEs can be posted. So return an error.
1464 	 */
1465 	if (qp->qp_rq_wqhdr->wq_full != 0) {
1466 		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1467 		return (DAT_INSUFFICIENT_RESOURCES);
1468 	}
1469 
1470 	/*
1471 	 * Increment the "tail index" and check for "queue
1472 	 * full" condition.  If we detect that the current
1473 	 * work request is going to fill the work queue, then
1474 	 * we mark this condition and continue.
1475 	 */
1476 	next_tail = (tail + 1) & qsize_msk;
1477 	if (next_tail == head) {
1478 		qp->qp_rq_wqhdr->wq_full = 1;
1479 	}
1480 
1481 	/* Get the descriptor (IO Address) of the WQE to be built */
1482 	desc = TAVOR_QP_RQ_DESC(qp, tail);
1483 	/* The user virtual address of the WQE to be built */
1484 	wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);
1485 
1486 	/*
1487 	 * Call tavor_wqe_recv_build() to build the WQE at the given
1488 	 * address. This routine uses the information in the
1489 	 * ibt_recv_wr_t and returns the size of the WQE.
1490 	 */
1491 	status = dapli_tavor_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
1492 	if (status != DAT_SUCCESS) {
1493 		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1494 		return (DAT_INTERNAL_ERROR);
1495 	}
1496 
1497 	/*
1498 	 * Add a WRID entry to the WRID list.  Need to calculate the
1499 	 * "wqeaddrsz" and "signaled_dbd" values to pass to
1500 	 * dapli_tavor_wrid_add_entry().
1501 	 * Note: all Recv WQEs are essentially "signaled"
1502 	 */
1503 	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);
1504 	dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
1505 	    (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);
1506 
1507 	/*
1508 	 * Now link the chain to the old chain (if there was one)
1509 	 * and ring the doorbell for the recv work queue.
1510 	 */
1511 	dapli_tavor_wqe_recv_linknext(wqe_addr, ns, desc, desc_sz,
1512 	    qp->qp_rq_lastwqeaddr);
1513 
1514 	/*
1515 	 * Now if the WRID tail entry is non-NULL, then this
1516 	 * represents the entry to which we are chaining the
1517 	 * new entries.  Since we are going to ring the
1518 	 * doorbell for this WQE, we want to set its "dbd" bit.
1519 	 *
1520 	 * On the other hand, if the tail is NULL, even though
1521 	 * we will have rung the doorbell for the previous WQE
1522 	 * (for the hardware's sake) it is irrelevant to our
1523 	 * purposes (for tracking WRIDs) because we know the
1524 	 * request must have already completed.
1525 	 */
1526 	wre_last = wridlist->wl_wre_old_tail;
1527 	if (wre_last != NULL) {
1528 		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1529 	}
1530 
1531 	/* Update some of the state in the QP */
1532 	qp->qp_rq_lastwqeaddr	 = wqe_addr;
1533 	qp->qp_rq_wqhdr->wq_tail = next_tail;
1534 
1535 	/* Ring the doorbell */
1536 	dapli_tavor_qp_recv_doorbell(qp->qp_iauar, desc, desc_sz,
1537 	    qp->qp_num, 1);
1538 
1539 	dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1540 
1541 	return (DAT_SUCCESS);
1542 }
1543 
1544 /*
1545  * dapli_tavor_post_srq()
1546  */
1547 static DAT_RETURN
1548 dapli_tavor_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
1549 {
1550 	ib_srq_handle_t			srq;
1551 	DAT_RETURN			status;
1552 	uint32_t			desc;
1553 	uint64_t			*wqe_addr;
1554 	uint64_t			*last_wqe_addr;
1555 	uint32_t			head, next_head, qsize_msk;
1556 	uint32_t			wqe_index;
1557 
1558 
1559 	srq = srqp->srq_handle;
1560 
1561 	/* Grab the lock for the WRID list */
1562 	dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);
1563 
1564 	/*
1565 	 * For the ibt_recv_wr_t passed in, parse the request and build a
1566 	 * Recv WQE. Link the WQE with the previous WQE and ring the
1567 	 * door bell.
1568 	 */
1569 
1570 	/*
1571 	 * Check for "queue full" condition.  If the queue is already full,
1572 	 * i.e. there are no free entries, then no more WQEs can be posted.
1573 	 * So return an error.
1574 	 */
1575 	if (srq->srq_wridlist->wl_freel_entries == 0) {
1576 		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1577 		return (DAT_INSUFFICIENT_RESOURCES);
1578 	}
1579 
1580 	/* Save away some initial SRQ state */
1581 	qsize_msk = srq->srq_wridlist->wl_size - 1;
1582 	head	  = srq->srq_wridlist->wl_freel_head;
1583 
1584 	next_head = (head + 1) & qsize_msk;
1585 
1586 	/* Get the descriptor (IO Address) of the WQE to be built */
1587 	desc = srq->srq_wridlist->wl_free_list[head];
1588 
1589 	wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
1590 	    srq->srq_wq_wqesz);
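	/*
	 * In effect wqe_index is (desc - srq_wq_desc_addr) / srq_wq_wqesz,
	 * i.e. the slot number of this free descriptor in the SRQ work
	 * queue (an assumption about what TAVOR_SRQ_WQ_INDEX computes).
	 */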
1591 
1592 	/* The user virtual address of the WQE to be built */
1593 	wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);
1594 
1595 	/*
1596 	 * Call dapli_tavor_wqe_srq_build() to build the WQE at the given
1597 	 * address. This routine uses the information in the
1598 	 * ibt_recv_wr_t and returns the size of the WQE.
1599 	 */
1600 	status = dapli_tavor_wqe_srq_build(srq, wr, wqe_addr);
1601 	if (status != DAT_SUCCESS) {
1602 		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1603 		return (status);
1604 	}
1605 
1606 	/*
1607 	 * Add a WRID entry to the WRID list.
1608 	 */
1609 	dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);
1610 
1611 	if (srq->srq_wq_lastwqeindex == -1) {
1612 		last_wqe_addr = NULL;
1613 	} else {
1614 		last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
1615 		    srq->srq_wq_lastwqeindex);
1616 	}
1617 	/*
1618 	 * Now link the chain to the old chain (if there was one)
1619 	 * and ring the doorbell for the SRQ.
1620 	 */
1621 	dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);
1622 
1623 	/* Update some of the state in the SRQ */
1624 	srq->srq_wq_lastwqeindex	 = wqe_index;
1625 	srq->srq_wridlist->wl_freel_head = next_head;
1626 	srq->srq_wridlist->wl_freel_entries--;
1627 	dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
1628 	    srq->srq_wridlist->wl_size);
1629 
1630 	/* Ring the doorbell - for SRQ nds = 0 */
1631 	dapli_tavor_qp_recv_doorbell(srq->srq_iauar, desc, 0,
1632 	    srq->srq_num, 1);
1633 
1634 	dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1635 
1636 	return (DAT_SUCCESS);
1637 }
1638 
1639 /*
1640  * dapli_tavor_wrid_add_entry()
1641  */
1642 extern void
1643 dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *wq, uint64_t wrid,
1644     uint32_t wqeaddrsz, uint_t signaled_dbd)
1645 {
1646 	dapls_tavor_wrid_entry_t	*wre_tmp;
1647 	uint32_t			head, tail, size;
1648 
1649 	/*
1650 	 * Find the entry in the container pointed to by the "tail" index.
1651 	 * Add all of the relevant information to that entry, including WRID,
1652 	 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
1653 	 * and/or doorbelled.
1654 	 */
1655 	head = wq->wq_wrid_post->wl_head;
1656 	tail = wq->wq_wrid_post->wl_tail;
1657 	size = wq->wq_wrid_post->wl_size;
1658 	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
1659 	wre_tmp->wr_wrid	  = wrid;
1660 	wre_tmp->wr_wqeaddrsz	  = wqeaddrsz;
1661 	wre_tmp->wr_signaled_dbd  = signaled_dbd;
1662 
1663 	/*
1664 	 * Update the "wrid_old_tail" pointer to point to the entry we just
1665 	 * inserted into the queue.  By tracking this pointer (the pointer to
1666 	 * the most recently inserted entry) it will be possible later in the
1667 	 * PostSend() and PostRecv() code paths to find the entry that needs
1668 	 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
1669 	 * tavor_post_send()).
1670 	 */
1671 	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
1672 
1673 	/* Update the tail index */
1674 	tail = ((tail + 1) & (size - 1));
1675 	wq->wq_wrid_post->wl_tail = tail;
1676 
1677 	/*
1678 	 * If the "tail" index has just wrapped over into the "head" index,
1679 	 * then we have filled the container.  We use the "full" flag to
1680 	 * indicate this condition and to distinguish it from the "empty"
1681 	 * condition (where head and tail are also equal).
1682 	 */
1683 	if (head == tail) {
1684 		wq->wq_wrid_post->wl_full = 1;
1685 	}
1686 }
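
/*
 * Minimal illustrative sketch (hypothetical names, not used by the
 * library): the head/tail/full convention maintained above.  Because a
 * full ring and an empty ring both have head == tail, the separate
 * 'full' flag is what disambiguates the two states.
 */
typedef struct dapli_example_ring_s {
	uint32_t	er_head;
	uint32_t	er_tail;
	uint32_t	er_size;	/* power of two */
	int		er_full;
} dapli_example_ring_t;

static int
dapli_example_ring_is_empty(dapli_example_ring_t *er)
{
	/* head == tail means empty only when the full flag is clear */
	return ((er->er_head == er->er_tail) && (er->er_full == 0));
}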
1687 
1688 /*
1689  * dapli_tavor_wrid_add_entry_srq()
1690  */
1691 extern void
1692 dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t srq, uint64_t wrid,
1693     uint32_t wqe_index)
1694 {
1695 	dapls_tavor_wrid_entry_t	*wre;
1696 
1697 	/* ASSERT on impossible wqe_index values */
1698 	dapl_os_assert(wqe_index < srq->srq_wq_numwqe);
1699 
1700 	/*
1701 	 * Setup the WRE.
1702 	 *
	 * We store the WRID at the WRE offset given by 'wqe_index' and mark
	 * the WRE signaled_dbd so that, when the CQ is polled, the WRID can
	 * be associated with the WQE found on the CQE.
	 * Note: all Recv WQEs are essentially "signaled".
1707 	 */
1708 	wre = &srq->srq_wridlist->wl_wre[wqe_index];
1709 	wre->wr_wrid = wrid;
1710 	wre->wr_signaled_dbd = (uint32_t)TAVOR_WRID_ENTRY_SIGNALED;
1711 }
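
/*
 * Minimal illustrative sketch: because SRQ WREs are stored at the slot
 * named by 'wqe_index' (rather than in FIFO order), the WRID recorded
 * above can be recovered at poll time by direct indexing.  The function
 * name is hypothetical and the sketch is not called by the library.
 */
static uint64_t
dapli_example_srq_wrid_lookup(dapls_tavor_wrid_entry_t *wl_wre,
    uint32_t wqe_index)
{
	/* the WRE slot was filled by dapli_tavor_wrid_add_entry_srq() */
	return (wl_wre[wqe_index].wr_wrid);
}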
1712 
1713 /*
1714  * dapli_tavor_cq_srq_entries_flush()
1715  */
1716 static void
1717 dapli_tavor_cq_srq_entries_flush(ib_qp_handle_t qp)
1718 {
1719 	ib_cq_handle_t		cq;
1720 	dapls_tavor_workq_hdr_t	*wqhdr;
1721 	tavor_hw_cqe_t		*cqe;
1722 	tavor_hw_cqe_t		*next_cqe;
1723 	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
1724 	uint32_t		new_indx, check_indx, indx;
1725 	uint32_t		num_to_increment;
1726 	int			cqe_qpnum, cqe_type;
1727 	int			outstanding_cqes, removed_cqes;
1728 	int			i;
1729 
1730 	/* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */
1731 
1732 	cq = qp->qp_rq_cqhdl;
1733 	wqhdr = qp->qp_rq_wqhdr;
1734 
1735 	dapl_os_assert(wqhdr->wq_wrid_post != NULL);
1736 	dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);
1737 
1738 	/* Get the consumer index */
1739 	cons_indx = cq->cq_consindx;
1740 
1741 	/*
1742 	 * Calculate the wrap around mask.  Note: This operation only works
1743 	 * because all Tavor completion queues have power-of-2 sizes
1744 	 */
1745 	wrap_around_mask = (cq->cq_size - 1);
1746 
1747 	/* Calculate the pointer to the first CQ entry */
1748 	cqe = &cq->cq_addr[cons_indx];
1749 
1750 	/*
1751 	 * Loop through the CQ looking for entries owned by software.  If an
1752 	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to track the total number of entries on the CQ.  We use this
	 * value further down to bound the loop that searches for CQEs with
	 * our QP number.
1756 	 */
1757 	outstanding_cqes = 0;
1758 	tail_cons_indx = cons_indx;
1759 	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
1760 		/* increment total cqes count */
1761 		outstanding_cqes++;
1762 
1763 		/* increment the consumer index */
1764 		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1765 
1766 		/* update the pointer to the next cq entry */
1767 		cqe = &cq->cq_addr[tail_cons_indx];
1768 	}
1769 
1770 	/*
	 * Using the 'tail_cons_indx' that was just set, we now know the
	 * total number of outstanding CQEs.  Set both 'check_indx' and
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'.
1774 	 */
1775 	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
1776 
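	/*
	 * Worked example (illustrative): with cq_size = 8, cons_indx = 2 and
	 * four outstanding CQEs at indices 2..5, tail_cons_indx becomes 6 and
	 * the scan below starts at check_indx = new_indx = 5, walking
	 * backward.  Entries that are kept are copied up to 'new_indx' and so
	 * pack toward index 5, while entries freed to the SRQ leave
	 * 'new_indx' in place, so the freed slots collect at the low end.
	 */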
1777 	for (i = 0; i < outstanding_cqes; i++) {
1778 		cqe = &cq->cq_addr[check_indx];
1779 
1780 		/* Grab QP number from CQE */
1781 		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
1782 		cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);
1783 
1784 		/*
		 * If the QP number in the CQE matches that of the QP being
		 * flushed on this SRQ, then the entry must be freed back to
		 * the SRQ.  We also check that the completion is of the
		 * 'TAVOR_COMPLETION_RECV' type, so any send completions on
		 * this CQ are left as-is.  Returning entries to HW ownership
		 * is handled further down.
1791 		 */
1792 		if (cqe_qpnum == qp->qp_num &&
1793 		    cqe_type == TAVOR_COMPLETION_RECV) {
1794 			/* Add back to SRQ free list */
1795 			(void) dapli_tavor_wrid_find_match_srq(
1796 			    wqhdr->wq_wrid_post, cqe);
1797 		} else {
1798 			/* Do Copy */
1799 			if (check_indx != new_indx) {
1800 				next_cqe = &cq->cq_addr[new_indx];
1801 				/*
1802 				 * Copy the CQE into the "next_cqe"
1803 				 * pointer.
1804 				 */
1805 				(void) dapl_os_memcpy(next_cqe, cqe,
1806 				    sizeof (tavor_hw_cqe_t));
1807 			}
1808 			new_indx = (new_indx - 1) & wrap_around_mask;
1809 		}
1810 		/* Move index to next CQE to check */
1811 		check_indx = (check_indx - 1) & wrap_around_mask;
1812 	}
1813 
1814 	/* Initialize removed cqes count */
1815 	removed_cqes = 0;
1816 
1817 	/* If an entry was removed */
1818 	if (check_indx != new_indx) {
1819 
		/*
		 * At this point, all unclaimed entries have been copied
		 * toward the tail of the outstanding region, and the slots
		 * from 'cons_indx' through 'new_indx' (inclusive) hold the
		 * freed entries.  Mark those freed entries as having HW
		 * ownership before the consumer index is moved past them.
		 *
		 * The loop below terminates on 'new_indx' itself (rather
		 * than testing 'indx <= new_indx') so that it also behaves
		 * correctly when the freed region wraps past the end of
		 * the CQ.
		 */
		for (indx = cons_indx; ;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_addr[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);

			if (indx == new_indx) {
				break;
			}
		}
1837 	}
1838 
1839 	/*
	 * Update the consumer index to move past all removed entries.
	 * Because 'new_indx' points to the last freed slot (now returned to
	 * HW ownership), we add 1 so that cons_indx lands on the first
	 * remaining valid entry.
1844 	 */
1845 	cons_indx = (new_indx + 1) & wrap_around_mask;
1846 
1847 	/*
1848 	 * Now we only ring the doorbell (to update the consumer index) if
1849 	 * we've actually consumed a CQ entry.  If we found no QP number
1850 	 * matches above, then we would not have removed anything.  So only if
1851 	 * something was removed do we ring the doorbell.
1852 	 */
1853 	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1854 		/*
1855 		 * Post doorbell to update the consumer index.  Doorbell
1856 		 * value indicates number of entries consumed (minus 1)
1857 		 */
1858 		if (cons_indx > cq->cq_consindx) {
1859 			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1860 		} else {
1861 			num_to_increment = ((cons_indx + cq->cq_size) -
1862 			    cq->cq_consindx) - 1;
1863 		}
1864 		cq->cq_consindx = cons_indx;
1865 
1866 		dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
1867 		    cq->cq_num, num_to_increment);
1868 	}
1869 }
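
/*
 * Minimal illustrative sketch of the doorbell count computed above: the
 * value is the modular distance from the old consumer index to the new
 * one, minus 1.  With a power-of-2 cq_size the if/else can be folded
 * into one masked expression.  Hypothetical name; callers must ensure
 * the two indices differ, as the guard above does.
 */
static uint32_t
dapli_example_consindx_delta(uint32_t old_indx, uint32_t new_cons_indx,
    uint32_t cq_size)
{
	/* equivalent to the if/else computation above for old != new */
	return (((new_cons_indx + cq_size) - old_indx - 1) & (cq_size - 1));
}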
1870 
1871 /* ARGSUSED */
1872 static void
1873 dapli_tavor_qp_init(ib_qp_handle_t qp)
1874 {
1875 }
1876 
1877 /* ARGSUSED */
1878 static void
1879 dapli_tavor_cq_init(ib_cq_handle_t cq)
1880 {
1881 }
1882 
1883 /* ARGSUSED */
1884 static void
1885 dapli_tavor_srq_init(ib_srq_handle_t srq)
1886 {
1887 }
1888 
1889 void
1890 dapls_init_funcs_tavor(DAPL_HCA *hca_ptr)
1891 {
1892 	hca_ptr->post_send = dapli_tavor_post_send;
1893 	hca_ptr->post_recv = dapli_tavor_post_recv;
1894 	hca_ptr->post_srq = dapli_tavor_post_srq;
1895 	hca_ptr->cq_peek = dapli_tavor_cq_peek;
1896 	hca_ptr->cq_poll = dapli_tavor_cq_poll;
1897 	hca_ptr->cq_poll_one = dapli_tavor_cq_poll_one;
1898 	hca_ptr->cq_notify = dapli_tavor_cq_notify;
1899 	hca_ptr->srq_flush = dapli_tavor_cq_srq_entries_flush;
1900 	hca_ptr->qp_init = dapli_tavor_qp_init;
1901 	hca_ptr->cq_init = dapli_tavor_cq_init;
1902 	hca_ptr->srq_init = dapli_tavor_srq_init;
1903 	hca_ptr->hermon_resize_cq = 0;
1904 }
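
/*
 * Minimal illustrative sketch (hypothetical name, not part of the
 * library): common uDAPL code dispatches through the table filled in
 * by dapls_init_funcs_tavor() above, so it never calls the Tavor
 * entry points directly.  A populated SRQ-related entry, for example,
 * indicates that SRQ support was wired up for this HCA.
 */
static int
dapli_example_hca_has_srq(DAPL_HCA *hca_ptr)
{
	return ((hca_ptr->post_srq != NULL) &&
	    (hca_ptr->srq_flush != NULL));
}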
1905