xref: /titanic_50/usr/src/lib/udapl/udapl_tavor/tavor/dapl_hermon_hw.c (revision 1ed53a3f65abecaadc1b967e341970ad0f6b2aeb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include "dapl.h"
28 #include "dapl_tavor_hw.h"
29 #include "dapl_tavor_wr.h"
30 #include "dapl_tavor_ibtf_impl.h"
31 
32 #define	HERMON_WQE_SGL_INVALID_LKEY	0x00000100
33 #define	HERMON_WQE_SEND_FENCE_MASK	0x40
34 #define	HERMON_WQE_NDS_MASK		0x3F
35 
36 #define	HERMON_CQDB_NOTIFY_CQ_SOLICIT	(0x1 << 24)
37 #define	HERMON_CQDB_NOTIFY_CQ		(0x2 << 24)
38 
39 #define	HERMON_CQE_RCV_SEND		0x1
40 #define	HERMON_CQE_ERR_OPCODE		0x1E
41 #define	HERMON_CQE_RESIZE_OPCODE	0x16
42 #define	HERMON_CQE_OPCODE_GET(cqe)	(((uint8_t *)cqe)[31] & 0x1F)
43 #define	HERMON_CQE_SENDRECV_GET(cqe)	(((uint8_t *)cqe)[31] & 0x40)
44 #define	HERMON_CQE_OWNER_IS_SW(cq, cqe)	((((uint8_t *)cqe)[31] >> 7) == \
45 			((cq->cq_consindx & cq->cq_size) >> cq->cq_log_cqsz))
46 
47 #define	HERMON_QP_WQEADDRSZ(wcnt)	((uint32_t)(wcnt << 6))
48 
49 #define	HERMON_WQE_SEND_SIGNALED_MASK	0x0000000C00000000ull
50 #define	HERMON_WQE_SEND_SOLICIT_MASK	0x0000000200000000ull
51 #define	HERMON_WQE_SETCTRL(desc, ctrl)	\
52 	((uint64_t *)(desc))[1] = HTOBE_64(ctrl)
53 #define	HERMON_WQE_SETNEXT(desc, nopcode, size, fence)			\
54 	((uint64_t *)(desc))[0] = HTOBE_64((nopcode) | (size) | (fence) | \
55 	(((uint64_t)((uint8_t *)desc)[0] &0x80) << 56))
56 #define	HERMON_WQE_BUILD_DATA_SEG(ds, sgl)				\
57 {									\
58 	uint64_t		*tmp;					\
59 									\
60 	tmp	= (uint64_t *)(ds);					\
61 	tmp[1]	= HTOBE_64((sgl)->ds_va);				\
62 	((uint32_t *)tmp)[1] = HTOBE_32((sgl)->ds_key);			\
63 	membar_producer();						\
64 	((uint32_t *)tmp)[0] = HTOBE_32((sgl)->ds_len);			\
65 }
66 
67 
68 /* handy macro, useful because of cq_resize dynamics */
69 #define	cq_wrap_around_mask	(cq->cq_size - 1)
70 
71 pthread_spinlock_t hermon_bf_lock;
72 
73 /*
74  * Function signatures
75  */
76 extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
77     uint_t, uint_t, dapls_tavor_wrid_entry_t *);
78 extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
79 extern DAPL_OS_LOCK g_tavor_uar_lock;
80 
81 #ifndef	_LP64
82 extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
83 #endif
84 
85 static int dapli_hermon_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
86     uint64_t *, uint_t *);
87 static DAT_RETURN dapli_hermon_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
88     uint64_t *, uint_t *);
89 static int dapli_hermon_cq_cqe_consume(ib_cq_handle_t, uint32_t *, ibt_wc_t *);
90 static int dapli_hermon_cq_errcqe_consume(ib_cq_handle_t, uint32_t *,
91     ibt_wc_t *);
92 extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
93     uint32_t, uint_t);
94 extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);
95 
96 /*
97  * Note: The 64 bit doorbells need to be written atomically.
98  * In 32 bit libraries we need to use the special assembly rtn
99  * because compiler generated code splits into 2 word writes
100  */
101 
102 /*
103  * dapli_hermon_cq_doorbell()
104  * Takes the specified cq cmd and cq number and rings the cq doorbell
105  */
106 static void
dapli_hermon_cq_doorbell(dapls_hw_uar_t ia_uar,uint32_t cq_cmd,uint32_t cqn,uint32_t cmd_sn,uint32_t cq_param)107 dapli_hermon_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
108     uint32_t cmd_sn, uint32_t cq_param)
109 {
110 	uint64_t doorbell;
111 
112 	/* Build the doorbell from the parameters */
113 	doorbell = (cmd_sn | cq_cmd | cqn);
114 	doorbell = (doorbell << 32) | cq_param;
115 
116 	/* Write the doorbell to UAR */
117 #ifdef _LP64
118 	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
119 	/* 32 bit version */
120 #elif defined(i386)
121 	dapl_os_lock(&g_tavor_uar_lock);
122 	/*
123 	 * For 32 bit intel we assign the doorbell in the order
124 	 * prescribed by the Tavor PRM, lower to upper addresses
125 	 */
126 	((tavor_hw_uar32_t *)ia_uar)->cq[0] =
127 	    (uint32_t)HTOBE_32(doorbell >> 32);
128 	((tavor_hw_uar32_t *)ia_uar)->cq[1] =
129 	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
130 	dapl_os_unlock(&g_tavor_uar_lock);
131 #else
132 	dapls_atomic_assign_64(HTOBE_64(doorbell),
133 	    &((tavor_hw_uar_t *)ia_uar)->cq);
134 #endif
135 }
136 
137 /*
138  * dapli_hermon_qp_send_doorbell()
139  * Takes the specified qp number and rings the send doorbell.
140  */
141 static void
dapli_hermon_sq_dbreg(dapls_hw_uar_t ia_uar,uint32_t qpn)142 dapli_hermon_sq_dbreg(dapls_hw_uar_t ia_uar, uint32_t qpn)
143 {
144 	uint64_t doorbell;
145 
146 	doorbell = qpn << 8;
147 
148 	/* Write the doorbell to UAR */
149 #ifdef _LP64
150 	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
151 #else
152 #if defined(i386)
153 	dapl_os_lock(&g_tavor_uar_lock);
154 	/*
155 	 * For 32 bit intel we assign the doorbell in the order
156 	 * prescribed by the Tavor PRM, lower to upper addresses
157 	 */
158 	((tavor_hw_uar32_t *)ia_uar)->send[0] =
159 	    (uint32_t)HTOBE_32(doorbell >> 32);
160 	((tavor_hw_uar32_t *)ia_uar)->send[1] =
161 	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
162 	dapl_os_unlock(&g_tavor_uar_lock);
163 #else
164 	dapls_atomic_assign_64(HTOBE_64(doorbell),
165 	    &((tavor_hw_uar_t *)ia_uar)->send);
166 #endif
167 #endif
168 }
169 
170 /*
171  * dapli_hermon_wqe_send_build()
172  * Constructs a WQE for a given ibt_send_wr_t
173  */
174 static int
dapli_hermon_wqe_send_build(ib_qp_handle_t qp,ibt_send_wr_t * wr,uint64_t * addr,uint_t * size)175 dapli_hermon_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
176     uint64_t *addr, uint_t *size)
177 {
178 	tavor_hw_snd_wqe_remaddr_t	*rc;
179 	tavor_hw_snd_wqe_bind_t		*bn;
180 	tavor_hw_wqe_sgl_t		*ds;
181 	ibt_wr_ds_t			*sgl;
182 	uint8_t				*src, *dst, *maxdst;
183 	uint32_t			nds;
184 	int				len, thislen, maxlen;
185 	uint32_t			new_rkey;
186 	uint32_t			old_rkey;
187 	int				i, num_ds;
188 	int				max_inline_bytes = -1;
189 	uint64_t			ctrl;
190 	uint64_t			nopcode;
191 	uint_t				my_size;
192 
193 	nds = wr->wr_nds;
194 	sgl = wr->wr_sgl;
195 	num_ds = 0;
196 	ctrl = ((wr->wr_flags & IBT_WR_SEND_SIGNAL) ?
197 	    HERMON_WQE_SEND_SIGNALED_MASK : 0) |
198 	    ((wr->wr_flags & IBT_WR_SEND_SOLICIT) ?
199 	    HERMON_WQE_SEND_SOLICIT_MASK : 0);
200 
201 	/*
202 	 * RC is the only supported transport in UDAPL
203 	 * For RC requests, we allow "Send", "RDMA Read", "RDMA Write"
204 	 */
205 	switch (wr->wr_opcode) {
206 	case IBT_WRC_SEND:
207 		/*
208 		 * If this is a Send request, then all we need is
209 		 * the Data Segment processing below.
210 		 * Initialize the information for the Data Segments
211 		 */
212 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
213 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
214 		if (qp->qp_sq_inline != 0)
215 			max_inline_bytes =
216 			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
217 		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
218 		break;
219 	case IBT_WRC_RDMAW:
220 		if (qp->qp_sq_inline != 0)
221 			max_inline_bytes =
222 			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
223 		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
224 		/* FALLTHROUGH */
225 	case IBT_WRC_RDMAR:
226 		if (wr->wr_opcode == IBT_WRC_RDMAR) {
227 			if (qp->qp_sq_inline < 0)
228 				qp->qp_sq_inline = 0;
229 			nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
230 		}
231 		/*
232 		 * If this is an RDMA Read or RDMA Write request, then fill
233 		 * in the "Remote Address" header fields.
234 		 */
235 		rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
236 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
237 
238 		/*
239 		 * Build the Remote Address Segment for the WQE, using
240 		 * the information from the RC work request.
241 		 */
242 		TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);
243 
244 		/* Update "ds" for filling in Data Segments (below) */
245 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
246 		    sizeof (tavor_hw_snd_wqe_remaddr_t));
247 		break;
248 	case IBT_WRC_BIND:
249 		/*
250 		 * Generate a new R_key
251 		 * Increment the upper "unconstrained" bits and need to keep
252 		 * the lower "constrained" bits the same it represents
253 		 * the MPT index.
254 		 */
255 #if 0
256 	/* XXX - need equiv of "hermon_wr_bind_check(state, wr);" */
257 	/* XXX - uses hermon_mr_keycalc - what about Sinai vs. Arbel??? */
258 #endif
259 		old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
260 		new_rkey = old_rkey >> 8;	/* index */
261 		old_rkey = (old_rkey + 1) & 0xff; /* incremented key */
262 		new_rkey = (new_rkey << 8) | old_rkey;
263 
264 		wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
265 
266 		bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
267 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
268 
269 		/*
270 		 * Build the Bind Memory Window Segments for the WQE,
271 		 * using the information from the RC Bind memory
272 		 * window work request.
273 		 */
274 		TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);
275 
276 		/*
277 		 * Update the "ds" pointer.  Even though the "bind"
278 		 * operation requires no SGLs, this is necessary to
279 		 * facilitate the correct descriptor size calculations
280 		 * (below).
281 		 */
282 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
283 		    sizeof (tavor_hw_snd_wqe_bind_t));
284 		nds = 0;
285 		nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
286 		break;
287 	default:
288 		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
289 		    "dapli_hermon_wqe_send_build: invalid wr_opcode=%d\n",
290 		    wr->wr_opcode);
291 		return (DAT_INTERNAL_ERROR);
292 	}
293 
294 	/*
295 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
296 	 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer
297 	 * Start by checking for a valid number of SGL entries
298 	 */
299 	if (nds > qp->qp_sq_sgl) {
300 		return (DAT_INVALID_PARAMETER);
301 	}
302 
303 	/*
304 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
305 	 * segments.  Note: We skip any SGL with zero size because Tavor
306 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
307 	 * the encoding for zero means a 2GB transfer.  Because of this special
308 	 * encoding in the hardware, we mask the requested length with
309 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
310 	 * zero.)
311 	 */
312 	if (max_inline_bytes != -1) {		/* compute total_len */
313 		len = 0;
314 		for (i = 0; i < nds; i++)
315 			len += sgl[i].ds_len;
316 		if (len == 0)
317 			max_inline_bytes = -1; /* do not inline */
318 		else {
319 			/* need to reduce the length by dword "len" fields */
320 			max_inline_bytes -= (len / 64) * sizeof (uint32_t);
321 			if (len > max_inline_bytes)
322 				max_inline_bytes = -1;	/* too big for inline */
323 		}
324 	}
325 	if (max_inline_bytes != -1) {		/* do "inline" */
326 
327 		dst = (uint8_t *)((uint32_t *)ds + 1);
328 		maxdst = (uint8_t *)(((uintptr_t)dst + 64) & ~(64 - 1));
329 		maxlen = maxdst - dst;
330 		thislen = 0;
331 		i = 0;
332 		src = (uint8_t *)(uintptr_t)sgl[i].ds_va;
333 		len = sgl[i].ds_len;
334 		do {
335 			/* if this sgl overflows the inline segment */
336 			if (len > maxlen) {
337 				if (maxlen) /* might be 0 */
338 					(void) dapl_os_memcpy(dst,
339 					    src, maxlen);
340 				membar_producer();
341 				*(uint32_t *)ds =
342 				    HTOBE_32((thislen + maxlen) |
343 				    TAVOR_WQE_SGL_INLINE_MASK);
344 				thislen = 0;
345 				len -= maxlen;
346 				src += maxlen;
347 				dst = maxdst + sizeof (uint32_t);
348 				ds = (tavor_hw_wqe_sgl_t *)(void *)maxdst;
349 				maxdst += 64;
350 				maxlen = 64 - sizeof (uint32_t);
351 			} else { /* this sgl fully fits */
352 				(void) dapl_os_memcpy(dst,
353 				    src, len);
354 				maxlen -= len;  /* room left */
355 				thislen += len;
356 				dst += len;
357 				while (++i < nds)
358 					if (sgl[i].ds_len)
359 						break;
360 				if (i >= nds)
361 					break;
362 				src = (uint8_t *)(uintptr_t)sgl[i].ds_va;
363 				len = sgl[i].ds_len;
364 			}
365 		} while (i < nds);
366 		membar_producer();
367 		*(uint32_t *)ds = HTOBE_32(thislen |
368 		    TAVOR_WQE_SGL_INLINE_MASK);
369 
370 		/* Return the size of descriptor (in 16-byte chunks) */
371 		my_size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
372 		if (my_size <= (256 >> 4))
373 			*size = my_size;	/* use Hermon Blueflame */
374 		else
375 			*size = 0;
376 	} else {
377 		for (i = 0; i < nds; i++) {
378 			if (sgl[i].ds_len == 0) {
379 				continue;
380 			}
381 
382 			/*
383 			 * Fill in the Data Segment(s) for the current WQE,
384 			 * using the information contained in the
385 			 * scatter-gather list of the work request.
386 			 */
387 			HERMON_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
388 			num_ds++;
389 		}
390 
391 		/* Return the size of descriptor (in 16-byte chunks) */
392 		my_size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
393 		*size = 0;	/* do not use Hermon Blueflame */
394 	}
395 	HERMON_WQE_SETCTRL(addr, ctrl);
396 	membar_producer();
397 	HERMON_WQE_SETNEXT(addr, nopcode << 32, my_size,
398 	    (wr->wr_flags & IBT_WR_SEND_FENCE) ?
399 	    HERMON_WQE_SEND_FENCE_MASK : 0);
400 
401 	return (DAT_SUCCESS);
402 }
403 
404 /*
405  * dapli_hermon_wqe_recv_build()
406  * Builds the recv WQE for a given ibt_recv_wr_t
407  */
408 static DAT_RETURN
dapli_hermon_wqe_recv_build(ib_qp_handle_t qp,ibt_recv_wr_t * wr,uint64_t * addr,uint_t * size)409 dapli_hermon_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
410     uint64_t *addr, uint_t *size)
411 {
412 	tavor_hw_wqe_sgl_t	*ds;
413 	int			i;
414 	int			num_ds;
415 
416 	/* Fill in the Data Segments (SGL) for the Recv WQE */
417 	ds = (tavor_hw_wqe_sgl_t *)addr;
418 	num_ds = 0;
419 
420 	/* Check for valid number of SGL entries */
421 	if (wr->wr_nds > qp->qp_rq_sgl) {
422 		return (DAT_INVALID_PARAMETER);
423 	}
424 
425 	/*
426 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
427 	 * segments.  Note: We skip any SGL with zero size because Tavor
428 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
429 	 * the encoding for zero means a 2GB transfer.  Because of this special
430 	 * encoding in the hardware, we mask the requested length with
431 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
432 	 * zero.)
433 	 */
434 	for (i = 0; i < wr->wr_nds; i++) {
435 		if (wr->wr_sgl[i].ds_len == 0) {
436 			continue;
437 		}
438 
439 		/*
440 		 * Fill in the Data Segment(s) for the receive WQE, using the
441 		 * information contained in the scatter-gather list of the
442 		 * work request.
443 		 */
444 		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
445 		num_ds++;
446 	}
447 	if (i < qp->qp_rq_sgl) {
448 		ibt_wr_ds_t sgl;
449 		sgl.ds_va  = (ib_vaddr_t)0;
450 		sgl.ds_len = (ib_msglen_t)0;
451 		sgl.ds_key = (ibt_lkey_t)HERMON_WQE_SGL_INVALID_LKEY;
452 		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl);
453 	}
454 
455 	/* Return the size of descriptor (in 16-byte chunks) */
456 	*size = qp->qp_rq_wqesz >> 4;
457 
458 	return (DAT_SUCCESS);
459 }
460 
461 /*
462  * dapli_hermon_wqe_srq_build()
463  * Builds the recv WQE for a given ibt_recv_wr_t
464  */
465 static DAT_RETURN
dapli_hermon_wqe_srq_build(ib_srq_handle_t srq,ibt_recv_wr_t * wr,uint64_t * addr)466 dapli_hermon_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
467     uint64_t *addr)
468 {
469 	tavor_hw_wqe_sgl_t	*ds;
470 	ibt_wr_ds_t		end_sgl;
471 	int			i;
472 	int			num_ds;
473 
474 	/* Fill in the Data Segments (SGL) for the Recv WQE */
475 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
476 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
477 	num_ds = 0;
478 
479 	/* Check for valid number of SGL entries */
480 	if (wr->wr_nds > srq->srq_wq_sgl) {
481 		return (DAT_INVALID_PARAMETER);
482 	}
483 
484 	/*
485 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
486 	 * segments.  Note: We skip any SGL with zero size because Tavor
487 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
488 	 * the encoding for zero means a 2GB transfer.  Because of this special
489 	 * encoding in the hardware, we mask the requested length with
490 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
491 	 * zero.)
492 	 */
493 	for (i = 0; i < wr->wr_nds; i++) {
494 		if (wr->wr_sgl[i].ds_len == 0) {
495 			continue;
496 		}
497 
498 		/*
499 		 * Fill in the Data Segment(s) for the receive WQE, using the
500 		 * information contained in the scatter-gather list of the
501 		 * work request.
502 		 */
503 		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
504 		num_ds++;
505 	}
506 
507 	/*
508 	 * For SRQ, if the number of data segments is less than the maximum
509 	 * specified at alloc, then we have to fill in a special "key" entry in
510 	 * the sgl entry after the last valid one in this post request.  We do
511 	 * that here.
512 	 */
513 	if (num_ds < srq->srq_wq_sgl) {
514 		end_sgl.ds_va  = (ib_vaddr_t)0;
515 		end_sgl.ds_len = (ib_msglen_t)0;
516 		end_sgl.ds_key = (ibt_lkey_t)HERMON_WQE_SGL_INVALID_LKEY;
517 		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
518 	}
519 
520 	return (DAT_SUCCESS);
521 }
522 
523 /*
524  * dapli_hermon_cq_peek()
525  * Peeks into a given CQ to check if there are any events that can be
526  * polled. It returns the number of CQEs that can be polled.
527  */
528 static void
dapli_hermon_cq_peek(ib_cq_handle_t cq,int * num_cqe)529 dapli_hermon_cq_peek(ib_cq_handle_t cq, int *num_cqe)
530 {
531 	uint32_t		*cqe;
532 	uint32_t		imm_eth_pkey_cred;
533 	uint32_t		cons_indx;
534 	int			polled_cnt;
535 	uint_t			doorbell_cnt;
536 	uint_t			opcode;
537 
538 	/* Get the consumer index */
539 	cons_indx = cq->cq_consindx & cq_wrap_around_mask;
540 
541 	/* Calculate the pointer to the first CQ entry */
542 	cqe = (uint32_t *)&cq->cq_addr[cons_indx];
543 
544 	/*
545 	 * Count entries in the CQ until we find an entry owned by
546 	 * the hardware.
547 	 */
548 	polled_cnt = 0;
549 	while (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
550 		opcode = HERMON_CQE_OPCODE_GET(cqe);
551 		/* Error CQE map to multiple work completions */
552 		if (opcode == HERMON_CQE_ERR_OPCODE) {
553 			imm_eth_pkey_cred =
554 			    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
555 			doorbell_cnt =
556 			    imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
557 			polled_cnt += (doorbell_cnt + 1);
558 		} else {
559 			polled_cnt++;
560 		}
561 		/* Increment the consumer index */
562 		cons_indx = (cons_indx + 1) & cq_wrap_around_mask;
563 
564 		/* Update the pointer to the next CQ entry */
565 		cqe = (uint32_t *)&cq->cq_addr[cons_indx];
566 	}
567 
568 	*num_cqe = polled_cnt;
569 }
570 
/* Publish the low 24 bits of the consumer index to the CQ doorbell record */
#define	dapli_hermon_cq_update_ci(cq, dbp) \
	(dbp)[0] = HTOBE_32(cq->cq_consindx & 0xFFFFFF)
573 
574 /*
575  * dapli_hermon_cq_resize_helper()
576  * This routine switches from the pre-cq_resize buffer to the new buffer.
577  */
578 static int
dapli_hermon_cq_resize_helper(ib_cq_handle_t cq)579 dapli_hermon_cq_resize_helper(ib_cq_handle_t cq)
580 {
581 	int i;
582 
583 	if ((cq->cq_resize_addr == 0) ||
584 	    (munmap((char *)cq->cq_addr, cq->cq_map_len) < 0)) {
585 		dapl_dbg_log(DAPL_DBG_TYPE_ERR, "cq_resize_helper: "
586 		    "munmap(%p:0x%llx) failed(%d)\n", cq->cq_addr,
587 		    cq->cq_map_len, errno);
588 		return (1);	/* FAILED */
589 	}
590 	cq->cq_addr		= cq->cq_resize_addr;
591 	cq->cq_map_offset	= cq->cq_resize_map_offset;
592 	cq->cq_map_len		= cq->cq_resize_map_len;
593 	cq->cq_size		= cq->cq_resize_size;
594 	cq->cq_cqesz		= cq->cq_resize_cqesz;
595 	cq->cq_resize_addr	= 0;
596 	cq->cq_resize_map_offset = 0;
597 	cq->cq_resize_map_len	= 0;
598 	cq->cq_resize_size	= 0;
599 	cq->cq_resize_cqesz	= 0;
600 	for (i = 0; (1 << i) < cq->cq_size; i++)
601 		;
602 	cq->cq_log_cqsz = i;
603 
604 	cq->cq_consindx++;	/* consume the RESIZE cqe */
605 
606 	return (0);	/* SUCCESS */
607 }
608 
609 /*
610  * dapli_hermon_cq_poll()
611  * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
612  * array that is passed in.
613  */
614 static DAT_RETURN
dapli_hermon_cq_poll(ib_cq_handle_t cq,ibt_wc_t * wc_p,uint_t num_wc,uint_t * num_polled)615 dapli_hermon_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
616     uint_t *num_polled)
617 {
618 	uint32_t		*cqe;
619 	uint32_t		cons_indx;
620 	uint32_t		polled_cnt;
621 	DAT_RETURN		dat_status;
622 	int			status;
623 
624 	/* Get the consumer index */
625 	cons_indx = cq->cq_consindx & cq_wrap_around_mask;
626 
627 	/* Calculate the pointer to the first CQ entry */
628 	cqe = (uint32_t *)&cq->cq_addr[cons_indx];
629 
630 	/*
631 	 * Keep pulling entries from the CQ until we find an entry owned by
632 	 * the hardware.  As long as there the CQE's owned by SW, process
633 	 * each entry by calling dapli_hermon_cq_cqe_consume() and updating the
634 	 * CQ consumer index.  Note:  We only update the consumer index if
635 	 * dapli_hermon_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
636 	 * Otherwise, it indicates that we are going to "recycle" the CQE
637 	 * (probably because it is a error CQE and corresponds to more than one
638 	 * completion).
639 	 */
640 	polled_cnt = 0;
641 	while (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
642 		if (HERMON_CQE_OPCODE_GET(cqe) == HERMON_CQE_RESIZE_OPCODE) {
643 			if (dapli_hermon_cq_resize_helper(cq))
644 				return (DAT_ERROR(DAT_INTERNAL_ERROR, 0));
645 			cons_indx = cq->cq_consindx & cq_wrap_around_mask;
646 			cqe = (uint32_t *)&cq->cq_addr[cons_indx];
647 			continue;
648 		}
649 		status = dapli_hermon_cq_cqe_consume(cq, cqe,
650 		    &wc_p[polled_cnt++]);
651 		if (status == TAVOR_CQ_SYNC_AND_DB) {
652 			/* Reset to hardware ownership is implicit in Hermon */
653 			cq->cq_consindx++;	/* incr the total counter */
654 
655 			/* Increment the consumer index */
656 			cons_indx = (cons_indx + 1) & cq_wrap_around_mask;
657 
658 			/* Update the pointer to the next CQ entry */
659 			cqe = (uint32_t *)&cq->cq_addr[cons_indx];
660 		}
661 
662 		/*
663 		 * If we have run out of space to store work completions,
664 		 * then stop and return the ones we have pulled of the CQ.
665 		 */
666 		if (polled_cnt >= num_wc) {
667 			break;
668 		}
669 	}
670 
671 	dat_status = DAT_SUCCESS;
672 	/*
673 	 * Now we only ring the doorbell (to update the consumer index) if
674 	 * we've actually consumed a CQ entry.  If we have, for example,
675 	 * pulled from a CQE that we are still in the process of "recycling"
676 	 * for error purposes, then we would not update the consumer index.
677 	 */
678 	if (polled_cnt != 0) {
679 		/*
680 		 * Update the consumer index in both the CQ handle and the
681 		 * doorbell record.
682 		 */
683 		dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
684 	} else if (polled_cnt == 0) {
685 		/*
686 		 * If the CQ is empty, we can try to free up some of the WRID
687 		 * list containers.
688 		 */
689 		if (cq->cq_wrid_reap_head)	/* look before leaping */
690 			dapls_tavor_wrid_cq_reap(cq);
691 		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
692 	}
693 
694 	if (num_polled != NULL) {
695 		*num_polled = polled_cnt;
696 	}
697 
698 	return (dat_status);
699 }
700 
701 /*
702  * dapli_hermon_cq_poll_one()
703  * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
704  * that is passed in.  See above for more comments/details.
705  */
706 static DAT_RETURN
dapli_hermon_cq_poll_one(ib_cq_handle_t cq,ibt_wc_t * wc_p)707 dapli_hermon_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
708 {
709 	uint32_t		*cqe;
710 	uint32_t		cons_indx;
711 	DAT_RETURN		dat_status;
712 	int			status;
713 
714 start_over:
715 	/* Get the consumer index */
716 	cons_indx = cq->cq_consindx & cq_wrap_around_mask;
717 
718 	/* Calculate the pointer to the first CQ entry */
719 	cqe = (uint32_t *)&cq->cq_addr[cons_indx];
720 
721 	/*
722 	 * Keep pulling entries from the CQ until we find an entry owned by
723 	 * the hardware.  As long as there the CQE's owned by SW, process
724 	 * each entry by calling dapli_hermon_cq_cqe_consume() and updating the
725 	 * CQ consumer index.  Note:  We only update the consumer index if
726 	 * dapli_hermon_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
727 	 * Otherwise, it indicates that we are going to "recycle" the CQE
728 	 * (probably because it is a error CQE and corresponds to more than one
729 	 * completion).
730 	 */
731 	if (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
732 		if (HERMON_CQE_OPCODE_GET(cqe) == HERMON_CQE_RESIZE_OPCODE) {
733 			if (dapli_hermon_cq_resize_helper(cq))
734 				return (DAT_ERROR(DAT_INTERNAL_ERROR, 0));
735 			goto start_over;
736 		}
737 		status = dapli_hermon_cq_cqe_consume(cq, cqe, wc_p);
738 		if (status == TAVOR_CQ_SYNC_AND_DB) {
739 			/* Reset to hardware ownership is implicit in Hermon */
740 
741 			/* Increment the consumer index */
742 			cq->cq_consindx++;
743 			dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
744 		}
745 		dat_status = DAT_SUCCESS;
746 	} else {
747 		if (cq->cq_wrid_reap_head)	/* look before leaping */
748 			dapls_tavor_wrid_cq_reap(cq);
749 		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
750 	}
751 	return (dat_status);
752 }
753 
754 /*
755  * dapli_hermon_cq_cqe_consume()
756  * Converts a given CQE into a ibt_wc_t object
757  */
758 static int
dapli_hermon_cq_cqe_consume(ib_cq_handle_t cqhdl,uint32_t * cqe,ibt_wc_t * wc)759 dapli_hermon_cq_cqe_consume(ib_cq_handle_t cqhdl, uint32_t *cqe,
760     ibt_wc_t *wc)
761 {
762 	uint_t		flags;
763 	uint_t		type;
764 	uint_t		opcode;
765 	int		status;
766 
767 	/*
768 	 * Determine if this is an "error" CQE by examining "opcode".  If it
769 	 * is an error CQE, then call dapli_hermon_cq_errcqe_consume() and
770 	 * return whatever status it returns.  Otherwise, this is a successful
771 	 * completion.
772 	 */
773 	opcode = HERMON_CQE_OPCODE_GET(cqe);
774 	if (opcode == HERMON_CQE_ERR_OPCODE) {
775 		status = dapli_hermon_cq_errcqe_consume(cqhdl, cqe, wc);
776 		return (status);
777 	}
778 	TAVOR_CQE_WQEADDRSZ_SET(cqe, (HTOBE_32(cqe[6]) >> 10) &
779 	    ~HERMON_WQE_NDS_MASK);
780 
781 	/*
782 	 * Fetch the Work Request ID using the information in the CQE.
783 	 * See tavor_wr.c for more details.
784 	 */
785 	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, (tavor_hw_cqe_t *)cqe,
786 	    HERMON_CQE_SENDRECV_GET(cqe) >> 6, 0, NULL);
787 	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
788 
789 	/*
790 	 * Parse the CQE opcode to determine completion type.  This will set
791 	 * not only the type of the completion, but also any flags that might
792 	 * be associated with it (e.g. whether immediate data is present).
793 	 */
794 	flags = IBT_WC_NO_FLAGS;
795 	if (HERMON_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {
796 
797 		/*
798 		 * Send CQE
799 		 *
800 		 * The following opcodes will not be generated in uDAPL
801 		 * case TAVOR_CQE_SND_RDMAWR_IMM:
802 		 * case TAVOR_CQE_SND_SEND_IMM:
803 		 * case TAVOR_CQE_SND_ATOMIC_CS:
804 		 * case TAVOR_CQE_SND_ATOMIC_FA:
805 		 */
806 		switch (opcode) {
807 		case TAVOR_CQE_SND_RDMAWR:
808 			type = IBT_WRC_RDMAW;
809 			break;
810 
811 		case TAVOR_CQE_SND_SEND:
812 			type = IBT_WRC_SEND;
813 			break;
814 
815 		case TAVOR_CQE_SND_RDMARD:
816 			type = IBT_WRC_RDMAR;
817 			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
818 			break;
819 
820 		case TAVOR_CQE_SND_BIND_MW:
821 			type = IBT_WRC_BIND;
822 			break;
823 
824 		default:
825 			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
826 			return (TAVOR_CQ_SYNC_AND_DB);
827 		}
828 	} else {
829 
830 		/*
831 		 * Receive CQE
832 		 *
833 		 * The following opcodes will not be generated in uDAPL
834 		 *
835 		 * case TAVOR_CQE_RCV_RECV_IMM:
836 		 * case TAVOR_CQE_RCV_RECV_IMM2:
837 		 * case TAVOR_CQE_RCV_RDMAWR_IMM:
838 		 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
839 		 */
840 		switch (opcode) {
841 		case HERMON_CQE_RCV_SEND:
842 			type = IBT_WRC_RECV;
843 			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
844 			break;
845 		default:
846 			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
847 			return (TAVOR_CQ_SYNC_AND_DB);
848 		}
849 	}
850 	wc->wc_type = type;
851 	wc->wc_flags = flags;
852 	/* If we got here, completion status must be success */
853 	wc->wc_status = IBT_WC_SUCCESS;
854 
855 	return (TAVOR_CQ_SYNC_AND_DB);
856 }
857 
858 /*
859  * dapli_hermon_cq_errcqe_consume()
860  */
861 static int
dapli_hermon_cq_errcqe_consume(ib_cq_handle_t cqhdl,uint32_t * cqe,ibt_wc_t * wc)862 dapli_hermon_cq_errcqe_consume(ib_cq_handle_t cqhdl, uint32_t *cqe,
863     ibt_wc_t *wc)
864 {
865 	dapls_tavor_wrid_entry_t	wre;
866 	uint_t			status;
867 	uint_t			send_or_recv;
868 
869 	dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
870 	    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
871 	    TAVOR_CQE_WQEADDRSZ_GET(cqe));
872 
873 	status = ((uint8_t *)cqe)[0x1B];
874 	TAVOR_CQE_WQEADDRSZ_SET(cqe, (HTOBE_32(cqe[6]) >> 10) &
875 	    ~HERMON_WQE_NDS_MASK);
876 	if (HERMON_CQE_SENDRECV_GET(cqe) == 0) {
877 		send_or_recv = 0;
878 	} else {
879 		send_or_recv = 1;
880 	}
881 
882 	/*
883 	 * Fetch the Work Request ID using the information in the CQE.
884 	 * See tavor_wr.c for more details.
885 	 */
886 	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, (tavor_hw_cqe_t *)cqe,
887 	    send_or_recv, 1, &wre);
888 	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
889 
890 	/*
891 	 * Parse the CQE opcode to determine completion type.  We know that
892 	 * the CQE is an error completion, so we extract only the completion
893 	 * status here.
894 	 */
895 	switch (status) {
896 	case TAVOR_CQE_LOC_LEN_ERR:
897 		status = IBT_WC_LOCAL_LEN_ERR;
898 		break;
899 
900 	case TAVOR_CQE_LOC_OP_ERR:
901 		status = IBT_WC_LOCAL_CHAN_OP_ERR;
902 		break;
903 
904 	case TAVOR_CQE_LOC_PROT_ERR:
905 		status = IBT_WC_LOCAL_PROTECT_ERR;
906 		break;
907 
908 	case TAVOR_CQE_WR_FLUSHED_ERR:
909 		status = IBT_WC_WR_FLUSHED_ERR;
910 		break;
911 
912 	case TAVOR_CQE_MW_BIND_ERR:
913 		status = IBT_WC_MEM_WIN_BIND_ERR;
914 		break;
915 
916 	case TAVOR_CQE_BAD_RESPONSE_ERR:
917 		status = IBT_WC_BAD_RESPONSE_ERR;
918 		break;
919 
920 	case TAVOR_CQE_LOCAL_ACCESS_ERR:
921 		status = IBT_WC_LOCAL_ACCESS_ERR;
922 		break;
923 
924 	case TAVOR_CQE_REM_INV_REQ_ERR:
925 		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
926 		break;
927 
928 	case TAVOR_CQE_REM_ACC_ERR:
929 		status = IBT_WC_REMOTE_ACCESS_ERR;
930 		break;
931 
932 	case TAVOR_CQE_REM_OP_ERR:
933 		status = IBT_WC_REMOTE_OP_ERR;
934 		break;
935 
936 	case TAVOR_CQE_TRANS_TO_ERR:
937 		status = IBT_WC_TRANS_TIMEOUT_ERR;
938 		break;
939 
940 	case TAVOR_CQE_RNRNAK_TO_ERR:
941 		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
942 		break;
943 
944 	/*
945 	 * The following error codes are not supported in the Tavor driver
946 	 * as they relate only to Reliable Datagram completion statuses:
947 	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
948 	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
949 	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
950 	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
951 	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
952 	 *    case TAVOR_CQE_LOC_EEC_ERR:
953 	 */
954 
955 	default:
956 		status = IBT_WC_LOCAL_CHAN_OP_ERR;
957 		break;
958 	}
959 	wc->wc_status = status;
960 	wc->wc_type = 0;
961 
962 	/*
963 	 * Consume the CQE
964 	 *    Return status to indicate that doorbell and sync may be
965 	 *    necessary.
966 	 */
967 	return (TAVOR_CQ_SYNC_AND_DB);
968 }
969 
970 /*
971  * dapli_hermon_cq_notify()
972  * This function is used for arming the CQ by ringing the CQ doorbell.
973  *
974  * Note: there is something very subtle here.  This code assumes a very
975  * specific behavior of the kernel driver.  The cmd_sn field of the
976  * arm_dbr is updated by the kernel driver whenever a notification
977  * event for the cq is received.  This code extracts the cmd_sn field
978  * from the arm_dbr to know the right value to use.  The arm_dbr is
979  * always updated atomically so that neither the kernel driver nor this
980  * will get confused about what the other is doing.
981  *
982  * Note: param is not used here.  It is necessary for arming a CQ for
983  * N completions (param is N), but no uDAPL API supports this for now.
984  * Thus, we declare ARGSUSED to make lint happy.
985  */
986 /*ARGSUSED*/
987 static DAT_RETURN
dapli_hermon_cq_notify(ib_cq_handle_t cq,int flags,uint32_t param)988 dapli_hermon_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
989 {
990 	uint32_t	cqnum;
991 	uint32_t	*target;
992 	uint32_t	old_cmd, cmp, new, tmp, cmd_sn;
993 
994 	/*
995 	 * Determine if we are trying to get the next completion or the next
996 	 * "solicited" completion.  Then hit the appropriate doorbell.
997 	 */
998 	cqnum = cq->cq_num;
999 	target = cq->cq_arm_dbp;
1000 retry:
1001 	cmp = *target;
1002 	tmp = HTOBE_32(cmp);
1003 	old_cmd = tmp & (0x7 << 24);
1004 	cmd_sn = tmp & (0x3 << 28);
1005 
1006 	if (flags == IB_NOTIFY_ON_NEXT_COMP) {
1007 		if (old_cmd != HERMON_CQDB_NOTIFY_CQ) {
1008 			new = HTOBE_32(cmd_sn | HERMON_CQDB_NOTIFY_CQ |
1009 			    (cq->cq_consindx & 0xFFFFFF));
1010 			tmp = atomic_cas_32(target, cmp, new);
1011 			if (tmp != cmp)
1012 				goto retry;
1013 			dapli_hermon_cq_doorbell(cq->cq_iauar,
1014 			    HERMON_CQDB_NOTIFY_CQ, cqnum,
1015 			    cmd_sn, cq->cq_consindx);
1016 		} /* else it's already armed */
1017 	} else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
1018 		if (old_cmd != HERMON_CQDB_NOTIFY_CQ &&
1019 		    old_cmd != HERMON_CQDB_NOTIFY_CQ_SOLICIT) {
1020 			new = HTOBE_32(cmd_sn | HERMON_CQDB_NOTIFY_CQ_SOLICIT |
1021 			    (cq->cq_consindx & 0xFFFFFF));
1022 			tmp = atomic_cas_32(target, cmp, new);
1023 			if (tmp != cmp)
1024 				goto retry;
1025 			dapli_hermon_cq_doorbell(cq->cq_iauar,
1026 			    HERMON_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
1027 			    cmd_sn, cq->cq_consindx);
1028 		} /* else it's already armed */
1029 	} else {
1030 		return (DAT_INVALID_PARAMETER);
1031 	}
1032 
1033 	return (DAT_SUCCESS);
1034 }
1035 
1036 /*
1037  * Since uDAPL posts 1 wqe per request, we
1038  * only need to do stores for the last one.
1039  */
1040 static void
dapli_hermon_wqe_headroom(ib_qp_handle_t qp,uint32_t start)1041 dapli_hermon_wqe_headroom(ib_qp_handle_t qp, uint32_t start)
1042 {
1043 	uint32_t *wqe_start, *wqe_top, *wqe_base, qsize, invalue;
1044 	int hdrmwqes, wqesizebytes, sectperwqe, i, j;
1045 
1046 	qsize = qp->qp_sq_numwqe;
1047 	wqesizebytes = qp->qp_sq_wqesz;
1048 	sectperwqe = wqesizebytes >> 6;
1049 	hdrmwqes = qp->qp_sq_headroom;
1050 	wqe_base = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, 0);
1051 	wqe_top = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, qsize);
1052 	wqe_start = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, start);
1053 
1054 	for (i = 0; i < hdrmwqes - 1; i++) {
1055 		wqe_start += sectperwqe * 16;
1056 		if (wqe_start == wqe_top)
1057 			wqe_start = wqe_base;
1058 	}
1059 	invalue = HTOBE_32(*wqe_start);
1060 	invalue |= 0x7FFFFFFF;
1061 	*wqe_start = HTOBE_32(invalue);
1062 	wqe_start += 16;
1063 	for (j = 1; j < sectperwqe; j++) {
1064 		*wqe_start = 0xFFFFFFFF;
1065 		wqe_start += 16;
1066 	}
1067 }
1068 
1069 /*
1070  * dapli_hermon_post_send()
1071  */
1072 /* ARGSUSED */
1073 static DAT_RETURN
dapli_hermon_post_send(DAPL_EP * ep,ibt_send_wr_t * wr,boolean_t ns)1074 dapli_hermon_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
1075 {
1076 	dapls_tavor_wrid_list_hdr_t	*wridlist;
1077 	dapls_tavor_wrid_entry_t	*wre_last;
1078 	uint64_t			*desc;
1079 	uint64_t			*wqe_addr;
1080 	uint32_t			desc_sz;
1081 	uint32_t			wqeaddrsz, signaled_dbd;
1082 	uint32_t			head, tail, next_tail, qsize_msk;
1083 	int				status;
1084 	ib_qp_handle_t			qp;
1085 
1086 	if ((ep->qp_state == IBT_STATE_RESET) ||
1087 	    (ep->qp_state == IBT_STATE_INIT) ||
1088 	    (ep->qp_state == IBT_STATE_RTR)) {
1089 		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1090 		    "post_send: invalid qp_state %d\n", ep->qp_state);
1091 		return (DAT_INVALID_STATE);
1092 	}
1093 
1094 	qp = ep->qp_handle;
1095 
1096 	/* Grab the lock for the WRID list */
1097 	dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1098 	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;
1099 
1100 	/* Save away some initial QP state */
1101 	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
1102 	tail	  = qp->qp_sq_wqhdr->wq_tail;
1103 	head	  = qp->qp_sq_wqhdr->wq_head;
1104 
1105 	/*
1106 	 * Check for "queue full" condition.  If the queue is already full,
1107 	 * then no more WQEs can be posted, return an error
1108 	 */
1109 	if (qp->qp_sq_wqhdr->wq_full != 0) {
1110 		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1111 		return (DAT_INSUFFICIENT_RESOURCES);
1112 	}
1113 
1114 	/*
1115 	 * Increment the "tail index" and check for "queue full" condition.
1116 	 * If we detect that the current work request is going to fill the
1117 	 * work queue, then we mark this condition and continue.
1118 	 */
1119 	next_tail = (tail + 1) & qsize_msk;
1120 	if (next_tail == head) {
1121 		qp->qp_sq_wqhdr->wq_full = 1;
1122 	}
1123 
1124 	/*
1125 	 * Get the user virtual address of the location where the next
1126 	 * Send WQE should be built
1127 	 */
1128 	wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);
1129 
1130 	/*
1131 	 * Call tavor_wqe_send_build() to build the WQE at the given address.
1132 	 * This routine uses the information in the ibt_send_wr_t and
1133 	 * returns the size of the WQE when it returns.
1134 	 */
1135 	status = dapli_hermon_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
1136 	if (status != DAT_SUCCESS) {
1137 		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1138 		return (status);
1139 	}
1140 
1141 	/*
1142 	 * Get the descriptor (io address) corresponding to the location
1143 	 * Send WQE was built.
1144 	 */
1145 	desc = TAVOR_QP_SQ_ENTRY(qp, tail);
1146 
1147 	/*
1148 	 * Add a WRID entry to the WRID list.  Need to calculate the
1149 	 * "wqeaddr" to pass to dapli_tavor_wrid_add_entry().
1150 	 * signaled_dbd is still calculated, but ignored.
1151 	 */
1152 	wqeaddrsz = HERMON_QP_WQEADDRSZ(qp->qp_sq_counter);
1153 
1154 	if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1155 		signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
1156 	}
1157 
1158 	dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
1159 	    signaled_dbd);
1160 
1161 	dapli_hermon_wqe_headroom(qp, next_tail);
1162 	*(uint8_t *)desc ^= 0x80;	/* set owner bit */
1163 
1164 	/*
1165 	 * Now if the WRID tail entry is non-NULL, then this
1166 	 * represents the entry to which we are chaining the
1167 	 * new entries.  Since we are going to ring the
1168 	 * doorbell for this WQE, we want set its "dbd" bit.
1169 	 *
1170 	 * On the other hand, if the tail is NULL, even though
1171 	 * we will have rung the doorbell for the previous WQE
1172 	 * (for the hardware's sake) it is irrelevant to our
1173 	 * purposes (for tracking WRIDs) because we know the
1174 	 * request must have already completed.
1175 	 */
1176 	wre_last = wridlist->wl_wre_old_tail;
1177 	if (wre_last != NULL) {
1178 		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1179 	}
1180 
1181 	/* Update some of the state in the QP */
1182 	qp->qp_sq_lastwqeaddr	 = wqe_addr;
1183 	qp->qp_sq_wqhdr->wq_tail = next_tail;
1184 
1185 	if (desc_sz && qp->qp_ia_bf != NULL) {	/* use Hermon Blueflame */
1186 		uint64_t *bf_dest, *src64;
1187 		uint8_t *src8;
1188 		int i;
1189 
1190 		(void) pthread_spin_lock(&hermon_bf_lock);
1191 
1192 		src8 = (uint8_t *)desc;
1193 		src8[1] = (uint8_t)(qp->qp_sq_counter >> 8);
1194 		src8[2] = (uint8_t)qp->qp_sq_counter;
1195 		src8[4] = (uint8_t)(qp->qp_num >> 16);
1196 		src8[5] = (uint8_t)(qp->qp_num >> 8);
1197 		src8[6] = (uint8_t)qp->qp_num;
1198 
1199 		src64 = (uint64_t *)desc;
1200 		bf_dest = (uint64_t *)((uintptr_t)qp->qp_ia_bf +
1201 		    *qp->qp_ia_bf_toggle);
1202 		*qp->qp_ia_bf_toggle ^= 256;	/* 2 256-byte buffers */
1203 		for (i = 0; i < desc_sz * 2; i += 8) {
1204 			bf_dest[i] = src64[i];
1205 			bf_dest[i + 1] = src64[i + 1];
1206 			bf_dest[i + 2] = src64[i + 2];
1207 			bf_dest[i + 3] = src64[i + 3];
1208 			bf_dest[i + 4] = src64[i + 4];
1209 			bf_dest[i + 5] = src64[i + 5];
1210 			bf_dest[i + 6] = src64[i + 6];
1211 			bf_dest[i + 7] = src64[i + 7];
1212 		}
1213 		(void) pthread_spin_unlock(&hermon_bf_lock);
1214 	} else {
1215 		/* Ring the doorbell */
1216 		dapli_hermon_sq_dbreg(qp->qp_iauar, qp->qp_num);
1217 	}
1218 	qp->qp_sq_counter++;
1219 
1220 	dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1221 
1222 	return (DAT_SUCCESS);
1223 }
1224 
1225 /*
1226  * dapli_hermon_post_recv()
1227  */
1228 /* ARGSUSED */
1229 static DAT_RETURN
dapli_hermon_post_recv(DAPL_EP * ep,ibt_recv_wr_t * wr,boolean_t ns)1230 dapli_hermon_post_recv(DAPL_EP	*ep, ibt_recv_wr_t *wr, boolean_t ns)
1231 {
1232 	dapls_tavor_wrid_list_hdr_t	*wridlist;
1233 	dapls_tavor_wrid_entry_t	*wre_last;
1234 	ib_qp_handle_t			qp;
1235 	DAT_RETURN			status;
1236 	uint64_t			*wqe_addr;
1237 	uint32_t			desc_sz;
1238 	uint32_t			wqeaddrsz;
1239 	uint32_t			head, tail, next_tail, qsize_msk;
1240 
1241 	if (ep->qp_state == IBT_STATE_RESET) {
1242 		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1243 		    "post_recv: invalid qp_state %d\n", ep->qp_state);
1244 		return (DAT_INVALID_STATE);
1245 	}
1246 	qp = ep->qp_handle;
1247 
1248 	/* Grab the lock for the WRID list */
1249 	dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1250 	wridlist  = qp->qp_rq_wqhdr->wq_wrid_post;
1251 
1252 	/* Save away some initial QP state */
1253 	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
1254 	tail	  = qp->qp_rq_wqhdr->wq_tail;
1255 	head	  = qp->qp_rq_wqhdr->wq_head;
1256 
1257 	/*
1258 	 * For the ibt_recv_wr_t passed in, parse the request and build a
1259 	 * Recv WQE. Link the WQE with the previous WQE and ring the
1260 	 * door bell.
1261 	 */
1262 
1263 	/*
1264 	 * Check for "queue full" condition.  If the queue is already full,
1265 	 * then no more WQEs can be posted. So return an error.
1266 	 */
1267 	if (qp->qp_rq_wqhdr->wq_full != 0) {
1268 		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1269 		return (DAT_INSUFFICIENT_RESOURCES);
1270 	}
1271 
1272 	/*
1273 	 * Increment the "tail index" and check for "queue
1274 	 * full" condition.  If we detect that the current
1275 	 * work request is going to fill the work queue, then
1276 	 * we mark this condition and continue.
1277 	 */
1278 	next_tail = (tail + 1) & qsize_msk;
1279 	if (next_tail == head) {
1280 		qp->qp_rq_wqhdr->wq_full = 1;
1281 	}
1282 
1283 	/* The user virtual address of the WQE to be built */
1284 	wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);
1285 
1286 	/*
1287 	 * Call tavor_wqe_recv_build() to build the WQE at the given
1288 	 * address. This routine uses the information in the
1289 	 * ibt_recv_wr_t and returns the size of the WQE.
1290 	 */
1291 	status = dapli_hermon_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
1292 	if (status != DAT_SUCCESS) {
1293 		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1294 		return (DAT_INTERNAL_ERROR);
1295 	}
1296 
1297 	/*
1298 	 * Add a WRID entry to the WRID list.  Need to calculate the
1299 	 * "wqeaddr" and "signaled_dbd" values to pass to
1300 	 * dapli_tavor_wrid_add_entry().
1301 	 * Note: all Recv WQEs are essentially "signaled"
1302 	 */
1303 	wqeaddrsz = HERMON_QP_WQEADDRSZ(qp->qp_rq_counter);
1304 	dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
1305 	    (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);
1306 
1307 	/*
1308 	 * Now if the WRID tail entry is non-NULL, then this
1309 	 * represents the entry to which we are chaining the
1310 	 * new entries.  Since we are going to ring the
1311 	 * doorbell for this WQE, we want set its "dbd" bit.
1312 	 *
1313 	 * On the other hand, if the tail is NULL, even though
1314 	 * we will have rung the doorbell for the previous WQE
1315 	 * (for the hardware's sake) it is irrelevant to our
1316 	 * purposes (for tracking WRIDs) because we know the
1317 	 * request must have already completed.
1318 	 */
1319 	wre_last = wridlist->wl_wre_old_tail;
1320 	if (wre_last != NULL) {
1321 		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1322 	}
1323 
1324 	/* Update some of the state in the QP */
1325 	qp->qp_rq_lastwqeaddr	 = wqe_addr;
1326 	qp->qp_rq_wqhdr->wq_tail = next_tail;
1327 
1328 	/* Update the doorbell record */
1329 	qp->qp_rq_counter++;
1330 	(qp->qp_rq_dbp)[0] = HTOBE_32(qp->qp_rq_counter);
1331 
1332 	dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1333 
1334 	return (DAT_SUCCESS);
1335 }
1336 
1337 /*
1338  * dapli_hermon_post_srq()
1339  */
1340 /* ARGSUSED */
1341 static DAT_RETURN
dapli_hermon_post_srq(DAPL_SRQ * srqp,ibt_recv_wr_t * wr,boolean_t ns)1342 dapli_hermon_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
1343 {
1344 	ib_srq_handle_t			srq;
1345 	DAT_RETURN			status;
1346 	uint32_t			desc;
1347 	uint64_t			*wqe_addr;
1348 	uint32_t			head, next_head, qsize_msk;
1349 	uint32_t			wqe_index;
1350 
1351 
1352 	srq = srqp->srq_handle;
1353 
1354 	/* Grab the lock for the WRID list */
1355 	dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);
1356 
1357 	/*
1358 	 * For the ibt_recv_wr_t passed in, parse the request and build a
1359 	 * Recv WQE. Link the WQE with the previous WQE and ring the
1360 	 * door bell.
1361 	 */
1362 
1363 	/*
1364 	 * Check for "queue full" condition.  If the queue is already full,
1365 	 * ie. there are no free entries, then no more WQEs can be posted.
1366 	 * So return an error.
1367 	 */
1368 	if (srq->srq_wridlist->wl_freel_entries == 0) {
1369 		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1370 		return (DAT_INSUFFICIENT_RESOURCES);
1371 	}
1372 
1373 	/* Save away some initial SRQ state */
1374 	qsize_msk = srq->srq_wridlist->wl_size - 1;
1375 	head	  = srq->srq_wridlist->wl_freel_head;
1376 
1377 	next_head = (head + 1) & qsize_msk;
1378 
1379 	/* Get the descriptor (IO Address) of the WQE to be built */
1380 	desc = srq->srq_wridlist->wl_free_list[head];
1381 
1382 	wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
1383 	    srq->srq_wq_wqesz);
1384 
1385 	/* The user virtual address of the WQE to be built */
1386 	wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);
1387 
1388 	/*
1389 	 * Call dapli_hermon_wqe_srq_build() to build the WQE at the given
1390 	 * address. This routine uses the information in the
1391 	 * ibt_recv_wr_t and returns the size of the WQE.
1392 	 */
1393 	status = dapli_hermon_wqe_srq_build(srq, wr, wqe_addr);
1394 	if (status != DAT_SUCCESS) {
1395 		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1396 		return (status);
1397 	}
1398 
1399 	/*
1400 	 * Add a WRID entry to the WRID list.
1401 	 */
1402 	dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);
1403 
1404 #if 0
1405 	if (srq->srq_wq_lastwqeindex == -1) {
1406 		last_wqe_addr = NULL;
1407 	} else {
1408 		last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
1409 		    srq->srq_wq_lastwqeindex);
1410 	}
1411 	/*
1412 	 * Now link the chain to the old chain (if there was one)
1413 	 * and update the wqe_counter in the doorbell record.
1414 	 */
1415 XXX
1416 	dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);
1417 #endif
1418 
1419 	/* Update some of the state in the SRQ */
1420 	srq->srq_wq_lastwqeindex	 = wqe_index;
1421 	srq->srq_wridlist->wl_freel_head = next_head;
1422 	srq->srq_wridlist->wl_freel_entries--;
1423 	dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
1424 	    srq->srq_wridlist->wl_size);
1425 
1426 	/* Update the doorbell record */
1427 	srq->srq_counter++;
1428 	(srq->srq_dbp)[0] = HTOBE_32(srq->srq_counter);
1429 
1430 	dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1431 
1432 	return (DAT_SUCCESS);
1433 }
1434 
1435 /*
1436  * dapli_hermon_cq_srq_entries_flush()
1437  */
1438 static void
dapli_hermon_cq_srq_entries_flush(ib_qp_handle_t qp)1439 dapli_hermon_cq_srq_entries_flush(ib_qp_handle_t qp)
1440 {
1441 	ib_cq_handle_t		cq;
1442 	dapls_tavor_workq_hdr_t	*wqhdr;
1443 	tavor_hw_cqe_t		*cqe;
1444 	tavor_hw_cqe_t		*next_cqe;
1445 	uint32_t		cons_indx, tail_cons_indx;
1446 	uint32_t		new_indx, check_indx, indx;
1447 	int			cqe_qpnum, cqe_type;
1448 	int			outstanding_cqes, removed_cqes;
1449 	int			i;
1450 
1451 	/* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */
1452 
1453 	cq = qp->qp_rq_cqhdl;
1454 	wqhdr = qp->qp_rq_wqhdr;
1455 
1456 	dapl_os_assert(wqhdr->wq_wrid_post != NULL);
1457 	dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);
1458 
1459 	/* Get the consumer index */
1460 	cons_indx = cq->cq_consindx;
1461 
1462 	/* Calculate the pointer to the first CQ entry */
1463 	cqe = &cq->cq_addr[cons_indx];
1464 
1465 	/*
1466 	 * Loop through the CQ looking for entries owned by software.  If an
1467 	 * entry is owned by software then we increment an 'outstanding_cqes'
1468 	 * count to know how many entries total we have on our CQ.  We use this
1469 	 * value further down to know how many entries to loop through looking
1470 	 * for our same QP number.
1471 	 */
1472 	outstanding_cqes = 0;
1473 	tail_cons_indx = cons_indx;
1474 	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
1475 		/* increment total cqes count */
1476 		outstanding_cqes++;
1477 
1478 		/* increment the consumer index */
1479 		tail_cons_indx = (tail_cons_indx + 1) & cq_wrap_around_mask;
1480 
1481 		/* update the pointer to the next cq entry */
1482 		cqe = &cq->cq_addr[tail_cons_indx];
1483 	}
1484 
1485 	/*
1486 	 * Using the 'tail_cons_indx' that was just set, we now know how many
1487 	 * total CQEs possible there are.  Set the 'check_indx' and the
1488 	 * 'new_indx' to the last entry identified by 'tail_cons_indx'
1489 	 */
1490 	check_indx = new_indx = (tail_cons_indx - 1) & cq_wrap_around_mask;
1491 
1492 	for (i = 0; i < outstanding_cqes; i++) {
1493 		cqe = &cq->cq_addr[check_indx];
1494 
1495 		/* Grab QP number from CQE */
1496 		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
1497 		cqe_type = HERMON_CQE_SENDRECV_GET(cqe);
1498 
1499 		/*
1500 		 * If the QP number is the same in the CQE as the QP that we
1501 		 * have on this SRQ, then we must free up the entry off the
1502 		 * SRQ.  We also make sure that the completion type is of the
1503 		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
1504 		 * this CQ will be left as-is.  The handling of returning
1505 		 * entries back to HW ownership happens further down.
1506 		 */
1507 		if (cqe_qpnum == qp->qp_num &&
1508 		    cqe_type == TAVOR_COMPLETION_RECV) {
1509 			/* Add back to SRQ free list */
1510 			(void) dapli_tavor_wrid_find_match_srq(
1511 			    wqhdr->wq_wrid_post, cqe);
1512 		} else {
1513 			/* Do Copy */
1514 			if (check_indx != new_indx) {
1515 				next_cqe = &cq->cq_addr[new_indx];
1516 				/*
1517 				 * Copy the CQE into the "next_cqe"
1518 				 * pointer.
1519 				 */
1520 				(void) dapl_os_memcpy(next_cqe, cqe,
1521 				    sizeof (tavor_hw_cqe_t));
1522 			}
1523 			new_indx = (new_indx - 1) & cq_wrap_around_mask;
1524 		}
1525 		/* Move index to next CQE to check */
1526 		check_indx = (check_indx - 1) & cq_wrap_around_mask;
1527 	}
1528 
1529 	/* Initialize removed cqes count */
1530 	removed_cqes = 0;
1531 
1532 	/* If an entry was removed */
1533 	if (check_indx != new_indx) {
1534 
1535 		/*
1536 		 * Set current pointer back to the beginning consumer index.
1537 		 * At this point, all unclaimed entries have been copied to the
1538 		 * index specified by 'new_indx'.  This 'new_indx' will be used
1539 		 * as the new consumer index after we mark all freed entries as
1540 		 * having HW ownership.  We do that here.
1541 		 */
1542 
1543 		/* Loop through all entries until we reach our new pointer */
1544 		for (indx = cons_indx; indx <= new_indx;
1545 		    indx = (indx + 1) & cq_wrap_around_mask) {
1546 			removed_cqes++;
1547 			cqe = &cq->cq_addr[indx];
1548 
1549 			/* Reset entry to hardware ownership */
1550 			TAVOR_CQE_OWNER_SET_HW(cqe);
1551 		}
1552 	}
1553 
1554 	/*
1555 	 * Update consumer index to be the 'new_indx'.  This moves it past all
1556 	 * removed entries.  Because 'new_indx' is pointing to the last
1557 	 * previously valid SW owned entry, we add 1 to point the cons_indx to
1558 	 * the first HW owned entry.
1559 	 */
1560 	cons_indx = (new_indx + 1) & cq_wrap_around_mask;
1561 
1562 	/*
1563 	 * Now we only ring the doorbell (to update the consumer index) if
1564 	 * we've actually consumed a CQ entry.  If we found no QP number
1565 	 * matches above, then we would not have removed anything.  So only if
1566 	 * something was removed do we ring the doorbell.
1567 	 */
1568 	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1569 		/*
1570 		 * Update the consumer index in both the CQ handle and the
1571 		 * doorbell record.
1572 		 */
1573 		cq->cq_consindx = cons_indx;
1574 		dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
1575 	}
1576 }
1577 
1578 static void
dapli_hermon_rq_prelink(caddr_t first,uint32_t desc_off,uint32_t wqesz,uint32_t numwqe,uint32_t nds)1579 dapli_hermon_rq_prelink(caddr_t first, uint32_t desc_off, uint32_t wqesz,
1580     uint32_t numwqe, uint32_t nds)
1581 {
1582 	int i;
1583 	uint32_t *p = (uint32_t *)(uintptr_t)first;
1584 	uint32_t off = desc_off;
1585 	uint32_t pincr = wqesz / sizeof (uint32_t);
1586 	ibt_wr_ds_t sgl;
1587 
1588 	sgl.ds_va = (ib_vaddr_t)0;
1589 	sgl.ds_key = HERMON_WQE_SGL_INVALID_LKEY;
1590 	sgl.ds_len = (ib_msglen_t)0;
1591 
1592 	for (i = 0; i < numwqe - 1; i++, p += pincr) {
1593 		off += wqesz;
1594 		p[0] = HTOBE_32(off);	/* link curr to next */
1595 		p[1] = nds;		/* nds is 0 for SRQ */
1596 		TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
1597 	}
1598 	p[0] = HTOBE_32(desc_off); /* link last to first */
1599 	p[1] = nds;
1600 	TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
1601 }
1602 
1603 static void
dapli_hermon_sq_init(caddr_t first,uint32_t wqesz,uint32_t numwqe)1604 dapli_hermon_sq_init(caddr_t first, uint32_t wqesz, uint32_t numwqe)
1605 {
1606 	int i, j;
1607 	uint64_t *wqe = (uint64_t *)(uintptr_t)first;
1608 
1609 	for (i = 0; i < numwqe; i++) {
1610 		for (j = 0; j < wqesz; j += 64, wqe += 8)
1611 			*(uint32_t *)wqe = 0xFFFFFFFF;
1612 	}
1613 }
1614 
1615 static void
dapli_hermon_qp_init(ib_qp_handle_t qp)1616 dapli_hermon_qp_init(ib_qp_handle_t qp)
1617 {
1618 	dapli_hermon_sq_init(qp->qp_sq_buf, qp->qp_sq_wqesz, qp->qp_sq_numwqe);
1619 	qp->qp_rq_counter = 0;
1620 	qp->qp_sq_counter = 0;
1621 }
1622 
1623 static void
dapli_hermon_cq_init(ib_cq_handle_t cq)1624 dapli_hermon_cq_init(ib_cq_handle_t cq)
1625 {
1626 	uint32_t i;
1627 
1628 	(cq->cq_arm_dbp)[0] = HTOBE_32(1 << 28);
1629 	for (i = 0; (1 << i) < cq->cq_size; i++)
1630 		;
1631 	cq->cq_log_cqsz = i;
1632 	cq->cq_consindx = 0;
1633 
1634 	/* cq_resize -- needs testing */
1635 }
1636 
1637 static void
dapli_hermon_srq_init(ib_srq_handle_t srq)1638 dapli_hermon_srq_init(ib_srq_handle_t srq)
1639 {
1640 	/* pre-link the whole shared receive queue */
1641 	dapli_hermon_rq_prelink(srq->srq_addr, srq->srq_wq_desc_addr,
1642 	    srq->srq_wq_wqesz, srq->srq_wq_numwqe, 0);
1643 	srq->srq_counter = 0;
1644 
1645 	/* needs testing */
1646 }
1647 
1648 void
dapls_init_funcs_hermon(DAPL_HCA * hca_ptr)1649 dapls_init_funcs_hermon(DAPL_HCA *hca_ptr)
1650 {
1651 	hca_ptr->post_send = dapli_hermon_post_send;
1652 	hca_ptr->post_recv = dapli_hermon_post_recv;
1653 	hca_ptr->post_srq = dapli_hermon_post_srq;
1654 	hca_ptr->cq_peek = dapli_hermon_cq_peek;
1655 	hca_ptr->cq_poll = dapli_hermon_cq_poll;
1656 	hca_ptr->cq_poll_one = dapli_hermon_cq_poll_one;
1657 	hca_ptr->cq_notify = dapli_hermon_cq_notify;
1658 	hca_ptr->srq_flush = dapli_hermon_cq_srq_entries_flush;
1659 	hca_ptr->qp_init = dapli_hermon_qp_init;
1660 	hca_ptr->cq_init = dapli_hermon_cq_init;
1661 	hca_ptr->srq_init = dapli_hermon_srq_init;
1662 	hca_ptr->hermon_resize_cq = 1;
1663 
1664 	(void) pthread_spin_init(&hermon_bf_lock, 0);
1665 }
1666