xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_buf.c (revision cb6207858a9fcc2feaee22e626912fba281ac969)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 #include <sys/ib/clients/rds/rds_kstat.h>
82 
83 /*
84  * This File contains the buffer management code
85  */
86 
/*
 * Trace the current values of the RDS tunables through RDS_DPRINTF3.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; without it only the first trace call would be
 * governed by an unbraced if/else that invokes the macro.
 */
#define	DUMP_USER_PARAMS()	\
	do { \
		RDS_DPRINTF3(LABEL, "UserBufferSize = %d", UserBufferSize); \
		RDS_DPRINTF3(LABEL, "RdsPktSize = %d", RdsPktSize); \
		RDS_DPRINTF3(LABEL, "MaxRecvMemory = %d", MaxRecvMemory); \
		RDS_DPRINTF3(LABEL, "MaxDataSendBuffers = %d", \
		    MaxDataSendBuffers); \
		RDS_DPRINTF3(LABEL, "MaxDataRecvBuffers = %d", \
		    MaxDataRecvBuffers); \
		RDS_DPRINTF3(LABEL, "MaxCtrlSendBuffers = %d", \
		    MaxCtrlSendBuffers); \
		RDS_DPRINTF3(LABEL, "MaxCtrlRecvBuffers = %d", \
		    MaxCtrlRecvBuffers); \
		RDS_DPRINTF3(LABEL, "DataRecvBufferLWM = %d", \
		    DataRecvBufferLWM); \
		RDS_DPRINTF3(LABEL, "PendingRxPktsHWM = %d", \
		    PendingRxPktsHWM); \
		RDS_DPRINTF3(LABEL, "MinRnrRetry = %d", MinRnrRetry); \
	} while (0)
98 
99 static void
100 rds_free_mblk(char *arg)
101 {
102 	rds_buf_t *bp = (rds_buf_t *)(uintptr_t)arg;
103 
104 	/* Free the recv buffer */
105 	RDS_DPRINTF4("rds_free_mblk", "Enter: BP(%p)", bp);
106 	ASSERT(bp->buf_state == RDS_RCVBUF_ONSOCKQ);
107 	rds_free_recv_buf(bp, 1);
108 	RDS_DECR_RXPKTS_PEND(1);
109 	RDS_DPRINTF4("rds_free_mblk", "Return: BP(%p)", bp);
110 }
111 
112 void
113 rds_free_recv_caches(rds_state_t *statep)
114 {
115 	rds_hca_t	*hcap;
116 	int		ret;
117 
118 	RDS_DPRINTF4("rds_free_recv_caches", "Enter");
119 
120 	mutex_enter(&rds_dpool.pool_lock);
121 	if (rds_dpool.pool_memp == NULL) {
122 		RDS_DPRINTF2("rds_free_recv_caches", "Caches are empty");
123 		mutex_exit(&rds_dpool.pool_lock);
124 		return;
125 	}
126 
127 	/*
128 	 * All buffers must have been freed as all sessions are closed
129 	 * and destroyed
130 	 */
131 	ASSERT(rds_dpool.pool_nbusy == 0);
132 	RDS_DPRINTF2("rds_free_recv_caches", "Data Pool has "
133 	    "pending buffers: %d", rds_dpool.pool_nbusy);
134 	while (rds_dpool.pool_nbusy != 0) {
135 		mutex_exit(&rds_dpool.pool_lock);
136 		delay(drv_usectohz(1000000));
137 		mutex_enter(&rds_dpool.pool_lock);
138 	}
139 
140 	hcap = statep->rds_hcalistp;
141 	while (hcap != NULL) {
142 		if (hcap->hca_mrhdl != NULL) {
143 			ret = ibt_deregister_mr(hcap->hca_hdl,
144 			    hcap->hca_mrhdl);
145 			if (ret == IBT_SUCCESS) {
146 				hcap->hca_mrhdl = NULL;
147 				hcap->hca_lkey = 0;
148 				hcap->hca_rkey = 0;
149 			} else {
150 				RDS_DPRINTF2(LABEL, "ibt_deregister_mr "
151 				    "failed: %d, mrhdl: 0x%p", ret,
152 				    hcap->hca_mrhdl);
153 			}
154 		}
155 		hcap = hcap->hca_nextp;
156 	}
157 
158 	kmem_free(rds_dpool.pool_bufmemp, (rds_dpool.pool_nbuffers +
159 	    rds_cpool.pool_nbuffers) * sizeof (rds_buf_t));
160 	rds_dpool.pool_bufmemp = NULL;
161 
162 	kmem_free(rds_dpool.pool_memp, rds_dpool.pool_memsize);
163 	rds_dpool.pool_memp = NULL;
164 
165 	mutex_exit(&rds_dpool.pool_lock);
166 
167 	RDS_DPRINTF4("rds_free_recv_caches", "Return");
168 }
169 
170 int
171 rds_init_recv_caches(rds_state_t *statep)
172 {
173 	uint8_t		*mp;
174 	rds_buf_t	*bp;
175 	rds_hca_t	*hcap;
176 	uint32_t	nsessions;
177 	uint_t		ix;
178 	uint_t		ndatarx, nctrlrx;
179 	uint8_t		*memp;
180 	uint_t		memsize, nbuf;
181 	rds_buf_t	*bufmemp;
182 	ibt_mr_attr_t	mem_attr;
183 	ibt_mr_desc_t	mem_desc;
184 	int		ret;
185 
186 	RDS_DPRINTF4("rds_init_recv_caches", "Enter");
187 
188 	DUMP_USER_PARAMS();
189 
190 	mutex_enter(&rds_dpool.pool_lock);
191 	if (rds_dpool.pool_memp != NULL) {
192 		RDS_DPRINTF2("rds_init_recv_caches", "Pools are already "
193 		    "initialized");
194 		mutex_exit(&rds_dpool.pool_lock);
195 		return (0);
196 	}
197 
198 	/* Max number of receive buffers on the system */
199 	ndatarx = (MaxRecvMemory * 1024)/UserBufferSize;
200 
201 	/*
202 	 * High water mark for the receive buffers in the system. If the
203 	 * number of buffers used crosses this mark then all sockets in
204 	 * would be stalled. The port quota for the sockets is set based
205 	 * on this limit.
206 	 */
207 	rds_rx_pkts_pending_hwm = (PendingRxPktsHWM * ndatarx)/100;
208 
209 	/* nsessions can never be less than 1 */
210 	nsessions = ndatarx/MaxDataRecvBuffers;
211 	nctrlrx = (nsessions + 1) * MaxCtrlRecvBuffers;
212 
213 	RDS_DPRINTF3(LABEL, "Number of Possible Sessions: %d", nsessions);
214 
215 	/* Add the hdr */
216 	RdsPktSize = UserBufferSize + RDS_DATA_HDR_SZ;
217 
218 	memsize = (ndatarx * RdsPktSize) + (nctrlrx * RDS_CTRLPKT_SIZE);
219 	nbuf = ndatarx + nctrlrx;
220 	RDS_DPRINTF3(LABEL, "RDS Buffer Pool Memory: %lld", memsize);
221 	RDS_DPRINTF3(LABEL, "Total Buffers: %d", nbuf);
222 
223 	memp = (uint8_t *)kmem_zalloc(memsize, KM_NOSLEEP);
224 	if (memp == NULL) {
225 		RDS_DPRINTF1(LABEL, "RDS Memory allocation failed");
226 		mutex_exit(&rds_dpool.pool_lock);
227 		return (-1);
228 	}
229 
230 	RDS_DPRINTF3(LABEL, "RDS Buffer Entries Memory: %lld",
231 	    nbuf * sizeof (rds_buf_t));
232 
233 	/* allocate memory for buffer entries */
234 	bufmemp = (rds_buf_t *)kmem_zalloc(nbuf * sizeof (rds_buf_t),
235 	    KM_SLEEP);
236 
237 	/* register the memory with all HCAs */
238 	mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)memp;
239 	mem_attr.mr_len = memsize;
240 	mem_attr.mr_as = NULL;
241 	mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
242 
243 	hcap = statep->rds_hcalistp;
244 	while (hcap != NULL) {
245 		ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl,
246 		    &mem_attr, &hcap->hca_mrhdl, &mem_desc);
247 		if (ret != IBT_SUCCESS) {
248 			RDS_DPRINTF2(LABEL, "ibt_register_mr failed: %d", ret);
249 			return (-1);
250 		}
251 
252 		hcap->hca_lkey = mem_desc.md_lkey;
253 		hcap->hca_rkey = mem_desc.md_rkey;
254 
255 		hcap = hcap->hca_nextp;
256 	}
257 
258 	/* Initialize data pool */
259 	rds_dpool.pool_memp = memp;
260 	rds_dpool.pool_memsize = memsize;
261 	rds_dpool.pool_bufmemp = bufmemp;
262 	rds_dpool.pool_nbuffers = ndatarx;
263 	rds_dpool.pool_nbusy = 0;
264 	rds_dpool.pool_nfree = ndatarx;
265 
266 	/* chain the buffers */
267 	mp = memp;
268 	bp = bufmemp;
269 	for (ix = 0; ix < ndatarx; ix++) {
270 		bp[ix].buf_nextp = &bp[ix + 1];
271 		bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
272 		bp[ix].buf_state = RDS_RCVBUF_FREE;
273 		bp[ix].buf_frtn.free_func = rds_free_mblk;
274 		bp[ix].buf_frtn.free_arg = (char *)&bp[ix];
275 		mp = mp + RdsPktSize;
276 	}
277 	bp[ndatarx - 1].buf_nextp = NULL;
278 	rds_dpool.pool_headp = &bp[0];
279 	rds_dpool.pool_tailp = &bp[ndatarx - 1];
280 
281 	/* Initialize ctrl pool */
282 	rds_cpool.pool_nbuffers = nctrlrx;
283 	rds_cpool.pool_nbusy = 0;
284 	rds_cpool.pool_nfree = nctrlrx;
285 
286 	/* chain the buffers */
287 	for (ix = ndatarx; ix < nbuf - 1; ix++) {
288 		bp[ix].buf_nextp = &bp[ix + 1];
289 		bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
290 		mp = mp + RDS_CTRLPKT_SIZE;
291 	}
292 	bp[nbuf - 1].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
293 	bp[nbuf - 1].buf_nextp = NULL;
294 	rds_cpool.pool_headp = &bp[ndatarx];
295 	rds_cpool.pool_tailp = &bp[nbuf - 1];
296 
297 	mutex_exit(&rds_dpool.pool_lock);
298 
299 	RDS_DPRINTF3(LABEL, "rdsmemp start: %p end: %p", memp, mp);
300 	RDS_DPRINTF4("rds_init_recv_caches", "Return");
301 	return (0);
302 }
303 
304 void
305 rds_free_send_pool(rds_ep_t *ep)
306 {
307 	rds_bufpool_t   *pool;
308 	rds_hca_t	*hcap;
309 	int		ret;
310 
311 	pool = &ep->ep_sndpool;
312 
313 	mutex_enter(&pool->pool_lock);
314 	if (pool->pool_memp == NULL) {
315 		mutex_exit(&pool->pool_lock);
316 		RDS_DPRINTF2("rds_free_send_pool",
317 		    "EP(%p) DOUBLE Free on Send Pool", ep);
318 		return;
319 	}
320 
321 	/* get the hcap for the HCA hosting this channel */
322 	hcap = rds_get_hcap(rdsib_statep, ep->ep_hca_guid);
323 	if (hcap == NULL) {
324 		RDS_DPRINTF2("rds_free_send_pool", "HCA (0x%llx) not found",
325 		    ep->ep_hca_guid);
326 	} else {
327 		ret = ibt_deregister_mr(hcap->hca_hdl, ep->ep_snd_mrhdl);
328 		if (ret != IBT_SUCCESS) {
329 			RDS_DPRINTF2(LABEL,
330 			    "ibt_deregister_mr failed: %d, mrhdl: 0x%p",
331 			    ret, ep->ep_snd_mrhdl);
332 		}
333 
334 		if (ep->ep_ack_addr) {
335 			ret = ibt_deregister_mr(hcap->hca_hdl, ep->ep_ackhdl);
336 			if (ret != IBT_SUCCESS) {
337 				RDS_DPRINTF2(LABEL,
338 				    "ibt_deregister_mr ackhdl failed: %d, "
339 				    "mrhdl: 0x%p", ret, ep->ep_ackhdl);
340 			}
341 
342 			kmem_free((void *)ep->ep_ack_addr, sizeof (uintptr_t));
343 			ep->ep_ack_addr = NULL;
344 		}
345 	}
346 
347 	kmem_free(pool->pool_memp, pool->pool_memsize);
348 	kmem_free(pool->pool_bufmemp,
349 	    pool->pool_nbuffers * sizeof (rds_buf_t));
350 	pool->pool_memp = NULL;
351 	pool->pool_bufmemp = NULL;
352 	mutex_exit(&pool->pool_lock);
353 }
354 
355 int
356 rds_init_send_pool(rds_ep_t *ep)
357 {
358 	uint8_t		*mp;
359 	rds_buf_t	*bp;
360 	rds_hca_t	*hcap;
361 	uint_t		ix, rcv_len;
362 	ibt_mr_attr_t   mem_attr;
363 	ibt_mr_desc_t   mem_desc;
364 	uint8_t		*memp;
365 	rds_buf_t	*bufmemp;
366 	uintptr_t	ack_addr = NULL;
367 	uint_t		memsize;
368 	uint_t		nbuf;
369 	rds_bufpool_t   *spool;
370 	rds_data_hdr_t	*pktp;
371 	int		ret;
372 
373 	RDS_DPRINTF2("rds_init_send_pool", "Enter");
374 
375 	spool = &ep->ep_sndpool;
376 
377 	ASSERT(spool->pool_memp == NULL);
378 
379 	/* get the hcap for the HCA hosting this channel */
380 	hcap = rds_get_hcap(rdsib_statep, ep->ep_hca_guid);
381 	if (hcap == NULL) {
382 		RDS_DPRINTF2("rds_init_send_pool", "HCA (0x%llx) not found",
383 		    ep->ep_hca_guid);
384 		return (-1);
385 	}
386 
387 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
388 		spool->pool_nbuffers = MaxDataSendBuffers;
389 		spool->pool_nbusy = 0;
390 		spool->pool_nfree = MaxDataSendBuffers;
391 		memsize = (MaxDataSendBuffers * RdsPktSize) +
392 		    sizeof (uintptr_t);
393 		rcv_len = RdsPktSize;
394 	} else {
395 		spool->pool_nbuffers = MaxCtrlSendBuffers;
396 		spool->pool_nbusy = 0;
397 		spool->pool_nfree = MaxCtrlSendBuffers;
398 		memsize = MaxCtrlSendBuffers * RDS_CTRLPKT_SIZE;
399 		rcv_len = RDS_CTRLPKT_SIZE;
400 	}
401 	nbuf = spool->pool_nbuffers;
402 
403 	RDS_DPRINTF3(LABEL, "RDS Send Pool Memory: %lld", memsize);
404 
405 	memp = (uint8_t *)kmem_zalloc(memsize, KM_NOSLEEP);
406 	if (memp == NULL) {
407 		RDS_DPRINTF1(LABEL, "RDS Send Memory allocation failed");
408 		return (-1);
409 	}
410 
411 	RDS_DPRINTF3(LABEL, "RDS Buffer Entries Memory: %lld",
412 	    nbuf * sizeof (rds_buf_t));
413 
414 	/* allocate memory for buffer entries */
415 	bufmemp = (rds_buf_t *)kmem_zalloc(nbuf * sizeof (rds_buf_t),
416 	    KM_SLEEP);
417 
418 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
419 		ack_addr = (uintptr_t)kmem_zalloc(sizeof (uintptr_t), KM_SLEEP);
420 
421 		/* register the memory with the HCA for this channel */
422 		mem_attr.mr_vaddr = (ib_vaddr_t)ack_addr;
423 		mem_attr.mr_len = sizeof (uintptr_t);
424 		mem_attr.mr_as = NULL;
425 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
426 		    IBT_MR_ENABLE_REMOTE_WRITE;
427 
428 		ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl,
429 		    &mem_attr, &ep->ep_ackhdl, &mem_desc);
430 		if (ret != IBT_SUCCESS) {
431 			RDS_DPRINTF2("rds_init_send_pool",
432 			    "EP(%p): ibt_register_mr for ack failed: %d",
433 			    ep, ret);
434 			kmem_free(memp, memsize);
435 			kmem_free(bufmemp, nbuf * sizeof (rds_buf_t));
436 			kmem_free((void *)ack_addr, sizeof (uintptr_t));
437 			return (-1);
438 		}
439 		ep->ep_ack_rkey = mem_desc.md_rkey;
440 		ep->ep_ack_addr = ack_addr;
441 	}
442 
443 	/* register the memory with the HCA for this channel */
444 	mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)memp;
445 	mem_attr.mr_len = memsize;
446 	mem_attr.mr_as = NULL;
447 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
448 
449 	ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl,
450 	    &mem_attr, &ep->ep_snd_mrhdl, &mem_desc);
451 	if (ret != IBT_SUCCESS) {
452 		RDS_DPRINTF2("rds_init_send_pool", "EP(%p): ibt_register_mr "
453 		    "failed: %d", ep, ret);
454 		kmem_free(memp, memsize);
455 		kmem_free(bufmemp, nbuf * sizeof (rds_buf_t));
456 		if (ack_addr != NULL)
457 			kmem_free((void *)ack_addr, sizeof (uintptr_t));
458 		return (-1);
459 	}
460 	ep->ep_snd_lkey = mem_desc.md_lkey;
461 
462 
463 	/* Initialize the pool */
464 	spool->pool_memp = memp;
465 	spool->pool_memsize = memsize;
466 	spool->pool_bufmemp = bufmemp;
467 	spool->pool_sqpoll_pending = B_FALSE;
468 
469 	/* chain the buffers and initialize them */
470 	mp = memp;
471 	bp = bufmemp;
472 
473 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
474 		for (ix = 0; ix < nbuf - 1; ix++) {
475 			bp[ix].buf_nextp = &bp[ix + 1];
476 			bp[ix].buf_ep = ep;
477 			bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
478 			bp[ix].buf_ds.ds_key = ep->ep_snd_lkey;
479 			bp[ix].buf_state = RDS_SNDBUF_FREE;
480 			pktp = (rds_data_hdr_t *)(uintptr_t)mp;
481 			pktp->dh_bufid = (uintptr_t)&bp[ix];
482 			mp = mp + rcv_len;
483 		}
484 		bp[nbuf - 1].buf_nextp = NULL;
485 		bp[nbuf - 1].buf_ep = ep;
486 		bp[nbuf - 1].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
487 		bp[nbuf - 1].buf_ds.ds_key = ep->ep_snd_lkey;
488 		bp[nbuf - 1].buf_state = RDS_SNDBUF_FREE;
489 		pktp = (rds_data_hdr_t *)(uintptr_t)mp;
490 		pktp->dh_bufid = (uintptr_t)&bp[nbuf - 1];
491 
492 		spool->pool_headp = &bp[0];
493 		spool->pool_tailp = &bp[nbuf - 1];
494 
495 		mp = mp + rcv_len;
496 		ep->ep_ackds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
497 		ep->ep_ackds.ds_key = ep->ep_snd_lkey;
498 		ep->ep_ackds.ds_len = sizeof (uintptr_t);
499 
500 		*(uintptr_t *)ep->ep_ack_addr = (uintptr_t)spool->pool_tailp;
501 	} else {
502 		/* control send pool */
503 		for (ix = 0; ix < nbuf - 1; ix++) {
504 			bp[ix].buf_nextp = &bp[ix + 1];
505 			bp[ix].buf_ep = ep;
506 			bp[ix].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
507 			bp[ix].buf_ds.ds_key = ep->ep_snd_lkey;
508 			bp[ix].buf_state = RDS_SNDBUF_FREE;
509 			mp = mp + rcv_len;
510 		}
511 		bp[nbuf - 1].buf_nextp = NULL;
512 		bp[nbuf - 1].buf_ep = ep;
513 		bp[nbuf - 1].buf_ds.ds_va = (ib_vaddr_t)(uintptr_t)mp;
514 		bp[nbuf - 1].buf_ds.ds_key = ep->ep_snd_lkey;
515 		bp[nbuf - 1].buf_state = RDS_SNDBUF_FREE;
516 		spool->pool_headp = &bp[0];
517 		spool->pool_tailp = &bp[nbuf - 1];
518 	}
519 
520 	RDS_DPRINTF3(LABEL, "rdsmemp start: %p end: %p", memp, mp);
521 	RDS_DPRINTF2("rds_init_send_pool", "Return");
522 
523 	return (0);
524 }
525 
526 void
527 rds_free_recv_pool(rds_ep_t *ep)
528 {
529 	rds_bufpool_t *pool;
530 
531 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
532 		pool = &rds_dpool;
533 	} else {
534 		pool = &rds_cpool;
535 	}
536 
537 	mutex_enter(&ep->ep_rcvpool.pool_lock);
538 	if (ep->ep_rcvpool.pool_nfree != 0) {
539 		rds_free_buf(pool, ep->ep_rcvpool.pool_headp,
540 		    ep->ep_rcvpool.pool_nfree);
541 		ep->ep_rcvpool.pool_nfree = 0;
542 		ep->ep_rcvpool.pool_headp = NULL;
543 		ep->ep_rcvpool.pool_tailp = NULL;
544 	}
545 	mutex_exit(&ep->ep_rcvpool.pool_lock);
546 }
547 
548 int
549 rds_init_recv_pool(rds_ep_t *ep)
550 {
551 	rds_bufpool_t	*rpool;
552 	rds_qp_t	*recvqp;
553 
554 	recvqp = &ep->ep_recvqp;
555 	rpool = &ep->ep_rcvpool;
556 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
557 		recvqp->qp_depth = MaxDataRecvBuffers;
558 		recvqp->qp_level = 0;
559 		recvqp->qp_lwm = (DataRecvBufferLWM * MaxDataRecvBuffers)/100;
560 		recvqp->qp_taskqpending = B_FALSE;
561 
562 		rpool->pool_nbuffers = MaxDataRecvBuffers;
563 		rpool->pool_nbusy = 0;
564 		rpool->pool_nfree = 0;
565 	} else {
566 		recvqp->qp_depth = MaxCtrlRecvBuffers;
567 		recvqp->qp_level = 0;
568 		recvqp->qp_lwm = (CtrlRecvBufferLWM * MaxCtrlRecvBuffers)/100;
569 		recvqp->qp_taskqpending = B_FALSE;
570 
571 		rpool->pool_nbuffers = MaxCtrlRecvBuffers;
572 		rpool->pool_nbusy = 0;
573 		rpool->pool_nfree = 0;
574 	}
575 
576 	return (0);
577 }
578 
579 /* Free buffers to the global pool, either cpool or dpool */
580 void
581 rds_free_buf(rds_bufpool_t *pool, rds_buf_t *bp, uint_t nbuf)
582 {
583 	uint_t		ix;
584 
585 	RDS_DPRINTF4("rds_free_buf", "Enter");
586 
587 	ASSERT(nbuf != 0);
588 
589 	mutex_enter(&pool->pool_lock);
590 
591 	if (pool->pool_nfree != 0) {
592 		pool->pool_tailp->buf_nextp = bp;
593 	} else {
594 		pool->pool_headp = bp;
595 	}
596 
597 	if (nbuf == 1) {
598 		ASSERT(bp->buf_state == RDS_RCVBUF_FREE);
599 		bp->buf_ep = NULL;
600 		bp->buf_nextp = NULL;
601 		pool->pool_tailp = bp;
602 	} else {
603 		for (ix = 1; ix < nbuf; ix++) {
604 			ASSERT(bp->buf_state == RDS_RCVBUF_FREE);
605 			bp->buf_ep = NULL;
606 			bp = bp->buf_nextp;
607 		}
608 		ASSERT(bp->buf_state == RDS_RCVBUF_FREE);
609 		bp->buf_ep = NULL;
610 		bp->buf_nextp = NULL;
611 		pool->pool_tailp = bp;
612 	}
613 	/* tail is always the last buffer */
614 	pool->pool_tailp->buf_nextp = NULL;
615 
616 	pool->pool_nfree += nbuf;
617 	pool->pool_nbusy -= nbuf;
618 
619 	mutex_exit(&pool->pool_lock);
620 
621 	RDS_DPRINTF4("rds_free_buf", "Return");
622 }
623 
624 /* Get buffers from the global pools, either cpool or dpool */
625 rds_buf_t *
626 rds_get_buf(rds_bufpool_t *pool, uint_t nbuf, uint_t *nret)
627 {
628 	rds_buf_t	*bp = NULL, *bp1;
629 	uint_t		ix;
630 
631 	RDS_DPRINTF4("rds_get_buf", "Enter");
632 
633 	mutex_enter(&pool->pool_lock);
634 
635 	RDS_DPRINTF3("rds_get_buf", "Available: %d Needed: %d",
636 	    pool->pool_nfree, nbuf);
637 
638 	if (nbuf < pool->pool_nfree) {
639 		*nret = nbuf;
640 
641 		bp1 = pool->pool_headp;
642 		for (ix = 1; ix < nbuf; ix++) {
643 			bp1 = bp1->buf_nextp;
644 		}
645 
646 		bp = pool->pool_headp;
647 		pool->pool_headp = bp1->buf_nextp;
648 		bp1->buf_nextp = NULL;
649 
650 		pool->pool_nfree -= nbuf;
651 		pool->pool_nbusy += nbuf;
652 	} else if (nbuf >= pool->pool_nfree) {
653 		*nret = pool->pool_nfree;
654 
655 		bp = pool->pool_headp;
656 
657 		pool->pool_headp = NULL;
658 		pool->pool_tailp = NULL;
659 
660 		pool->pool_nbusy += pool->pool_nfree;
661 		pool->pool_nfree = 0;
662 	}
663 
664 	mutex_exit(&pool->pool_lock);
665 
666 	RDS_DPRINTF4("rds_get_buf", "Return");
667 
668 	return (bp);
669 }
670 
671 boolean_t
672 rds_is_recvq_empty(rds_ep_t *ep, boolean_t wait)
673 {
674 	rds_qp_t	*recvqp;
675 	rds_bufpool_t	*rpool;
676 	boolean_t ret = B_TRUE;
677 
678 	recvqp = &ep->ep_recvqp;
679 	mutex_enter(&recvqp->qp_lock);
680 	RDS_DPRINTF2("rds_is_recvq_empty", "EP(%p): QP has %d WRs",
681 	    ep, recvqp->qp_level);
682 	if (wait) {
683 		/* wait until the RQ is empty */
684 		while (recvqp->qp_level != 0) {
685 			/* wait one second and try again */
686 			mutex_exit(&recvqp->qp_lock);
687 			delay(drv_usectohz(1000000));
688 			mutex_enter(&recvqp->qp_lock);
689 		}
690 	} else if (recvqp->qp_level != 0) {
691 			ret = B_FALSE;
692 	}
693 	mutex_exit(&recvqp->qp_lock);
694 
695 	rpool = &ep->ep_rcvpool;
696 	mutex_enter(&rpool->pool_lock);
697 	RDS_DPRINTF2("rds_is_recvq_empty", "EP(%p): "
698 	    "There are %d pending buffers on sockqs", ep, rpool->pool_nbusy);
699 	if (wait) {
700 		/* Wait for all buffers to be freed by sockfs */
701 		while (rpool->pool_nbusy != 0) {
702 			/* wait one second and try again */
703 			mutex_exit(&rpool->pool_lock);
704 			delay(drv_usectohz(1000000));
705 			mutex_enter(&rpool->pool_lock);
706 		}
707 	} else if (rpool->pool_nbusy != 0) {
708 			ret = B_FALSE;
709 	}
710 	mutex_exit(&rpool->pool_lock);
711 
712 	return (ret);
713 }
714 
715 boolean_t
716 rds_is_sendq_empty(rds_ep_t *ep, uint_t wait)
717 {
718 	rds_bufpool_t	*spool;
719 	rds_buf_t	*bp;
720 	boolean_t	ret1 = B_TRUE;
721 
722 	/* check if all the sends completed */
723 	spool = &ep->ep_sndpool;
724 	mutex_enter(&spool->pool_lock);
725 	RDS_DPRINTF2("rds_is_sendq_empty", "EP(%p): "
726 	    "Send Pool contains: %d", ep, spool->pool_nbusy);
727 	if (wait) {
728 		while (spool->pool_nbusy != 0) {
729 			if (rds_no_interrupts) {
730 				/* wait one second and try again */
731 				delay(drv_usectohz(1000000));
732 				rds_poll_send_completions(ep->ep_sendcq, ep,
733 				    B_TRUE);
734 			} else {
735 				/* wait one second and try again */
736 				mutex_exit(&spool->pool_lock);
737 				delay(drv_usectohz(1000000));
738 				mutex_enter(&spool->pool_lock);
739 			}
740 		}
741 
742 		if ((wait == 2) && (ep->ep_type == RDS_EP_TYPE_DATA)) {
743 			rds_buf_t	*ackbp;
744 
745 			/*
746 			 * If the last one is acknowledged then everything
747 			 * is acknowledged
748 			 */
749 			bp = spool->pool_tailp;
750 			ackbp = *(rds_buf_t **)ep->ep_ack_addr;
751 			RDS_DPRINTF2("rds_is_sendq_empty", "EP(%p): "
752 			    "Checking for acknowledgements", ep);
753 			while (bp != ackbp) {
754 				RDS_DPRINTF2("rds_is_sendq_empty",
755 				    "EP(%p) BP(0x%p/0x%p) last "
756 				    "sent/acknowledged", ep, bp, ackbp);
757 				mutex_exit(&spool->pool_lock);
758 				delay(drv_usectohz(1000000));
759 				mutex_enter(&spool->pool_lock);
760 
761 				bp = spool->pool_tailp;
762 				ackbp = *(rds_buf_t **)ep->ep_ack_addr;
763 			}
764 		}
765 	} else if (spool->pool_nbusy != 0) {
766 			ret1 = B_FALSE;
767 	}
768 	mutex_exit(&spool->pool_lock);
769 
770 	/* check if all the rdma acks completed */
771 	mutex_enter(&ep->ep_lock);
772 	RDS_DPRINTF2("rds_is_sendq_empty", "EP(%p): "
773 	    "Outstanding RDMA Acks: %d", ep, ep->ep_rdmacnt);
774 	if (wait) {
775 		while (ep->ep_rdmacnt != 0) {
776 			if (rds_no_interrupts) {
777 				/* wait one second and try again */
778 				delay(drv_usectohz(1000000));
779 				rds_poll_send_completions(ep->ep_sendcq, ep,
780 				    B_FALSE);
781 			} else {
782 				/* wait one second and try again */
783 				mutex_exit(&ep->ep_lock);
784 				delay(drv_usectohz(1000000));
785 				mutex_enter(&ep->ep_lock);
786 			}
787 		}
788 	} else if (ep->ep_rdmacnt != 0) {
789 			ret1 = B_FALSE;
790 	}
791 	mutex_exit(&ep->ep_lock);
792 
793 	return (ret1);
794 }
795 
796 /* Get buffers from the send pool */
797 rds_buf_t *
798 rds_get_send_buf(rds_ep_t *ep, uint_t nbuf)
799 {
800 	rds_buf_t	*bp = NULL, *bp1;
801 	rds_bufpool_t	*spool;
802 	uint_t		waittime = rds_waittime_ms * 1000;
803 	uint_t		ix;
804 	int		ret;
805 
806 	RDS_DPRINTF4("rds_get_send_buf", "Enter: EP(%p) Buffers requested: %d",
807 	    ep, nbuf);
808 
809 	spool = &ep->ep_sndpool;
810 	mutex_enter(&spool->pool_lock);
811 
812 	if (rds_no_interrupts) {
813 		if ((spool->pool_sqpoll_pending == B_FALSE) &&
814 		    (spool->pool_nbusy >
815 		    (spool->pool_nbuffers * rds_poll_percent_full)/100)) {
816 			spool->pool_sqpoll_pending = B_TRUE;
817 			mutex_exit(&spool->pool_lock);
818 			rds_poll_send_completions(ep->ep_sendcq, ep, B_FALSE);
819 			mutex_enter(&spool->pool_lock);
820 			spool->pool_sqpoll_pending = B_FALSE;
821 		}
822 	}
823 
824 	if (spool->pool_nfree < nbuf) {
825 		/* wait for buffers to become available */
826 		spool->pool_cv_count += nbuf;
827 		ret = cv_timedwait_sig(&spool->pool_cv, &spool->pool_lock,
828 		    ddi_get_lbolt() + drv_usectohz(waittime));
829 		/* ret = cv_wait_sig(&spool->pool_cv, &spool->pool_lock); */
830 		if (ret == 0) {
831 			/* signal pending */
832 			spool->pool_cv_count -= nbuf;
833 			mutex_exit(&spool->pool_lock);
834 			return (NULL);
835 		}
836 
837 		spool->pool_cv_count -= nbuf;
838 	}
839 
840 	/* Have the number of buffers needed */
841 	if (spool->pool_nfree > nbuf) {
842 		bp = spool->pool_headp;
843 
844 		if (ep->ep_type == RDS_EP_TYPE_DATA) {
845 			rds_buf_t *ackbp;
846 			ackbp = *(rds_buf_t **)ep->ep_ack_addr;
847 
848 			/* check if all the needed buffers are acknowledged */
849 			bp1 = bp;
850 			for (ix = 0; ix < nbuf; ix++) {
851 				if ((bp1 == ackbp) ||
852 				    (bp1->buf_state != RDS_SNDBUF_FREE)) {
853 					/*
854 					 * The buffer is not yet signalled or
855 					 * is not yet acknowledged
856 					 */
857 					RDS_DPRINTF5("rds_get_send_buf",
858 					    "EP(%p) Buffer (%p) not yet "
859 					    "acked/completed", ep, bp1);
860 					mutex_exit(&spool->pool_lock);
861 					return (NULL);
862 				}
863 
864 				bp1 = bp1->buf_nextp;
865 			}
866 		}
867 
868 		/* mark the buffers as pending */
869 		bp1 = bp;
870 		for (ix = 1; ix < nbuf; ix++) {
871 			ASSERT(bp1->buf_state == RDS_SNDBUF_FREE);
872 			bp1->buf_state = RDS_SNDBUF_PENDING;
873 			bp1 = bp1->buf_nextp;
874 		}
875 		ASSERT(bp1->buf_state == RDS_SNDBUF_FREE);
876 		bp1->buf_state = RDS_SNDBUF_PENDING;
877 
878 		spool->pool_headp = bp1->buf_nextp;
879 		bp1->buf_nextp = NULL;
880 		if (spool->pool_headp == NULL)
881 			spool->pool_tailp = NULL;
882 		spool->pool_nfree -= nbuf;
883 		spool->pool_nbusy += nbuf;
884 	}
885 	mutex_exit(&spool->pool_lock);
886 
887 	RDS_DPRINTF4("rds_get_send_buf", "Return: EP(%p) Buffers requested: %d",
888 	    ep, nbuf);
889 
890 	return (bp);
891 }
892 
893 #define	RDS_MIN_BUF_TO_WAKE_THREADS	10
894 
895 void
896 rds_free_send_buf(rds_ep_t *ep, rds_buf_t *headp, rds_buf_t *tailp, uint_t nbuf,
897     boolean_t lock)
898 {
899 	rds_bufpool_t	*spool;
900 	rds_buf_t	*tmp;
901 
902 	RDS_DPRINTF4("rds_free_send_buf", "Enter");
903 
904 	ASSERT(nbuf != 0);
905 
906 	if (tailp == NULL) {
907 		if (nbuf > 1) {
908 			tmp = headp;
909 			while (tmp->buf_nextp) {
910 				tmp = tmp->buf_nextp;
911 			}
912 			tailp = tmp;
913 		} else {
914 			tailp = headp;
915 		}
916 	}
917 
918 	spool = &ep->ep_sndpool;
919 
920 	if (lock == B_FALSE) {
921 		/* lock is not held outside */
922 		mutex_enter(&spool->pool_lock);
923 	}
924 
925 	if (spool->pool_nfree) {
926 		spool->pool_tailp->buf_nextp = headp;
927 	} else {
928 		spool->pool_headp = headp;
929 	}
930 	spool->pool_tailp = tailp;
931 
932 	spool->pool_nfree += nbuf;
933 	spool->pool_nbusy -= nbuf;
934 
935 	if ((spool->pool_cv_count > 0) &&
936 	    (spool->pool_nfree > RDS_MIN_BUF_TO_WAKE_THREADS)) {
937 		if (spool->pool_nfree >= spool->pool_cv_count)
938 			cv_broadcast(&spool->pool_cv);
939 		else
940 			cv_signal(&spool->pool_cv);
941 	}
942 
943 	if (lock == B_FALSE) {
944 		mutex_exit(&spool->pool_lock);
945 	}
946 
947 	RDS_DPRINTF4("rds_free_send_buf", "Return");
948 }
949 
950 #define	RDS_NBUFFERS_TO_PUTBACK	100
951 void
952 rds_free_recv_buf(rds_buf_t *bp, uint_t nbuf)
953 {
954 	rds_ep_t	*ep;
955 	rds_bufpool_t	*rpool;
956 	rds_buf_t	*bp1;
957 	uint_t		ix;
958 
959 	RDS_DPRINTF4("rds_free_recv_buf", "Enter");
960 
961 	ASSERT(nbuf != 0);
962 
963 	ep = bp->buf_ep;
964 	rpool = &ep->ep_rcvpool;
965 
966 	mutex_enter(&rpool->pool_lock);
967 
968 	/* Add the buffers to the local pool */
969 	if (rpool->pool_tailp == NULL) {
970 		ASSERT(rpool->pool_headp == NULL);
971 		ASSERT(rpool->pool_nfree == 0);
972 		rpool->pool_headp = bp;
973 		bp1 = bp;
974 		for (ix = 1; ix < nbuf; ix++) {
975 			if (bp1->buf_state == RDS_RCVBUF_ONSOCKQ) {
976 				rpool->pool_nbusy--;
977 			}
978 			bp1->buf_state = RDS_RCVBUF_FREE;
979 			bp1 = bp1->buf_nextp;
980 		}
981 		bp1->buf_nextp = NULL;
982 		if (bp->buf_state == RDS_RCVBUF_ONSOCKQ) {
983 			rpool->pool_nbusy--;
984 		}
985 		bp->buf_state = RDS_RCVBUF_FREE;
986 		rpool->pool_tailp = bp1;
987 		rpool->pool_nfree += nbuf;
988 	} else {
989 		bp1 = bp;
990 		for (ix = 1; ix < nbuf; ix++) {
991 			if (bp1->buf_state == RDS_RCVBUF_ONSOCKQ) {
992 				rpool->pool_nbusy--;
993 			}
994 			bp1->buf_state = RDS_RCVBUF_FREE;
995 			bp1 = bp1->buf_nextp;
996 		}
997 		bp1->buf_nextp = NULL;
998 		if (bp->buf_state == RDS_RCVBUF_ONSOCKQ) {
999 			rpool->pool_nbusy--;
1000 		}
1001 		bp->buf_state = RDS_RCVBUF_FREE;
1002 		rpool->pool_tailp->buf_nextp = bp;
1003 		rpool->pool_tailp = bp1;
1004 		rpool->pool_nfree += nbuf;
1005 	}
1006 
1007 	if (rpool->pool_nfree >= RDS_NBUFFERS_TO_PUTBACK) {
1008 		bp = rpool->pool_headp;
1009 		nbuf = rpool->pool_nfree;
1010 		rpool->pool_headp = NULL;
1011 		rpool->pool_tailp = NULL;
1012 		rpool->pool_nfree = 0;
1013 		mutex_exit(&rpool->pool_lock);
1014 
1015 		/* Free the buffers to the global pool */
1016 		if (ep->ep_type == RDS_EP_TYPE_DATA) {
1017 			rds_free_buf(&rds_dpool, bp, nbuf);
1018 		} else {
1019 			rds_free_buf(&rds_cpool, bp, nbuf);
1020 		}
1021 
1022 		return;
1023 	}
1024 	mutex_exit(&rpool->pool_lock);
1025 
1026 	RDS_DPRINTF4("rds_free_recv_buf", "Return");
1027 }
1028