xref: /titanic_52/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c (revision 1a5e258f5471356ca102c7176637cdce45bac147)
1 /*
2  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3  */
4 
5 /*
6  * This file contains code imported from the OFED rds source file ib_recv.c
7  * Oracle elects to have and use the contents of ib_recv.c under and governed
8  * by the OpenIB.org BSD license (see below for full license text). However,
9  * the following notice accompanied the original version of this file:
10  */
11 
12 /*
13  * Copyright (c) 2006 Oracle.  All rights reserved.
14  *
15  * This software is available to you under a choice of one of two
16  * licenses.  You may choose to be licensed under the terms of the GNU
17  * General Public License (GPL) Version 2, available from the file
18  * COPYING in the main directory of this source tree, or the
19  * OpenIB.org BSD license below:
20  *
21  *     Redistribution and use in source and binary forms, with or
22  *     without modification, are permitted provided that the following
23  *     conditions are met:
24  *
25  *      - Redistributions of source code must retain the above
26  *        copyright notice, this list of conditions and the following
27  *        disclaimer.
28  *
29  *      - Redistributions in binary form must reproduce the above
30  *        copyright notice, this list of conditions and the following
31  *        disclaimer in the documentation and/or other materials
32  *        provided with the distribution.
33  *
34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41  * SOFTWARE.
42  *
43  */
44 #include <sys/types.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/rds.h>
48 
49 #include <sys/ib/clients/rdsv3/rdsv3.h>
50 #include <sys/ib/clients/rdsv3/ib.h>
51 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
52 
/*
 * Cache from which every struct rdsv3_ib_incoming in this file is allocated;
 * created by rdsv3_ib_recv_init() and destroyed by rdsv3_ib_recv_exit().
 */
static struct kmem_cache *rdsv3_ib_incoming_slab;
/*
 * Number of incomings currently taken from rdsv3_ib_incoming_slab.  It is
 * bumped (up to a connection's i_max_recv_alloc) before an allocation and
 * dropped again whenever an incoming goes back to the cache.
 */
static atomic_t	rdsv3_ib_allocation = ATOMIC_INIT(0);
55 
56 void
57 rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
58 {
59 	struct rdsv3_ib_recv_work *recv;
60 	struct rdsv3_header *hdrp;
61 	uint32_t i;
62 
63 	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);
64 
65 	hdrp = ic->i_recv_hdrs;
66 	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
67 		recv->r_ibinc = NULL;
68 		recv->r_frag = NULL;
69 
70 		/* initialize the hdr sgl permanently */
71 		recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
72 		recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
73 		recv->r_sge[0].ds_key = ic->i_mr->lkey;
74 	}
75 }
76 
77 static void
78 rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
79     struct rdsv3_ib_recv_work *recv)
80 {
81 	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
82 	    ic, recv);
83 
84 	if (recv->r_ibinc) {
85 		rdsv3_inc_put(&recv->r_ibinc->ii_inc);
86 		recv->r_ibinc = NULL;
87 	}
88 
89 	if (recv->r_frag) {
90 		kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag);
91 		recv->r_frag = NULL;
92 	}
93 
94 	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
95 	    ic, recv);
96 }
97 
98 void
99 rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
100 {
101 	uint32_t i;
102 
103 	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);
104 
105 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
106 		rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
107 }
108 
109 extern int atomic_add_unless(atomic_t *, uint_t, ulong_t);
110 
111 static int
112 rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
113     struct rdsv3_ib_recv_work *recv)
114 {
115 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
116 	ibt_mi_hdl_t mi_hdl;
117 	ibt_iov_attr_t iov_attr;
118 	ibt_iov_t iov_arr[1];
119 
120 	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
121 	    conn, recv);
122 
123 	if (!recv->r_ibinc) {
124 		if (!atomic_add_unless(&rdsv3_ib_allocation, 1,
125 		    ic->i_max_recv_alloc)) {
126 			rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
127 			goto out;
128 		}
129 		recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
130 		    KM_NOSLEEP);
131 		if (recv->r_ibinc == NULL) {
132 			atomic_dec_32(&rdsv3_ib_allocation);
133 			goto out;
134 		}
135 		rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
136 		recv->r_ibinc->ii_ibdev = ic->rds_ibdev;
137 		recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool;
138 	}
139 
140 	if (!recv->r_frag) {
141 		recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab,
142 		    KM_NOSLEEP);
143 		if (!recv->r_frag)
144 			goto out;
145 	}
146 
147 	/* Data sge, structure copy */
148 	recv->r_sge[1] = recv->r_frag->f_sge;
149 
150 	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p",
151 	    conn, recv);
152 
153 	return (0);
154 out:
155 	if (recv->r_ibinc) {
156 		kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc);
157 		atomic_dec_32(&rdsv3_ib_allocation);
158 		recv->r_ibinc = NULL;
159 	}
160 	return (-ENOMEM);
161 }
162 
163 /*
164  * This tries to allocate and post unused work requests after making sure that
165  * they have all the allocations they need to queue received fragments into
166  * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
167  * pairs don't go unmatched.
168  *
169  * -1 is returned if posting fails due to temporary resource exhaustion.
170  */
171 int
172 rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill)
173 {
174 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
175 	struct rdsv3_ib_recv_work *recv;
176 	unsigned int posted = 0;
177 	int ret = 0, avail;
178 	uint32_t pos, i;
179 
180 	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
181 	    conn, prefill);
182 
183 	if (prefill || rdsv3_conn_up(conn)) {
184 		uint_t w_nr = ic->i_recv_ring.w_nr;
185 
186 		avail = rdsv3_ib_ring_alloc(&ic->i_recv_ring, w_nr, &pos);
187 		if ((avail <= 0) || (pos >= w_nr)) {
188 			RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
189 			    "Argh - ring alloc returned pos=%u, avail: %d",
190 			    pos, avail);
191 			return (-EINVAL);
192 		}
193 
194 		/* populate the WRs */
195 		for (i = 0; i < avail; i++) {
196 			recv = &ic->i_recvs[pos];
197 			ret = rdsv3_ib_recv_refill_one(conn, recv);
198 			if (ret) {
199 				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
200 				    avail - i);
201 				break;
202 			}
203 			ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos;
204 			ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE;
205 			ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0];
206 
207 			pos = (pos + 1) % w_nr;
208 		}
209 
210 		if (i) {
211 			/* post the WRs at one shot */
212 			ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id),
213 			    &ic->i_recv_wrs[0], i, &posted);
214 			RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
215 			    "attempted: %d posted: %d WRs ret %d",
216 			    i, posted, ret);
217 			if (ret) {
218 				RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
219 				    "disconnecting and reconnecting\n",
220 				    NIPQUAD(conn->c_faddr), ret);
221 				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
222 				    i - posted);
223 				rdsv3_conn_drop(conn);
224 			}
225 		}
226 	}
227 
228 	/* We're doing flow control - update the window. */
229 	if (ic->i_flowctl && posted)
230 		rdsv3_ib_advertise_credits(conn, posted);
231 
232 	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
233 	    conn, posted);
234 	return (ret);
235 }
236 
/*
 * delayed freed incoming's
 *
 * rdsv3_ib_inc_free() appends released incomings to f_list and
 * rdsv3_ib_drain_inclist() later takes them off again and frees them.
 */
struct rdsv3_inc_pool {
	list_t			f_list;	/* list of freed incoming */
	kmutex_t		f_lock; /* protects f_list and f_listcnt */
	int32_t			f_listcnt;	/* entries now on f_list */
};
245 
246 void
247 rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev)
248 {
249 	struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool;
250 
251 	if (pool) {
252 		list_destroy(&pool->f_list);
253 		kmem_free((void *) pool, sizeof (*pool));
254 	}
255 }
256 
257 int
258 rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev)
259 {
260 	struct rdsv3_inc_pool *pool;
261 
262 	pool = (struct rdsv3_inc_pool *)kmem_zalloc(sizeof (*pool), KM_NOSLEEP);
263 	if (pool == NULL) {
264 		return (-ENOMEM);
265 	}
266 	list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming),
267 	    offsetof(struct rdsv3_ib_incoming, ii_obj));
268 	mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
269 	rds_ibdev->inc_pool = pool;
270 	return (0);
271 }
272 
273 static void
274 rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc)
275 {
276 	struct rdsv3_page_frag *frag;
277 	struct rdsv3_page_frag *pos;
278 
279 	RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
280 		list_remove_node(&frag->f_item);
281 		kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag);
282 	}
283 
284 	ASSERT(list_is_empty(&ibinc->ii_frags));
285 	kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
286 	atomic_dec_uint(&rdsv3_ib_allocation);
287 }
288 
289 void
290 rdsv3_ib_drain_inclist(void *data)
291 {
292 	struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data;
293 	struct rdsv3_ib_incoming *ibinc;
294 	list_t *listp = &pool->f_list;
295 	kmutex_t *lockp = &pool->f_lock;
296 	int i = 0;
297 
298 	for (;;) {
299 		mutex_enter(lockp);
300 		ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp);
301 		if (ibinc)
302 			pool->f_listcnt--;
303 		mutex_exit(lockp);
304 		if (!ibinc)
305 			break;
306 		i++;
307 		rdsv3_ib_inc_drop(ibinc);
308 	}
309 }
310 
311 void
312 rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
313 {
314 	struct rdsv3_ib_incoming *ibinc;
315 	rdsv3_af_thr_t *af_thr;
316 
317 	RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);
318 
319 	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
320 	/* save af_thr in a local as ib_inc might be freed at mutex_exit */
321 	af_thr = ibinc->ii_ibdev->inc_soft_cq;
322 
323 	mutex_enter(&ibinc->ii_pool->f_lock);
324 	list_insert_tail(&ibinc->ii_pool->f_list, ibinc);
325 	ibinc->ii_pool->f_listcnt++;
326 	mutex_exit(&ibinc->ii_pool->f_lock);
327 
328 	rdsv3_af_thr_fire(af_thr);
329 }
330 
331 int
332 rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
333     size_t size)
334 {
335 	struct rdsv3_ib_incoming *ibinc;
336 	struct rdsv3_page_frag *frag;
337 	unsigned long to_copy;
338 	unsigned long frag_off = 0;
339 	int copied = 0;
340 	int ret;
341 	uint32_t len;
342 
343 	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
344 	frag = list_head(&ibinc->ii_frags);
345 	len = ntohl(inc->i_hdr.h_len);
346 
347 	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d",
348 	    inc, size, len);
349 
350 	while (copied < size && copied < len) {
351 		if (frag_off == RDSV3_FRAG_SIZE) {
352 			frag = list_next(&ibinc->ii_frags, frag);
353 			frag_off = 0;
354 		}
355 
356 		to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
357 		to_copy = min(size - copied, to_copy);
358 
359 		RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
360 		    "%lu bytes to user %p from frag [%p, %u] + %lu",
361 		    to_copy, uiop,
362 		    frag->f_page, frag->f_offset, frag_off);
363 
364 		ret = uiomove((caddr_t)(frag->f_page +
365 		    frag->f_offset + frag_off),
366 		    to_copy, UIO_READ, uiop);
367 		if (ret) {
368 			RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
369 			    "uiomove (%d) returned: %d", to_copy, ret);
370 			break;
371 		}
372 
373 		frag_off += to_copy;
374 		copied += to_copy;
375 	}
376 
377 	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
378 	    "Return: inc: %p, copied: %d", inc, copied);
379 
380 	return (copied);
381 }
382 
383 /* ic starts out kmem_zalloc()ed */
384 void
385 rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
386 {
387 	ibt_send_wr_t *wr = &ic->i_ack_wr;
388 	ibt_wr_ds_t *sge = &ic->i_ack_sge;
389 
390 	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);
391 
392 	sge->ds_va = ic->i_ack_dma;
393 	sge->ds_len = sizeof (struct rdsv3_header);
394 	sge->ds_key = ic->i_mr->lkey;
395 
396 	wr->wr_sgl = sge;
397 	wr->wr_nds = 1;
398 	wr->wr_opcode = IBT_WRC_SEND;
399 	wr->wr_id = RDSV3_IB_ACK_WR_ID;
400 	wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
401 }
402 
403 /*
404  * You'd think that with reliable IB connections you wouldn't need to ack
405  * messages that have been received.  The problem is that IB hardware generates
406  * an ack message before it has DMAed the message into memory.  This creates a
407  * potential message loss if the HCA is disabled for any reason between when it
408  * sends the ack and before the message is DMAed and processed.  This is only a
409  * potential issue if another HCA is available for fail-over.
410  *
411  * When the remote host receives our ack they'll free the sent message from
412  * their send queue.  To decrease the latency of this we always send an ack
413  * immediately after we've received messages.
414  *
415  * For simplicity, we only have one ack in flight at a time.  This puts
416  * pressure on senders to have deep enough send queues to absorb the latency of
417  * a single ack frame being in flight.  This might not be good enough.
418  *
419  * This is implemented by have a long-lived send_wr and sge which point to a
420  * statically allocated ack frame.  This ack wr does not fall under the ring
421  * accounting that the tx and rx wrs do.  The QP attribute specifically makes
422  * room for it beyond the ring size.  Send completion notices its special
423  * wr_id and avoids working with the ring in that case.
424  */
425 void
426 rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
427     int ack_required)
428 {
429 	RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
430 	    ic, seq, ack_required);
431 
432 	mutex_enter(&ic->i_ack_lock);
433 	ic->i_ack_next = seq;
434 	if (ack_required)
435 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
436 	mutex_exit(&ic->i_ack_lock);
437 }
438 
439 static uint64_t
440 rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
441 {
442 	uint64_t seq;
443 
444 	RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);
445 
446 	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
447 
448 	mutex_enter(&ic->i_ack_lock);
449 	seq = ic->i_ack_next;
450 	mutex_exit(&ic->i_ack_lock);
451 
452 	return (seq);
453 }
454 
455 static void
456 rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
457 {
458 	struct rdsv3_header *hdr = ic->i_ack;
459 	uint64_t seq;
460 	int ret;
461 
462 	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
463 	    ic, adv_credits);
464 
465 	seq = rdsv3_ib_get_ack(ic);
466 
467 	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
468 	    ic, (unsigned long long) seq);
469 	rdsv3_message_populate_header(hdr, 0, 0, 0);
470 	hdr->h_ack = htonll(seq);
471 	hdr->h_credit = adv_credits;
472 	rdsv3_message_make_checksum(hdr);
473 	ic->i_ack_queued = jiffies;
474 
475 	ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
476 	    NULL);
477 	if (ret) {
478 		/*
479 		 * Failed to send. Release the WR, and
480 		 * force another ACK.
481 		 */
482 		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
483 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
484 		rdsv3_ib_stats_inc(s_ib_ack_send_failure);
485 		RDSV3_DPRINTF2("rdsv3_ib_send_ack", "sending ack failed\n");
486 		rdsv3_conn_drop(ic->conn);
487 	} else {
488 		rdsv3_ib_stats_inc(s_ib_ack_sent);
489 	}
490 	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
491 	    ic, adv_credits);
492 }
493 
494 /*
495  * There are 3 ways of getting acknowledgements to the peer:
496  *  1.	We call rdsv3_ib_attempt_ack from the recv completion handler
497  *	to send an ACK-only frame.
498  *	However, there can be only one such frame in the send queue
499  *	at any time, so we may have to postpone it.
500  *  2.	When another (data) packet is transmitted while there's
501  *	an ACK in the queue, we piggyback the ACK sequence number
502  *	on the data packet.
503  *  3.	If the ACK WR is done sending, we get called from the
504  *	send queue completion handler, and check whether there's
505  *	another ACK pending (postponed because the WR was on the
506  *	queue). If so, we transmit it.
507  *
508  * We maintain 2 variables:
509  *  -	i_ack_flags, which keeps track of whether the ACK WR
510  *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
511  *  -	i_ack_next, which is the last sequence number we received
512  *
513  * Potentially, send queue and receive queue handlers can run concurrently.
514  * It would be nice to not have to use a spinlock to synchronize things,
515  * but the one problem that rules this out is that 64bit updates are
516  * not atomic on all platforms. Things would be a lot simpler if
517  * we had atomic64 or maybe cmpxchg64 everywhere.
518  *
519  * Reconnecting complicates this picture just slightly. When we
520  * reconnect, we may be seeing duplicate packets. The peer
521  * is retransmitting them, because it hasn't seen an ACK for
522  * them. It is important that we ACK these.
523  *
524  * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
525  * this flag set *MUST* be acknowledged immediately.
526  */
527 
528 /*
529  * When we get here, we're called from the recv queue handler.
530  * Check whether we ought to transmit an ACK.
531  */
532 void
533 rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
534 {
535 	unsigned int adv_credits;
536 
537 	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);
538 
539 	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
540 		return;
541 
542 	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
543 		rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
544 		return;
545 	}
546 
547 	/* Can we get a send credit? */
548 	if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
549 		rdsv3_ib_stats_inc(s_ib_tx_throttle);
550 		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
551 		return;
552 	}
553 
554 	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
555 	rdsv3_ib_send_ack(ic, adv_credits);
556 
557 	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
558 }
559 
560 /*
561  * We get here from the send completion handler, when the
562  * adapter tells us the ACK frame was sent.
563  */
564 void
565 rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
566 {
567 	RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
568 	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
569 	rdsv3_ib_attempt_ack(ic);
570 }
571 
572 /*
573  * This is called by the regular xmit code when it wants to piggyback
574  * an ACK on an outgoing frame.
575  */
576 uint64_t
577 rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
578 {
579 	RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
580 	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
581 		rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
582 	}
583 	return (rdsv3_ib_get_ack(ic));
584 }
585 
586 /*
587  * It's kind of lame that we're copying from the posted receive pages into
588  * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
589  * them.  But receiving new congestion bitmaps should be a *rare* event, so
590  * hopefully we won't need to invest that complexity in making it more
591  * efficient.  By copying we can share a simpler core with TCP which has to
592  * copy.
593  */
594 static void
595 rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
596     struct rdsv3_ib_incoming *ibinc)
597 {
598 	struct rdsv3_cong_map *map;
599 	unsigned int map_off;
600 	unsigned int map_page;
601 	struct rdsv3_page_frag *frag;
602 	unsigned long frag_off;
603 	unsigned long to_copy;
604 	unsigned long copied;
605 	uint64_t uncongested = 0;
606 	caddr_t addr;
607 
608 	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
609 	    conn, ibinc);
610 
611 	/* catch completely corrupt packets */
612 	if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
613 		return;
614 
615 	map = conn->c_fcong;
616 	map_page = 0;
617 	map_off = 0;
618 
619 	frag = list_head(&ibinc->ii_frags);
620 	frag_off = 0;
621 
622 	copied = 0;
623 
624 	while (copied < RDSV3_CONG_MAP_BYTES) {
625 		uint64_t *src, *dst;
626 		unsigned int k;
627 
628 		to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
629 		ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */
630 
631 		addr = frag->f_page + frag->f_offset;
632 
633 		src = (uint64_t *)(addr + frag_off);
634 		dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
635 		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
636 		    "src: %p dst: %p copied: %d", src, dst, copied);
637 		for (k = 0; k < to_copy; k += 8) {
638 			/*
639 			 * Record ports that became uncongested, ie
640 			 * bits that changed from 0 to 1.
641 			 */
642 			uncongested |= ~(*src) & *dst;
643 			*dst++ = *src++;
644 		}
645 
646 		copied += to_copy;
647 		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
648 		    "src: %p dst: %p copied: %d", src, dst, copied);
649 
650 		map_off += to_copy;
651 		if (map_off == PAGE_SIZE) {
652 			map_off = 0;
653 			map_page++;
654 		}
655 
656 		frag_off += to_copy;
657 		if (frag_off == RDSV3_FRAG_SIZE) {
658 			frag = list_next(&ibinc->ii_frags, frag);
659 			frag_off = 0;
660 		}
661 	}
662 
663 #if 0
664 XXX
665 	/* the congestion map is in little endian order */
666 	uncongested = le64_to_cpu(uncongested);
667 #endif
668 
669 	rdsv3_cong_map_updated(map, uncongested);
670 
671 	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
672 	    conn, ibinc);
673 }
674 
675 static void
676 rdsv3_ib_process_recv(struct rdsv3_connection *conn,
677     struct rdsv3_ib_recv_work *recv, uint32_t data_len,
678     struct rdsv3_ib_ack_state *state)
679 {
680 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
681 	struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
682 	struct rdsv3_header *ihdr, *hdr;
683 
684 	/* XXX shut down the connection if port 0,0 are seen? */
685 
686 	RDSV3_DPRINTF5("rdsv3_ib_process_recv",
687 	    "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);
688 
689 	if (data_len < sizeof (struct rdsv3_header)) {
690 		RDSV3_DPRINTF2("rdsv3_ib_process_recv",
691 		    "incoming message from %u.%u.%u.%u didn't include a "
692 		    "header, disconnecting and reconnecting",
693 		    NIPQUAD(conn->c_faddr));
694 		rdsv3_conn_drop(conn);
695 		return;
696 	}
697 	data_len -= sizeof (struct rdsv3_header);
698 
699 	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
700 
701 	/* Validate the checksum. */
702 	if (!rdsv3_message_verify_checksum(ihdr)) {
703 		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
704 		    "from %u.%u.%u.%u has corrupted header - "
705 		    "forcing a reconnect",
706 		    NIPQUAD(conn->c_faddr));
707 		rdsv3_conn_drop(conn);
708 		rdsv3_stats_inc(s_recv_drop_bad_checksum);
709 		return;
710 	}
711 
712 	/* Process the ACK sequence which comes with every packet */
713 	state->ack_recv = ntohll(ihdr->h_ack);
714 	state->ack_recv_valid = 1;
715 
716 	/* Process the credits update if there was one */
717 	if (ihdr->h_credit)
718 		rdsv3_ib_send_add_credits(conn, ihdr->h_credit);
719 
720 	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
721 		/*
722 		 * This is an ACK-only packet. The fact that it gets
723 		 * special treatment here is that historically, ACKs
724 		 * were rather special beasts.
725 		 */
726 		rdsv3_ib_stats_inc(s_ib_ack_received);
727 		return;
728 	}
729 
730 	/*
731 	 * If we don't already have an inc on the connection then this
732 	 * fragment has a header and starts a message.. copy its header
733 	 * into the inc and save the inc so we can hang upcoming fragments
734 	 * off its list.
735 	 */
736 	if (!ibinc) {
737 		ibinc = recv->r_ibinc;
738 		recv->r_ibinc = NULL;
739 		ic->i_ibinc = ibinc;
740 
741 		hdr = &ibinc->ii_inc.i_hdr;
742 		(void) memcpy(hdr, ihdr, sizeof (*hdr));
743 		ic->i_recv_data_rem = ntohl(hdr->h_len);
744 
745 		RDSV3_DPRINTF5("rdsv3_ib_process_recv",
746 		    "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
747 		    ic->i_recv_data_rem, hdr->h_flags);
748 	} else {
749 		hdr = &ibinc->ii_inc.i_hdr;
750 		/*
751 		 * We can't just use memcmp here; fragments of a
752 		 * single message may carry different ACKs
753 		 */
754 		if (hdr->h_sequence != ihdr->h_sequence ||
755 		    hdr->h_len != ihdr->h_len ||
756 		    hdr->h_sport != ihdr->h_sport ||
757 		    hdr->h_dport != ihdr->h_dport) {
758 			RDSV3_DPRINTF2("rdsv3_ib_process_recv",
759 			    "fragment header mismatch; forcing reconnect");
760 			rdsv3_conn_drop(conn);
761 			return;
762 		}
763 	}
764 
765 	list_insert_tail(&ibinc->ii_frags, recv->r_frag);
766 	recv->r_frag = NULL;
767 
768 	if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
769 		ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
770 	else {
771 		ic->i_recv_data_rem = 0;
772 		ic->i_ibinc = NULL;
773 
774 		if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
775 			rdsv3_ib_cong_recv(conn, ibinc);
776 		else {
777 			rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
778 			    &ibinc->ii_inc, KM_NOSLEEP);
779 			state->ack_next = ntohll(hdr->h_sequence);
780 			state->ack_next_valid = 1;
781 		}
782 
783 		/*
784 		 * Evaluate the ACK_REQUIRED flag *after* we received
785 		 * the complete frame, and after bumping the next_rx
786 		 * sequence.
787 		 */
788 		if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
789 			rdsv3_stats_inc(s_recv_ack_required);
790 			state->ack_required = 1;
791 		}
792 
793 		rdsv3_inc_put(&ibinc->ii_inc);
794 	}
795 
796 	RDSV3_DPRINTF4("rdsv3_ib_process_recv",
797 	    "Return: conn: %p recv: %p len: %d state: %p",
798 	    conn, recv, data_len, state);
799 }
800 
801 void
802 rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
803     struct rdsv3_ib_ack_state *state)
804 {
805 	struct rdsv3_connection *conn = ic->conn;
806 	struct rdsv3_ib_recv_work *recv;
807 	struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring;
808 
809 	RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
810 	    "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
811 	    (unsigned long long)wc->wc_id, wc->wc_status,
812 	    wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));
813 
814 	rdsv3_ib_stats_inc(s_ib_rx_cq_event);
815 
816 	recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)];
817 
818 	/*
819 	 * Also process recvs in connecting state because it is possible
820 	 * to get a recv completion _before_ the rdmacm ESTABLISHED
821 	 * event is processed.
822 	 */
823 	if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
824 		/* We expect errors as the qp is drained during shutdown */
825 		if (wc->wc_status == IBT_WC_SUCCESS) {
826 			rdsv3_ib_process_recv(conn, recv,
827 			    wc->wc_bytes_xfer, state);
828 		} else {
829 			RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
830 			    "recv completion on "
831 			    "%u.%u.%u.%u had status %u, "
832 			    "disconnecting and reconnecting\n",
833 			    NIPQUAD(conn->c_faddr),
834 			    wc->wc_status);
835 			rdsv3_conn_drop(conn);
836 		}
837 	}
838 
839 	rdsv3_ib_ring_free(recv_ringp, 1);
840 
841 	/*
842 	 * If we ever end up with a really empty receive ring, we're
843 	 * in deep trouble, as the sender will definitely see RNR
844 	 * timeouts.
845 	 */
846 	if (rdsv3_ib_ring_empty(recv_ringp))
847 		rdsv3_ib_stats_inc(s_ib_rx_ring_empty);
848 
849 	if (rdsv3_ib_ring_low(recv_ringp)) {
850 		rdsv3_af_thr_fire(ic->i_refill_rq);
851 	}
852 }
853 
854 int
855 rdsv3_ib_recv(struct rdsv3_connection *conn)
856 {
857 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
858 	int ret = 0;
859 
860 	RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);
861 
862 	if (rdsv3_conn_up(conn))
863 		rdsv3_ib_attempt_ack(ic);
864 
865 	RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);
866 
867 	return (ret);
868 }
869 
870 extern int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags);
871 extern void rdsv3_ib_inc_destructor(void *buf, void *arg);
872 
873 int
874 rdsv3_ib_recv_init(void)
875 {
876 	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");
877 
878 	rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
879 	    sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
880 	    rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0);
881 	if (!rdsv3_ib_incoming_slab) {
882 		RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create "
883 		    "failed");
884 		return (-ENOMEM);
885 	}
886 
887 	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");
888 	return (0);
889 }
890 
891 void
892 rdsv3_ib_recv_exit(void)
893 {
894 	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
895 	kmem_cache_destroy(rdsv3_ib_incoming_slab);
896 	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");
897 }
898