/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib_recv.c
 * Oracle elects to have and use the contents of ib_recv.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/rds.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

static struct kmem_cache *rdsv3_ib_incoming_slab;
static atomic_t rdsv3_ib_allocation = ATOMIC_INIT(0);

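/*
 * One-time setup of the receive ring: point the first (header) SGE of
 * every recv work request at its slot in the registered array of
 * rdsv3_header buffers (ic->i_recv_hdrs).  The data SGE is filled in
 * later, per post, by rdsv3_ib_recv_refill_one().
 */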
void
rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
{
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_header *hdrp;
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);

	hdrp = ic->i_recv_hdrs;
	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
		recv->r_ibinc = NULL;
		recv->r_frag = NULL;

		/* initialize the hdr sgl permanently */
		recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
		recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
		recv->r_sge[0].ds_key = ic->i_mr->lkey;
	}
}

static void
rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
    struct rdsv3_ib_recv_work *recv)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
	    ic, recv);

	if (recv->r_ibinc) {
		rdsv3_inc_put(&recv->r_ibinc->ii_inc);
		recv->r_ibinc = NULL;
	}

	if (recv->r_frag) {
		kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag);
		recv->r_frag = NULL;
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
	    ic, recv);
}

void
rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
{
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);

	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
}

extern int atomic_add_unless(atomic_t *, uint_t, ulong_t);

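/*
 * Make one recv work request ready to be posted: allocate the incoming
 * message struct (bounded by ic->i_max_recv_alloc via rdsv3_ib_allocation)
 * and a data fragment from the per-device frag slab, then point the data
 * SGE at the fragment.  Returns 0 on success or -ENOMEM, undoing the
 * allocation count on failure.
 */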
static int
rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	ibt_mi_hdl_t mi_hdl;
	ibt_iov_attr_t iov_attr;
	ibt_iov_t iov_arr[1];

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
	    conn, recv);

	if (!recv->r_ibinc) {
		if (!atomic_add_unless(&rdsv3_ib_allocation, 1,
		    ic->i_max_recv_alloc)) {
			rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
			goto out;
		}
		recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
		    KM_NOSLEEP);
		if (recv->r_ibinc == NULL) {
			atomic_dec_32(&rdsv3_ib_allocation);
			goto out;
		}
		rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
		recv->r_ibinc->ii_ibdev = ic->rds_ibdev;
		recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool;
	}

	if (!recv->r_frag) {
		recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab,
		    KM_NOSLEEP);
		if (!recv->r_frag)
			goto out;
	}

	/* Data sge, structure copy */
	recv->r_sge[1] = recv->r_frag->f_sge;

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "Return: conn: %p, recv: %p",
	    conn, recv);

	return (0);
out:
	if (recv->r_ibinc) {
		kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc);
		atomic_dec_32(&rdsv3_ib_allocation);
		recv->r_ibinc = NULL;
	}
	return (-ENOMEM);
}

/*
 * This tries to allocate and post unused work requests after making sure that
 * they have all the allocations they need to queue received fragments into
 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
 * pairs don't go unmatched.
 *
 * A nonzero value is returned if ring allocation or posting fails due to
 * temporary resource exhaustion.
 */
int
rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_recv_work *recv;
	unsigned int posted = 0;
	int ret = 0, avail;
	uint32_t pos, i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
	    conn, prefill);

	if (prefill || rdsv3_conn_up(conn)) {
		uint_t w_nr = ic->i_recv_ring.w_nr;

		avail = rdsv3_ib_ring_alloc(&ic->i_recv_ring, w_nr, &pos);
		if ((avail <= 0) || (pos >= w_nr)) {
			RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
			    "Argh - ring alloc returned pos=%u, avail: %d",
			    pos, avail);
			return (-EINVAL);
		}

		/* populate the WRs */
		for (i = 0; i < avail; i++) {
			recv = &ic->i_recvs[pos];
			ret = rdsv3_ib_recv_refill_one(conn, recv);
			if (ret) {
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    avail - i);
				break;
			}
			ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos;
			ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE;
			ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0];

			pos = (pos + 1) % w_nr;
		}

		if (i) {
			/* post the WRs at one shot */
			ret = ibt_post_recv(ib_get_ibt_channel_hdl(ic->i_cm_id),
			    &ic->i_recv_wrs[0], i, &posted);
			RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
			    "attempted: %d posted: %d WRs ret %d",
			    i, posted, ret);
			if (ret) {
				RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
				    "ibt_post_recv failed for %u.%u.%u.%u "
				    "ret %d, disconnecting and reconnecting\n",
				    NIPQUAD(conn->c_faddr), ret);
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    i - posted);
				rdsv3_conn_drop(conn);
			}
		}
	}

	/* We're doing flow control - update the window. */
	if (ic->i_flowctl && posted)
		rdsv3_ib_advertise_credits(conn, posted);

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
	    conn, posted);
	return (ret);
}

/*
 * Pool of incomings whose freeing is deferred; see rdsv3_ib_inc_free()
 * and rdsv3_ib_drain_inclist().
 */
struct rdsv3_inc_pool {
	list_t f_list;		/* list of freed incomings */
	kmutex_t f_lock;	/* protects f_list and f_listcnt */
	int32_t f_listcnt;
};

void
rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool;

	if (pool) {
		list_destroy(&pool->f_list);
		kmem_free((void *) pool, sizeof (*pool));
	}
}

int
rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool;

	pool = (struct rdsv3_inc_pool *)kmem_zalloc(sizeof (*pool), KM_NOSLEEP);
	if (pool == NULL) {
		return (-ENOMEM);
	}
	list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming),
	    offsetof(struct rdsv3_ib_incoming, ii_obj));
	mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
	rds_ibdev->inc_pool = pool;
	return (0);
}

static void
rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_page_frag *frag;
	struct rdsv3_page_frag *pos;

	RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
		list_remove_node(&frag->f_item);
		kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag);
	}

	ASSERT(list_is_empty(&ibinc->ii_frags));
	kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
	atomic_dec_uint(&rdsv3_ib_allocation);
}

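/*
 * Reclaim everything queued on a pool's deferred-free list: incomings are
 * pushed onto the list by rdsv3_ib_inc_free() and actually released here
 * via rdsv3_ib_inc_drop().  This is intended to run asynchronously from
 * af thread context (see the rdsv3_af_thr_fire() call in
 * rdsv3_ib_inc_free()).
 */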
void
rdsv3_ib_drain_inclist(void *data)
{
	struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data;
	struct rdsv3_ib_incoming *ibinc;
	list_t *listp = &pool->f_list;
	kmutex_t *lockp = &pool->f_lock;
	int i = 0;

	for (;;) {
		mutex_enter(lockp);
		ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp);
		if (ibinc)
			pool->f_listcnt--;
		mutex_exit(lockp);
		if (!ibinc)
			break;
		i++;
		rdsv3_ib_inc_drop(ibinc);
	}
}

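/*
 * Free callback for an incoming message.  Rather than tearing the message
 * down inline, queue it on its pool's deferred-free list and poke the
 * af thread saved from ii_ibdev so the free happens asynchronously.  Note
 * that af_thr is read before the list insertion because the ibinc may be
 * drained (and freed) as soon as it is visible on the list.
 */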
void
rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
{
	struct rdsv3_ib_incoming *ibinc;
	rdsv3_af_thr_t *af_thr;

	RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);

	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
	/* save af_thr in a local as ib_inc might be freed at mutex_exit */
	af_thr = ibinc->ii_ibdev->inc_soft_cq;

	mutex_enter(&ibinc->ii_pool->f_lock);
	list_insert_tail(&ibinc->ii_pool->f_list, ibinc);
	ibinc->ii_pool->f_listcnt++;
	mutex_exit(&ibinc->ii_pool->f_lock);

	rdsv3_af_thr_fire(af_thr);
}

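/*
 * Copy up to "size" bytes of an incoming message into the user buffer
 * described by uiop.  Walk the fragment list, uiomove()ing from each
 * RDSV3_FRAG_SIZE fragment in turn, stopping at the message length taken
 * from the RDS header.  Returns the number of bytes copied.
 */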
int
rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
    size_t size)
{
	struct rdsv3_ib_incoming *ibinc;
	struct rdsv3_page_frag *frag;
	unsigned long to_copy;
	unsigned long frag_off = 0;
	int copied = 0;
	int ret;
	uint32_t len;

	ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc);
	frag = list_head(&ibinc->ii_frags);
	len = ntohl(inc->i_hdr.h_len);

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user", "inc: %p, size: %d len: %d",
	    inc, size, len);

	while (copied < size && copied < len) {
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}

		to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
		to_copy = min(size - copied, to_copy);

		RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
		    "%lu bytes to user %p from frag [%p, %u] + %lu",
		    to_copy, uiop,
		    frag->f_page, frag->f_offset, frag_off);

		ret = uiomove((caddr_t)(frag->f_page +
		    frag->f_offset + frag_off),
		    to_copy, UIO_READ, uiop);
		if (ret) {
			RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
			    "uiomove (%d) returned: %d", to_copy, ret);
			break;
		}

		frag_off += to_copy;
		copied += to_copy;
	}

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
	    "Return: inc: %p, copied: %d", inc, copied);

	return (copied);
}


/* ic starts out kmem_zalloc()ed */
void
rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
{
	ibt_send_wr_t *wr = &ic->i_ack_wr;
	ibt_wr_ds_t *sge = &ic->i_ack_sge;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);

	sge->ds_va = ic->i_ack_dma;
	sge->ds_len = sizeof (struct rdsv3_header);
	sge->ds_key = ic->i_mr->lkey;

	wr->wr_sgl = sge;
	wr->wr_nds = 1;
	wr->wr_opcode = IBT_WRC_SEND;
	wr->wr_id = RDSV3_IB_ACK_WR_ID;
	wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
}

/*
 * You'd think that with reliable IB connections you wouldn't need to ack
 * messages that have been received. The problem is that IB hardware generates
 * an ack message before it has DMAed the message into memory. This creates a
 * potential message loss if the HCA is disabled for any reason between when it
 * sends the ack and before the message is DMAed and processed. This is only a
 * potential issue if another HCA is available for fail-over.
 *
 * When the remote host receives our ack they'll free the sent message from
 * their send queue. To decrease the latency of this we always send an ack
 * immediately after we've received messages.
 *
 * For simplicity, we only have one ack in flight at a time. This puts
 * pressure on senders to have deep enough send queues to absorb the latency of
 * a single ack frame being in flight. This might not be good enough.
 *
 * This is implemented by having a long-lived send_wr and sge which point to a
 * statically allocated ack frame. This ack wr does not fall under the ring
 * accounting that the tx and rx wrs do. The QP attribute specifically makes
 * room for it beyond the ring size. Send completion notices its special
 * wr_id and avoids working with the ring in that case.
 */
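/*
 * Record the highest sequence number that should be acked; the actual
 * send is deferred to rdsv3_ib_attempt_ack().
 */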
void
rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
    int ack_required)
{
	RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
	    ic, seq, ack_required);

	mutex_enter(&ic->i_ack_lock);
	ic->i_ack_next = seq;
	if (ack_required)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	mutex_exit(&ic->i_ack_lock);
}

static uint64_t
rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
{
	uint64_t seq;

	RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

	mutex_enter(&ic->i_ack_lock);
	seq = ic->i_ack_next;
	mutex_exit(&ic->i_ack_lock);

	return (seq);
}

static void
rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
{
	struct rdsv3_header *hdr = ic->i_ack;
	uint64_t seq;
	int ret;

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
	    ic, adv_credits);

	seq = rdsv3_ib_get_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
	    ic, (unsigned long long) seq);
	rdsv3_message_populate_header(hdr, 0, 0, 0);
	hdr->h_ack = htonll(seq);
	hdr->h_credit = adv_credits;
	rdsv3_message_make_checksum(hdr);
	ic->i_ack_queued = jiffies;

	ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr, 1,
	    NULL);
	if (ret) {
		/*
		 * Failed to send. Release the WR, and
		 * force another ACK.
		 */
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
		rdsv3_ib_stats_inc(s_ib_ack_send_failure);
		RDSV3_DPRINTF2("rdsv3_ib_send_ack", "sending ack failed\n");
		rdsv3_conn_drop(ic->conn);
	} else {
		rdsv3_ib_stats_inc(s_ib_ack_sent);
	}
	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
	    ic, adv_credits);
}

/*
 * There are 3 ways of getting acknowledgements to the peer:
 *  1. We call rdsv3_ib_attempt_ack from the recv completion handler
 *     to send an ACK-only frame.
 *     However, there can be only one such frame in the send queue
 *     at any time, so we may have to postpone it.
 *  2. When another (data) packet is transmitted while there's
 *     an ACK in the queue, we piggyback the ACK sequence number
 *     on the data packet.
 *  3. If the ACK WR is done sending, we get called from the
 *     send queue completion handler, and check whether there's
 *     another ACK pending (postponed because the WR was on the
 *     queue). If so, we transmit it.
 *
 * We maintain 2 variables:
 *  - i_ack_flags, which keeps track of whether the ACK WR
 *    is currently in the send queue or not (IB_ACK_IN_FLIGHT)
 *  - i_ack_next, which is the last sequence number we received
 *
 * Potentially, send queue and receive queue handlers can run concurrently.
 * It would be nice to not have to use a spinlock to synchronize things,
 * but the one problem that rules this out is that 64bit updates are
 * not atomic on all platforms. Things would be a lot simpler if
 * we had atomic64 or maybe cmpxchg64 everywhere.
 *
 * Reconnecting complicates this picture just slightly. When we
 * reconnect, we may be seeing duplicate packets. The peer
 * is retransmitting them, because it hasn't seen an ACK for
 * them. It is important that we ACK these.
 *
 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
 * this flag set *MUST* be acknowledged immediately.
 */

/*
 * When we get here, we're called from the recv queue handler.
 * Check whether we ought to transmit an ACK.
 */
void
rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
{
	unsigned int adv_credits;

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);

	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
		return;

	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
		return;
	}

	/* Can we get a send credit? */
	if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
		rdsv3_ib_stats_inc(s_ib_tx_throttle);
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		return;
	}

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	rdsv3_ib_send_ack(ic, adv_credits);

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
}

/*
 * We get here from the send completion handler, when the
 * adapter tells us the ACK frame was sent.
 */
void
rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	rdsv3_ib_attempt_ack(ic);
}

/*
 * This is called by the regular xmit code when it wants to piggyback
 * an ACK on an outgoing frame.
 */
uint64_t
rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
	}
	return (rdsv3_ib_get_ack(ic));
}

/*
 * It's kind of lame that we're copying from the posted receive pages into
 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
 * them. But receiving new congestion bitmaps should be a *rare* event, so
 * hopefully we won't need to invest that complexity in making it more
 * efficient. By copying we can share a simpler core with TCP which has to
 * copy.
 */
static void
rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_cong_map *map;
	unsigned int map_off;
	unsigned int map_page;
	struct rdsv3_page_frag *frag;
	unsigned long frag_off;
	unsigned long to_copy;
	unsigned long copied;
	uint64_t uncongested = 0;
	caddr_t addr;

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
	    conn, ibinc);

	/* catch completely corrupt packets */
	if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
		return;

	map = conn->c_fcong;
	map_page = 0;
	map_off = 0;

	frag = list_head(&ibinc->ii_frags);
	frag_off = 0;

	copied = 0;

	while (copied < RDSV3_CONG_MAP_BYTES) {
		uint64_t *src, *dst;
		unsigned int k;

		to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
		ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */

		addr = frag->f_page + frag->f_offset;

		src = (uint64_t *)(addr + frag_off);
		dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);
		for (k = 0; k < to_copy; k += 8) {
			/*
			 * Record ports that became uncongested, ie
			 * bits that changed from 0 to 1.
			 */
			uncongested |= ~(*src) & *dst;
			*dst++ = *src++;
		}

		copied += to_copy;
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %d", src, dst, copied);

		map_off += to_copy;
		if (map_off == PAGE_SIZE) {
			map_off = 0;
			map_page++;
		}

		frag_off += to_copy;
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}
	}

#if 0
XXX
	/* the congestion map is in little endian order */
	uncongested = le64_to_cpu(uncongested);
#endif

	rdsv3_cong_map_updated(map, uncongested);

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
	    conn, ibinc);
}

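/*
 * Handle one received fragment: validate the RDS header and checksum,
 * latch the piggybacked ACK and credit update into the connection state,
 * then either start a new incoming message or append the fragment to the
 * one in progress.  When the final fragment arrives the message is handed
 * up via rdsv3_recv_incoming() (or rdsv3_ib_cong_recv() for congestion
 * bitmaps) and the ack state for the completion handler is updated.
 */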
static void
rdsv3_ib_process_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv, uint32_t data_len,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
	struct rdsv3_header *ihdr, *hdr;

	/* XXX shut down the connection if port 0,0 are seen? */

	RDSV3_DPRINTF5("rdsv3_ib_process_recv",
	    "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);

	if (data_len < sizeof (struct rdsv3_header)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv",
		    "incoming message from %u.%u.%u.%u didn't include a "
		    "header, disconnecting and reconnecting",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		return;
	}
	data_len -= sizeof (struct rdsv3_header);

	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];

	/* Validate the checksum. */
	if (!rdsv3_message_verify_checksum(ihdr)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
		    "from %u.%u.%u.%u has corrupted header - "
		    "forcing a reconnect",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		rdsv3_stats_inc(s_recv_drop_bad_checksum);
		return;
	}

	/* Process the ACK sequence which comes with every packet */
	state->ack_recv = ntohll(ihdr->h_ack);
	state->ack_recv_valid = 1;

	/* Process the credits update if there was one */
	if (ihdr->h_credit)
		rdsv3_ib_send_add_credits(conn, ihdr->h_credit);

	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
		/*
		 * This is an ACK-only packet. The fact that it gets
		 * special treatment here is that historically, ACKs
		 * were rather special beasts.
		 */
		rdsv3_ib_stats_inc(s_ib_ack_received);
		return;
	}

	/*
	 * If we don't already have an inc on the connection then this
	 * fragment has a header and starts a message.  Copy its header
	 * into the inc and save the inc so we can hang upcoming fragments
	 * off its list.
	 */
	if (!ibinc) {
		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;

		hdr = &ibinc->ii_inc.i_hdr;
		(void) memcpy(hdr, ihdr, sizeof (*hdr));
		ic->i_recv_data_rem = ntohl(hdr->h_len);

		RDSV3_DPRINTF5("rdsv3_ib_process_recv",
		    "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
		    ic->i_recv_data_rem, hdr->h_flags);
	} else {
		hdr = &ibinc->ii_inc.i_hdr;
		/*
		 * We can't just use memcmp here; fragments of a
		 * single message may carry different ACKs
		 */
		if (hdr->h_sequence != ihdr->h_sequence ||
		    hdr->h_len != ihdr->h_len ||
		    hdr->h_sport != ihdr->h_sport ||
		    hdr->h_dport != ihdr->h_dport) {
			RDSV3_DPRINTF2("rdsv3_ib_process_recv",
			    "fragment header mismatch; forcing reconnect");
			rdsv3_conn_drop(conn);
			return;
		}
	}

	list_insert_tail(&ibinc->ii_frags, recv->r_frag);
	recv->r_frag = NULL;

	if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
		ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
	else {
		ic->i_recv_data_rem = 0;
		ic->i_ibinc = NULL;

		if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
			rdsv3_ib_cong_recv(conn, ibinc);
		else {
			rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
			    &ibinc->ii_inc, KM_NOSLEEP);
			state->ack_next = ntohll(hdr->h_sequence);
			state->ack_next_valid = 1;
		}

		/*
		 * Evaluate the ACK_REQUIRED flag *after* we received
		 * the complete frame, and after bumping the next_rx
		 * sequence.
		 */
		if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
			rdsv3_stats_inc(s_recv_ack_required);
			state->ack_required = 1;
		}

		rdsv3_inc_put(&ibinc->ii_inc);
	}

	RDSV3_DPRINTF4("rdsv3_ib_process_recv",
	    "Return: conn: %p recv: %p len: %d state: %p",
	    conn, recv, data_len, state);
}

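/*
 * Handle a single receive work completion.  Look up the oldest outstanding
 * recv WR, process it if the connection is up (or still connecting, since
 * a completion can arrive before the rdmacm ESTABLISHED event), free its
 * ring slot, and fire the refill thread when the ring is running low.
 */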
void
rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring;

	RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
	    "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
	    (unsigned long long)wc->wc_id, wc->wc_status,
	    wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));

	rdsv3_ib_stats_inc(s_ib_rx_cq_event);

	recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)];

	/*
	 * Also process recvs in connecting state because it is possible
	 * to get a recv completion _before_ the rdmacm ESTABLISHED
	 * event is processed.
	 */
	if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
		/* We expect errors as the qp is drained during shutdown */
		if (wc->wc_status == IBT_WC_SUCCESS) {
			rdsv3_ib_process_recv(conn, recv,
			    wc->wc_bytes_xfer, state);
		} else {
			RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
			    "recv completion on "
			    "%u.%u.%u.%u had status %u, "
			    "disconnecting and reconnecting\n",
			    NIPQUAD(conn->c_faddr),
			    wc->wc_status);
			rdsv3_conn_drop(conn);
		}
	}

	rdsv3_ib_ring_free(recv_ringp, 1);

	/*
	 * If we ever end up with a really empty receive ring, we're
	 * in deep trouble, as the sender will definitely see RNR
	 * timeouts.
	 */
	if (rdsv3_ib_ring_empty(recv_ringp))
		rdsv3_ib_stats_inc(s_ib_rx_ring_empty);

	if (rdsv3_ib_ring_low(recv_ringp)) {
		rdsv3_af_thr_fire(ic->i_refill_rq);
	}
}

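/*
 * Receive-side progress call for a connection.  All the real work happens
 * in the completion path above, so this just sends any pending ACK when
 * the connection is up.
 */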
int
rdsv3_ib_recv(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int ret = 0;

	RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);

	if (rdsv3_conn_up(conn))
		rdsv3_ib_attempt_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);

	return (ret);
}

extern int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_inc_destructor(void *buf, void *arg);

int
rdsv3_ib_recv_init(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");

	rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
	    sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
	    rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0);
	if (!rdsv3_ib_incoming_slab) {
		RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create "
		    "failed");
		return (-ENOMEM);
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");
	return (0);
}

void
rdsv3_ib_recv_exit(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
	kmem_cache_destroy(rdsv3_ib_incoming_slab);
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");
}