xref: /linux/net/rds/ib_recv.c (revision 9a87ffc99ec8eb8d35eed7c4f816d75f5cc9662e)
11e23b3eeSAndy Grover /*
29b17f588SKa-Cheong Poon  * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
31e23b3eeSAndy Grover  *
41e23b3eeSAndy Grover  * This software is available to you under a choice of one of two
51e23b3eeSAndy Grover  * licenses.  You may choose to be licensed under the terms of the GNU
61e23b3eeSAndy Grover  * General Public License (GPL) Version 2, available from the file
71e23b3eeSAndy Grover  * COPYING in the main directory of this source tree, or the
81e23b3eeSAndy Grover  * OpenIB.org BSD license below:
91e23b3eeSAndy Grover  *
101e23b3eeSAndy Grover  *     Redistribution and use in source and binary forms, with or
111e23b3eeSAndy Grover  *     without modification, are permitted provided that the following
121e23b3eeSAndy Grover  *     conditions are met:
131e23b3eeSAndy Grover  *
141e23b3eeSAndy Grover  *      - Redistributions of source code must retain the above
151e23b3eeSAndy Grover  *        copyright notice, this list of conditions and the following
161e23b3eeSAndy Grover  *        disclaimer.
171e23b3eeSAndy Grover  *
181e23b3eeSAndy Grover  *      - Redistributions in binary form must reproduce the above
191e23b3eeSAndy Grover  *        copyright notice, this list of conditions and the following
201e23b3eeSAndy Grover  *        disclaimer in the documentation and/or other materials
211e23b3eeSAndy Grover  *        provided with the distribution.
221e23b3eeSAndy Grover  *
231e23b3eeSAndy Grover  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
241e23b3eeSAndy Grover  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
251e23b3eeSAndy Grover  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
261e23b3eeSAndy Grover  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
271e23b3eeSAndy Grover  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
281e23b3eeSAndy Grover  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
291e23b3eeSAndy Grover  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
301e23b3eeSAndy Grover  * SOFTWARE.
311e23b3eeSAndy Grover  *
321e23b3eeSAndy Grover  */
331e23b3eeSAndy Grover #include <linux/kernel.h>
34*2870c4d6SJakub Kicinski #include <linux/sched/clock.h>
355a0e3ad6STejun Heo #include <linux/slab.h>
361e23b3eeSAndy Grover #include <linux/pci.h>
371e23b3eeSAndy Grover #include <linux/dma-mapping.h>
381e23b3eeSAndy Grover #include <rdma/rdma_cm.h>
391e23b3eeSAndy Grover 
400cb43965SSowmini Varadhan #include "rds_single_path.h"
411e23b3eeSAndy Grover #include "rds.h"
421e23b3eeSAndy Grover #include "ib.h"
431e23b3eeSAndy Grover 
441e23b3eeSAndy Grover static struct kmem_cache *rds_ib_incoming_slab;
451e23b3eeSAndy Grover static struct kmem_cache *rds_ib_frag_slab;
461e23b3eeSAndy Grover static atomic_t	rds_ib_allocation = ATOMIC_INIT(0);
471e23b3eeSAndy Grover 
481e23b3eeSAndy Grover void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
491e23b3eeSAndy Grover {
501e23b3eeSAndy Grover 	struct rds_ib_recv_work *recv;
511e23b3eeSAndy Grover 	u32 i;
521e23b3eeSAndy Grover 
531e23b3eeSAndy Grover 	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
541e23b3eeSAndy Grover 		struct ib_sge *sge;
551e23b3eeSAndy Grover 
561e23b3eeSAndy Grover 		recv->r_ibinc = NULL;
571e23b3eeSAndy Grover 		recv->r_frag = NULL;
581e23b3eeSAndy Grover 
591e23b3eeSAndy Grover 		recv->r_wr.next = NULL;
601e23b3eeSAndy Grover 		recv->r_wr.wr_id = i;
611e23b3eeSAndy Grover 		recv->r_wr.sg_list = recv->r_sge;
621e23b3eeSAndy Grover 		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
631e23b3eeSAndy Grover 
64919ced4cSAndy Grover 		sge = &recv->r_sge[0];
659b17f588SKa-Cheong Poon 		sge->addr = ic->i_recv_hdrs_dma[i];
661e23b3eeSAndy Grover 		sge->length = sizeof(struct rds_header);
67e5580242SJason Gunthorpe 		sge->lkey = ic->i_pd->local_dma_lkey;
68919ced4cSAndy Grover 
69919ced4cSAndy Grover 		sge = &recv->r_sge[1];
70919ced4cSAndy Grover 		sge->addr = 0;
71919ced4cSAndy Grover 		sge->length = RDS_FRAG_SIZE;
72e5580242SJason Gunthorpe 		sge->lkey = ic->i_pd->local_dma_lkey;
731e23b3eeSAndy Grover 	}
741e23b3eeSAndy Grover }
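
/*
 * To spell out the layout set up above: every receive work request
 * carries two SGEs.  r_sge[0] always points at this recv's slot in the
 * pre-allocated i_recv_hdrs array holding the struct rds_header, and
 * r_sge[1] describes one RDS_FRAG_SIZE data fragment.  Only the header
 * address can be filled in here; the fragment address is plugged in
 * later by rds_ib_recv_refill_one() once a frag has been allocated and
 * DMA mapped.
 */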
751e23b3eeSAndy Grover 
7633244125SChris Mason /*
7733244125SChris Mason  * The entire 'from' list, including the from element itself, is put on
7833244125SChris Mason  * to the tail of the 'to' list.
7933244125SChris Mason  */
8033244125SChris Mason static void list_splice_entire_tail(struct list_head *from,
8133244125SChris Mason 				    struct list_head *to)
8233244125SChris Mason {
8333244125SChris Mason 	struct list_head *from_last = from->prev;
8433244125SChris Mason 
8533244125SChris Mason 	list_splice_tail(from_last, to);
8633244125SChris Mason 	list_add_tail(from_last, to);
8733244125SChris Mason }
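
/*
 * A small worked example: assume 'from' points at element A of the
 * circular list A <-> B <-> C (these lists are anchored by a single
 * element pointer rather than a list_head - see the comment above
 * rds_ib_recv_cache_put() below).  Then:
 *
 *	from_last = A->prev = C
 *	list_splice_tail(C, to)  moves A and B to the tail of 'to'
 *	list_add_tail(C, to)     appends C itself
 *
 * so all three elements, including the one 'from' points at, end up
 * on 'to'.
 */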
8833244125SChris Mason 
8933244125SChris Mason static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
9033244125SChris Mason {
9133244125SChris Mason 	struct list_head *tmp;
9233244125SChris Mason 
9333244125SChris Mason 	tmp = xchg(&cache->xfer, NULL);
9433244125SChris Mason 	if (tmp) {
9533244125SChris Mason 		if (cache->ready)
9633244125SChris Mason 			list_splice_entire_tail(tmp, cache->ready);
9733244125SChris Mason 		else
9833244125SChris Mason 			cache->ready = tmp;
9933244125SChris Mason 	}
10033244125SChris Mason }
10133244125SChris Mason 
102f394ad28SKa-Cheong Poon static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
10333244125SChris Mason {
10433244125SChris Mason 	struct rds_ib_cache_head *head;
10533244125SChris Mason 	int cpu;
10633244125SChris Mason 
107f394ad28SKa-Cheong Poon 	cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
10833244125SChris Mason 	if (!cache->percpu)
10933244125SChris Mason 	       return -ENOMEM;
11033244125SChris Mason 
11133244125SChris Mason 	for_each_possible_cpu(cpu) {
11233244125SChris Mason 		head = per_cpu_ptr(cache->percpu, cpu);
11333244125SChris Mason 		head->first = NULL;
11433244125SChris Mason 		head->count = 0;
11533244125SChris Mason 	}
11633244125SChris Mason 	cache->xfer = NULL;
11733244125SChris Mason 	cache->ready = NULL;
11833244125SChris Mason 
11933244125SChris Mason 	return 0;
12033244125SChris Mason }
12133244125SChris Mason 
122f394ad28SKa-Cheong Poon int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
12333244125SChris Mason {
12433244125SChris Mason 	int ret;
12533244125SChris Mason 
126f394ad28SKa-Cheong Poon 	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
12733244125SChris Mason 	if (!ret) {
128f394ad28SKa-Cheong Poon 		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
12933244125SChris Mason 		if (ret)
13033244125SChris Mason 			free_percpu(ic->i_cache_incs.percpu);
13133244125SChris Mason 	}
13233244125SChris Mason 
13333244125SChris Mason 	return ret;
13433244125SChris Mason }
13533244125SChris Mason 
13633244125SChris Mason static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
13733244125SChris Mason 					  struct list_head *caller_list)
13833244125SChris Mason {
13933244125SChris Mason 	struct rds_ib_cache_head *head;
14033244125SChris Mason 	int cpu;
14133244125SChris Mason 
14233244125SChris Mason 	for_each_possible_cpu(cpu) {
14333244125SChris Mason 		head = per_cpu_ptr(cache->percpu, cpu);
14433244125SChris Mason 		if (head->first) {
14533244125SChris Mason 			list_splice_entire_tail(head->first, caller_list);
14633244125SChris Mason 			head->first = NULL;
14733244125SChris Mason 		}
14833244125SChris Mason 	}
14933244125SChris Mason 
15033244125SChris Mason 	if (cache->ready) {
15133244125SChris Mason 		list_splice_entire_tail(cache->ready, caller_list);
15233244125SChris Mason 		cache->ready = NULL;
15333244125SChris Mason 	}
15433244125SChris Mason }
15533244125SChris Mason 
15633244125SChris Mason void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
15733244125SChris Mason {
15833244125SChris Mason 	struct rds_ib_incoming *inc;
15933244125SChris Mason 	struct rds_ib_incoming *inc_tmp;
16033244125SChris Mason 	struct rds_page_frag *frag;
16133244125SChris Mason 	struct rds_page_frag *frag_tmp;
16233244125SChris Mason 	LIST_HEAD(list);
16333244125SChris Mason 
16433244125SChris Mason 	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
16533244125SChris Mason 	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
16633244125SChris Mason 	free_percpu(ic->i_cache_incs.percpu);
16733244125SChris Mason 
16833244125SChris Mason 	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
16933244125SChris Mason 		list_del(&inc->ii_cache_entry);
17033244125SChris Mason 		WARN_ON(!list_empty(&inc->ii_frags));
17133244125SChris Mason 		kmem_cache_free(rds_ib_incoming_slab, inc);
172b50e0587SZhu Yanjun 		atomic_dec(&rds_ib_allocation);
17333244125SChris Mason 	}
17433244125SChris Mason 
17533244125SChris Mason 	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
17633244125SChris Mason 	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
17733244125SChris Mason 	free_percpu(ic->i_cache_frags.percpu);
17833244125SChris Mason 
17933244125SChris Mason 	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
18033244125SChris Mason 		list_del(&frag->f_cache_entry);
18133244125SChris Mason 		WARN_ON(!list_empty(&frag->f_item));
18233244125SChris Mason 		kmem_cache_free(rds_ib_frag_slab, frag);
18333244125SChris Mason 	}
18433244125SChris Mason }
18533244125SChris Mason 
18633244125SChris Mason /* fwd decl */
18733244125SChris Mason static void rds_ib_recv_cache_put(struct list_head *new_item,
18833244125SChris Mason 				  struct rds_ib_refill_cache *cache);
18933244125SChris Mason static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
19033244125SChris Mason 
19133244125SChris Mason 
19233244125SChris Mason /* Recycle frag and attached recv buffer f_sg */
19333244125SChris Mason static void rds_ib_frag_free(struct rds_ib_connection *ic,
19433244125SChris Mason 			     struct rds_page_frag *frag)
19533244125SChris Mason {
19633244125SChris Mason 	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
19733244125SChris Mason 
19833244125SChris Mason 	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
19909b2b8f5SSantosh Shilimkar 	atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
20009b2b8f5SSantosh Shilimkar 	rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
20133244125SChris Mason }
20233244125SChris Mason 
20333244125SChris Mason /* Recycle inc after freeing attached frags */
20433244125SChris Mason void rds_ib_inc_free(struct rds_incoming *inc)
20533244125SChris Mason {
20633244125SChris Mason 	struct rds_ib_incoming *ibinc;
20733244125SChris Mason 	struct rds_page_frag *frag;
20833244125SChris Mason 	struct rds_page_frag *pos;
20933244125SChris Mason 	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
21033244125SChris Mason 
21133244125SChris Mason 	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
21233244125SChris Mason 
21333244125SChris Mason 	/* Free attached frags */
21433244125SChris Mason 	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
21533244125SChris Mason 		list_del_init(&frag->f_item);
21633244125SChris Mason 		rds_ib_frag_free(ic, frag);
21733244125SChris Mason 	}
21833244125SChris Mason 	BUG_ON(!list_empty(&ibinc->ii_frags));
21933244125SChris Mason 
22033244125SChris Mason 	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
22133244125SChris Mason 	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
22233244125SChris Mason }
22333244125SChris Mason 
2241e23b3eeSAndy Grover static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
2251e23b3eeSAndy Grover 				  struct rds_ib_recv_work *recv)
2261e23b3eeSAndy Grover {
2271e23b3eeSAndy Grover 	if (recv->r_ibinc) {
2281e23b3eeSAndy Grover 		rds_inc_put(&recv->r_ibinc->ii_inc);
2291e23b3eeSAndy Grover 		recv->r_ibinc = NULL;
2301e23b3eeSAndy Grover 	}
2311e23b3eeSAndy Grover 	if (recv->r_frag) {
232fc24f780SAndy Grover 		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
23333244125SChris Mason 		rds_ib_frag_free(ic, recv->r_frag);
2341e23b3eeSAndy Grover 		recv->r_frag = NULL;
2351e23b3eeSAndy Grover 	}
2361e23b3eeSAndy Grover }
2371e23b3eeSAndy Grover 
2381e23b3eeSAndy Grover void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
2391e23b3eeSAndy Grover {
2401e23b3eeSAndy Grover 	u32 i;
2411e23b3eeSAndy Grover 
2421e23b3eeSAndy Grover 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
2431e23b3eeSAndy Grover 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
2441e23b3eeSAndy Grover }
2451e23b3eeSAndy Grover 
246037f18a3SChris Mason static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
247037f18a3SChris Mason 						     gfp_t slab_mask)
24833244125SChris Mason {
24933244125SChris Mason 	struct rds_ib_incoming *ibinc;
25033244125SChris Mason 	struct list_head *cache_item;
25133244125SChris Mason 	int avail_allocs;
25233244125SChris Mason 
25333244125SChris Mason 	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
25433244125SChris Mason 	if (cache_item) {
25533244125SChris Mason 		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
25633244125SChris Mason 	} else {
25733244125SChris Mason 		avail_allocs = atomic_add_unless(&rds_ib_allocation,
25833244125SChris Mason 						 1, rds_ib_sysctl_max_recv_allocation);
25933244125SChris Mason 		if (!avail_allocs) {
26033244125SChris Mason 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
26133244125SChris Mason 			return NULL;
26233244125SChris Mason 		}
263037f18a3SChris Mason 		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
26433244125SChris Mason 		if (!ibinc) {
26533244125SChris Mason 			atomic_dec(&rds_ib_allocation);
26633244125SChris Mason 			return NULL;
26733244125SChris Mason 		}
26809b2b8f5SSantosh Shilimkar 		rds_ib_stats_inc(s_ib_rx_total_incs);
26933244125SChris Mason 	}
27033244125SChris Mason 	INIT_LIST_HEAD(&ibinc->ii_frags);
271eee2fa6aSKa-Cheong Poon 	rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
27233244125SChris Mason 
27333244125SChris Mason 	return ibinc;
27433244125SChris Mason }
27533244125SChris Mason 
276037f18a3SChris Mason static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
277037f18a3SChris Mason 						    gfp_t slab_mask, gfp_t page_mask)
27833244125SChris Mason {
27933244125SChris Mason 	struct rds_page_frag *frag;
28033244125SChris Mason 	struct list_head *cache_item;
28133244125SChris Mason 	int ret;
28233244125SChris Mason 
28333244125SChris Mason 	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
28433244125SChris Mason 	if (cache_item) {
28533244125SChris Mason 		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
28609b2b8f5SSantosh Shilimkar 		atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
28709b2b8f5SSantosh Shilimkar 		rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
28833244125SChris Mason 	} else {
289037f18a3SChris Mason 		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
29033244125SChris Mason 		if (!frag)
29133244125SChris Mason 			return NULL;
29233244125SChris Mason 
293b4e1da3cSChris Mason 		sg_init_table(&frag->f_sg, 1);
29433244125SChris Mason 		ret = rds_page_remainder_alloc(&frag->f_sg,
295037f18a3SChris Mason 					       RDS_FRAG_SIZE, page_mask);
29633244125SChris Mason 		if (ret) {
29733244125SChris Mason 			kmem_cache_free(rds_ib_frag_slab, frag);
29833244125SChris Mason 			return NULL;
29933244125SChris Mason 		}
30009b2b8f5SSantosh Shilimkar 		rds_ib_stats_inc(s_ib_rx_total_frags);
30133244125SChris Mason 	}
30233244125SChris Mason 
30333244125SChris Mason 	INIT_LIST_HEAD(&frag->f_item);
30433244125SChris Mason 
30533244125SChris Mason 	return frag;
30633244125SChris Mason }
30733244125SChris Mason 
3081e23b3eeSAndy Grover static int rds_ib_recv_refill_one(struct rds_connection *conn,
30973ce4317Ssantosh.shilimkar@oracle.com 				  struct rds_ib_recv_work *recv, gfp_t gfp)
3101e23b3eeSAndy Grover {
3111e23b3eeSAndy Grover 	struct rds_ib_connection *ic = conn->c_transport_data;
3121e23b3eeSAndy Grover 	struct ib_sge *sge;
3131e23b3eeSAndy Grover 	int ret = -ENOMEM;
3149f0bb95eSManjunath Patil 	gfp_t slab_mask = gfp;
3159f0bb95eSManjunath Patil 	gfp_t page_mask = gfp;
316037f18a3SChris Mason 
317d0164adcSMel Gorman 	if (gfp & __GFP_DIRECT_RECLAIM) {
318037f18a3SChris Mason 		slab_mask = GFP_KERNEL;
319037f18a3SChris Mason 		page_mask = GFP_HIGHUSER;
320037f18a3SChris Mason 	}
3211e23b3eeSAndy Grover 
32233244125SChris Mason 	if (!ic->i_cache_incs.ready)
32333244125SChris Mason 		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
32433244125SChris Mason 	if (!ic->i_cache_frags.ready)
32533244125SChris Mason 		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
32633244125SChris Mason 
3273427e854SAndy Grover 	/*
3283427e854SAndy Grover 	 * ibinc was taken from recv if recv contained the start of a message.
3293427e854SAndy Grover 	 * recvs that were continuations will still have this allocated.
3303427e854SAndy Grover 	 */
3318690bfa1SAndy Grover 	if (!recv->r_ibinc) {
332037f18a3SChris Mason 		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
33333244125SChris Mason 		if (!recv->r_ibinc)
3341e23b3eeSAndy Grover 			goto out;
3351e23b3eeSAndy Grover 	}
3361e23b3eeSAndy Grover 
3373427e854SAndy Grover 	WARN_ON(recv->r_frag); /* leak! */
338037f18a3SChris Mason 	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
3398690bfa1SAndy Grover 	if (!recv->r_frag)
3401e23b3eeSAndy Grover 		goto out;
3411e23b3eeSAndy Grover 
3420b088e00SAndy Grover 	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
3430b088e00SAndy Grover 			    1, DMA_FROM_DEVICE);
3440b088e00SAndy Grover 	WARN_ON(ret != 1);
3451e23b3eeSAndy Grover 
346919ced4cSAndy Grover 	sge = &recv->r_sge[0];
3479b17f588SKa-Cheong Poon 	sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
3481e23b3eeSAndy Grover 	sge->length = sizeof(struct rds_header);
3491e23b3eeSAndy Grover 
350919ced4cSAndy Grover 	sge = &recv->r_sge[1];
351a163afc8SBart Van Assche 	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
352a163afc8SBart Van Assche 	sge->length = sg_dma_len(&recv->r_frag->f_sg);
3531e23b3eeSAndy Grover 
3541e23b3eeSAndy Grover 	ret = 0;
3551e23b3eeSAndy Grover out:
3561e23b3eeSAndy Grover 	return ret;
3571e23b3eeSAndy Grover }
3581e23b3eeSAndy Grover 
35973ce4317Ssantosh.shilimkar@oracle.com static int acquire_refill(struct rds_connection *conn)
36073ce4317Ssantosh.shilimkar@oracle.com {
36173ce4317Ssantosh.shilimkar@oracle.com 	return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0;
36273ce4317Ssantosh.shilimkar@oracle.com }
36373ce4317Ssantosh.shilimkar@oracle.com 
36473ce4317Ssantosh.shilimkar@oracle.com static void release_refill(struct rds_connection *conn)
36573ce4317Ssantosh.shilimkar@oracle.com {
36673ce4317Ssantosh.shilimkar@oracle.com 	clear_bit(RDS_RECV_REFILL, &conn->c_flags);
3679f414eb4SMikulas Patocka 	smp_mb__after_atomic();
36873ce4317Ssantosh.shilimkar@oracle.com 
36973ce4317Ssantosh.shilimkar@oracle.com 	/* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
37073ce4317Ssantosh.shilimkar@oracle.com 	 * hot path and finding waiters is very rare.  We don't want to walk
37173ce4317Ssantosh.shilimkar@oracle.com 	 * the system-wide hashed waitqueue buckets in the fast path only to
37273ce4317Ssantosh.shilimkar@oracle.com 	 * almost never find waiters.
37373ce4317Ssantosh.shilimkar@oracle.com 	 */
37473ce4317Ssantosh.shilimkar@oracle.com 	if (waitqueue_active(&conn->c_waitq))
37573ce4317Ssantosh.shilimkar@oracle.com 		wake_up_all(&conn->c_waitq);
37673ce4317Ssantosh.shilimkar@oracle.com }
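
/*
 * A minimal sketch of how these two helpers are meant to be paired
 * (rds_ib_recv_refill() below is the caller that does this):
 *
 *	if (!acquire_refill(conn))
 *		return;            someone else is already refilling
 *	... post as many receive buffers as we can ...
 *	release_refill(conn);
 *
 * RDS_RECV_REFILL therefore acts as a simple trylock around the refill
 * path, and the wake_up_all() above lets anyone sleeping on
 * conn->c_waitq notice that the current refill pass has finished.
 */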
37773ce4317Ssantosh.shilimkar@oracle.com 
3781e23b3eeSAndy Grover /*
3791e23b3eeSAndy Grover  * This tries to allocate and post unused work requests after making sure that
3801e23b3eeSAndy Grover  * they have all the allocations they need to queue received fragments into
38133244125SChris Mason  * sockets.
3821e23b3eeSAndy Grover  */
38373ce4317Ssantosh.shilimkar@oracle.com void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
3841e23b3eeSAndy Grover {
3851e23b3eeSAndy Grover 	struct rds_ib_connection *ic = conn->c_transport_data;
3861e23b3eeSAndy Grover 	struct rds_ib_recv_work *recv;
3871e23b3eeSAndy Grover 	unsigned int posted = 0;
3881e23b3eeSAndy Grover 	int ret = 0;
389d0164adcSMel Gorman 	bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
39065dedd7fSChris Mason 	bool must_wake = false;
3911e23b3eeSAndy Grover 	u32 pos;
3921e23b3eeSAndy Grover 
39373ce4317Ssantosh.shilimkar@oracle.com 	/* the goal here is to just make sure that someone, somewhere
39473ce4317Ssantosh.shilimkar@oracle.com 	 * is posting buffers.  If we can't get the refill lock,
39573ce4317Ssantosh.shilimkar@oracle.com 	 * let them do their thing
39673ce4317Ssantosh.shilimkar@oracle.com 	 */
39773ce4317Ssantosh.shilimkar@oracle.com 	if (!acquire_refill(conn))
39873ce4317Ssantosh.shilimkar@oracle.com 		return;
39973ce4317Ssantosh.shilimkar@oracle.com 
400f64f9e71SJoe Perches 	while ((prefill || rds_conn_up(conn)) &&
401f64f9e71SJoe Perches 	       rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
4021e23b3eeSAndy Grover 		if (pos >= ic->i_recv_ring.w_nr) {
4031e23b3eeSAndy Grover 			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
4041e23b3eeSAndy Grover 					pos);
4051e23b3eeSAndy Grover 			break;
4061e23b3eeSAndy Grover 		}
4071e23b3eeSAndy Grover 
4081e23b3eeSAndy Grover 		recv = &ic->i_recvs[pos];
40973ce4317Ssantosh.shilimkar@oracle.com 		ret = rds_ib_recv_refill_one(conn, recv, gfp);
4101e23b3eeSAndy Grover 		if (ret) {
41165dedd7fSChris Mason 			must_wake = true;
4121e23b3eeSAndy Grover 			break;
4131e23b3eeSAndy Grover 		}
4141e23b3eeSAndy Grover 
4151cb483a5SHåkon Bugge 		rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv,
4160b088e00SAndy Grover 			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
417a163afc8SBart Van Assche 			 (long)sg_dma_address(&recv->r_frag->f_sg));
4181cb483a5SHåkon Bugge 
4191cb483a5SHåkon Bugge 		/* XXX when can this fail? */
420f112d53bSBart Van Assche 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
4211e23b3eeSAndy Grover 		if (ret) {
4221e23b3eeSAndy Grover 			rds_ib_conn_error(conn, "recv post on "
423eee2fa6aSKa-Cheong Poon 			       "%pI6c returned %d, disconnecting and "
4241e23b3eeSAndy Grover 			       "reconnecting\n", &conn->c_faddr,
4251e23b3eeSAndy Grover 			       ret);
4261e23b3eeSAndy Grover 			break;
4271e23b3eeSAndy Grover 		}
4281e23b3eeSAndy Grover 
4291e23b3eeSAndy Grover 		posted++;
43065dedd7fSChris Mason 
43165dedd7fSChris Mason 		if ((posted > 128 && need_resched()) || posted > 8192) {
43265dedd7fSChris Mason 			must_wake = true;
43365dedd7fSChris Mason 			break;
43465dedd7fSChris Mason 		}
4351e23b3eeSAndy Grover 	}
4361e23b3eeSAndy Grover 
4371e23b3eeSAndy Grover 	/* We're doing flow control - update the window. */
4381e23b3eeSAndy Grover 	if (ic->i_flowctl && posted)
4391e23b3eeSAndy Grover 		rds_ib_advertise_credits(conn, posted);
4401e23b3eeSAndy Grover 
4411e23b3eeSAndy Grover 	if (ret)
4421e23b3eeSAndy Grover 		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
44373ce4317Ssantosh.shilimkar@oracle.com 
44473ce4317Ssantosh.shilimkar@oracle.com 	release_refill(conn);
44573ce4317Ssantosh.shilimkar@oracle.com 
44673ce4317Ssantosh.shilimkar@oracle.com 	/* if we're called from the softirq handler, we'll be GFP_NOWAIT.
44773ce4317Ssantosh.shilimkar@oracle.com 	 * in this case the ring being low is going to lead to more interrupts
44873ce4317Ssantosh.shilimkar@oracle.com 	 * and we can safely let the softirq code take care of it unless the
44973ce4317Ssantosh.shilimkar@oracle.com 	 * ring is completely empty.
45073ce4317Ssantosh.shilimkar@oracle.com 	 *
45173ce4317Ssantosh.shilimkar@oracle.com 	 * if we're called from krdsd, we'll be GFP_KERNEL.  In this case
45273ce4317Ssantosh.shilimkar@oracle.com 	 * we might have raced with the softirq code while we had the refill
45373ce4317Ssantosh.shilimkar@oracle.com 	 * lock held.  Use rds_ib_ring_low() instead of ring_empty to decide
45473ce4317Ssantosh.shilimkar@oracle.com 	 * if we should requeue.
45573ce4317Ssantosh.shilimkar@oracle.com 	 */
45673ce4317Ssantosh.shilimkar@oracle.com 	if (rds_conn_up(conn) &&
45765dedd7fSChris Mason 	    (must_wake ||
45865dedd7fSChris Mason 	    (can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
45973ce4317Ssantosh.shilimkar@oracle.com 	    rds_ib_ring_empty(&ic->i_recv_ring))) {
46073ce4317Ssantosh.shilimkar@oracle.com 		queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
46173ce4317Ssantosh.shilimkar@oracle.com 	}
46265dedd7fSChris Mason 	if (can_wait)
46365dedd7fSChris Mason 		cond_resched();
4641e23b3eeSAndy Grover }
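
/*
 * Both refill call sites in this file follow the split described in
 * the comment above: rds_ib_recv_cqe_handler() refills with
 * GFP_NOWAIT from the completion (softirq) path, and only when the
 * ring is running low, while rds_ib_recv_path(), running from the
 * krdsd worker, refills with GFP_KERNEL and may block and
 * cond_resched().
 */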
4651e23b3eeSAndy Grover 
46633244125SChris Mason /*
46733244125SChris Mason  * We want to recycle several types of recv allocations, like incs and frags.
46833244125SChris Mason  * To use this, the *_free() function passes in the ptr to a list_head within
46933244125SChris Mason  * the recyclee, as well as the cache to put it on.
47033244125SChris Mason  *
47133244125SChris Mason  * First, we put the memory on a percpu list. When this reaches a certain size,
47233244125SChris Mason  * we move it to an intermediate non-percpu list in a lockless manner, with some
47333244125SChris Mason  * xchg/cmpxchg wizardry.
47433244125SChris Mason  *
47533244125SChris Mason  * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
47633244125SChris Mason  * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
47733244125SChris Mason  * list_empty() will return true with one element is actually present.
47833244125SChris Mason  */
47933244125SChris Mason static void rds_ib_recv_cache_put(struct list_head *new_item,
48033244125SChris Mason 				 struct rds_ib_refill_cache *cache)
4811e23b3eeSAndy Grover {
48233244125SChris Mason 	unsigned long flags;
483c196403bSGerald Schaefer 	struct list_head *old, *chpfirst;
4841e23b3eeSAndy Grover 
48533244125SChris Mason 	local_irq_save(flags);
4861e23b3eeSAndy Grover 
487ae4b46e9SShan Wei 	chpfirst = __this_cpu_read(cache->percpu->first);
488ae4b46e9SShan Wei 	if (!chpfirst)
48933244125SChris Mason 		INIT_LIST_HEAD(new_item);
49033244125SChris Mason 	else /* put on front */
491ae4b46e9SShan Wei 		list_add_tail(new_item, chpfirst);
49233244125SChris Mason 
493c196403bSGerald Schaefer 	__this_cpu_write(cache->percpu->first, new_item);
494ae4b46e9SShan Wei 	__this_cpu_inc(cache->percpu->count);
495ae4b46e9SShan Wei 
496ae4b46e9SShan Wei 	if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
49733244125SChris Mason 		goto end;
49833244125SChris Mason 
49933244125SChris Mason 	/*
50033244125SChris Mason 	 * Return our per-cpu first list to the cache's xfer by atomically
50133244125SChris Mason 	 * grabbing the current xfer list, appending it to our per-cpu list,
50233244125SChris Mason 	 * and then atomically returning that entire list back to the
50333244125SChris Mason 	 * cache's xfer list as long as it's still empty.
50433244125SChris Mason 	 */
50533244125SChris Mason 	do {
50633244125SChris Mason 		old = xchg(&cache->xfer, NULL);
50733244125SChris Mason 		if (old)
508ae4b46e9SShan Wei 			list_splice_entire_tail(old, chpfirst);
509ae4b46e9SShan Wei 		old = cmpxchg(&cache->xfer, NULL, chpfirst);
51033244125SChris Mason 	} while (old);
51133244125SChris Mason 
512ae4b46e9SShan Wei 
513c196403bSGerald Schaefer 	__this_cpu_write(cache->percpu->first, NULL);
514ae4b46e9SShan Wei 	__this_cpu_write(cache->percpu->count, 0);
51533244125SChris Mason end:
51633244125SChris Mason 	local_irq_restore(flags);
5171e23b3eeSAndy Grover }
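
/*
 * Putting the pieces together, a recycled inc or frag moves through
 * three stages:
 *
 *	rds_ib_inc_free()/rds_ib_frag_free()
 *		-> rds_ib_recv_cache_put(): onto this cpu's 'first' list
 *	once RDS_IB_RECYCLE_BATCH_COUNT items have piled up
 *		-> the xchg/cmpxchg dance above hands the whole batch
 *		   over to cache->xfer
 *	refill path
 *		-> rds_ib_cache_xfer_to_ready() claims cache->xfer as
 *		   cache->ready, from which rds_ib_recv_cache_get()
 *		   pops entries one at a time.
 */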
5181e23b3eeSAndy Grover 
51933244125SChris Mason static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
5201e23b3eeSAndy Grover {
52133244125SChris Mason 	struct list_head *head = cache->ready;
5221e23b3eeSAndy Grover 
52333244125SChris Mason 	if (head) {
52433244125SChris Mason 		if (!list_empty(head)) {
52533244125SChris Mason 			cache->ready = head->next;
52633244125SChris Mason 			list_del_init(head);
52733244125SChris Mason 		} else
52833244125SChris Mason 			cache->ready = NULL;
52933244125SChris Mason 	}
5301e23b3eeSAndy Grover 
53133244125SChris Mason 	return head;
5321e23b3eeSAndy Grover }
5331e23b3eeSAndy Grover 
534c310e72cSAl Viro int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
5351e23b3eeSAndy Grover {
5361e23b3eeSAndy Grover 	struct rds_ib_incoming *ibinc;
5371e23b3eeSAndy Grover 	struct rds_page_frag *frag;
5381e23b3eeSAndy Grover 	unsigned long to_copy;
5391e23b3eeSAndy Grover 	unsigned long frag_off = 0;
5401e23b3eeSAndy Grover 	int copied = 0;
5411e23b3eeSAndy Grover 	int ret;
5421e23b3eeSAndy Grover 	u32 len;
5431e23b3eeSAndy Grover 
5441e23b3eeSAndy Grover 	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
5451e23b3eeSAndy Grover 	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
5461e23b3eeSAndy Grover 	len = be32_to_cpu(inc->i_hdr.h_len);
5471e23b3eeSAndy Grover 
548c310e72cSAl Viro 	while (iov_iter_count(to) && copied < len) {
5491e23b3eeSAndy Grover 		if (frag_off == RDS_FRAG_SIZE) {
5501e23b3eeSAndy Grover 			frag = list_entry(frag->f_item.next,
5511e23b3eeSAndy Grover 					  struct rds_page_frag, f_item);
5521e23b3eeSAndy Grover 			frag_off = 0;
5531e23b3eeSAndy Grover 		}
554c310e72cSAl Viro 		to_copy = min_t(unsigned long, iov_iter_count(to),
555c310e72cSAl Viro 				RDS_FRAG_SIZE - frag_off);
5561e23b3eeSAndy Grover 		to_copy = min_t(unsigned long, to_copy, len - copied);
5571e23b3eeSAndy Grover 
5581e23b3eeSAndy Grover 		/* XXX needs + offset for multiple recvs per page */
559c310e72cSAl Viro 		rds_stats_add(s_copy_to_user, to_copy);
560c310e72cSAl Viro 		ret = copy_page_to_iter(sg_page(&frag->f_sg),
5610b088e00SAndy Grover 					frag->f_sg.offset + frag_off,
562c310e72cSAl Viro 					to_copy,
563c310e72cSAl Viro 					to);
564c310e72cSAl Viro 		if (ret != to_copy)
565c310e72cSAl Viro 			return -EFAULT;
5661e23b3eeSAndy Grover 
5671e23b3eeSAndy Grover 		frag_off += to_copy;
5681e23b3eeSAndy Grover 		copied += to_copy;
5691e23b3eeSAndy Grover 	}
5701e23b3eeSAndy Grover 
5711e23b3eeSAndy Grover 	return copied;
5721e23b3eeSAndy Grover }
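
/*
 * A rough example of the loop above: with the usual 4K RDS_FRAG_SIZE,
 * a 9000 byte message arrives as three frags (4096 + 4096 + 808), so
 * copy_page_to_iter() runs at least once per frag, with to_copy capped
 * both by what is left in the iterator and by the bytes remaining in
 * the current frag and message.
 */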
5731e23b3eeSAndy Grover 
5741e23b3eeSAndy Grover /* ic starts out kzalloc()ed */
5751e23b3eeSAndy Grover void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
5761e23b3eeSAndy Grover {
5771e23b3eeSAndy Grover 	struct ib_send_wr *wr = &ic->i_ack_wr;
5781e23b3eeSAndy Grover 	struct ib_sge *sge = &ic->i_ack_sge;
5791e23b3eeSAndy Grover 
5801e23b3eeSAndy Grover 	sge->addr = ic->i_ack_dma;
5811e23b3eeSAndy Grover 	sge->length = sizeof(struct rds_header);
582e5580242SJason Gunthorpe 	sge->lkey = ic->i_pd->local_dma_lkey;
5831e23b3eeSAndy Grover 
5841e23b3eeSAndy Grover 	wr->sg_list = sge;
5851e23b3eeSAndy Grover 	wr->num_sge = 1;
5861e23b3eeSAndy Grover 	wr->opcode = IB_WR_SEND;
5871e23b3eeSAndy Grover 	wr->wr_id = RDS_IB_ACK_WR_ID;
5881e23b3eeSAndy Grover 	wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
5891e23b3eeSAndy Grover }
5901e23b3eeSAndy Grover 
5911e23b3eeSAndy Grover /*
5921e23b3eeSAndy Grover  * You'd think that with reliable IB connections you wouldn't need to ack
5931e23b3eeSAndy Grover  * messages that have been received.  The problem is that IB hardware generates
5941e23b3eeSAndy Grover  * an ack message before it has DMAed the message into memory.  This creates a
5951e23b3eeSAndy Grover  * potential message loss if the HCA is disabled for any reason between when it
5961e23b3eeSAndy Grover  * sends the ack and when the message is DMAed and processed.  This is only a
5971e23b3eeSAndy Grover  * potential issue if another HCA is available for fail-over.
5981e23b3eeSAndy Grover  *
5991e23b3eeSAndy Grover  * When the remote host receives our ack they'll free the sent message from
6001e23b3eeSAndy Grover  * their send queue.  To decrease the latency of this we always send an ack
6011e23b3eeSAndy Grover  * immediately after we've received messages.
6021e23b3eeSAndy Grover  *
6031e23b3eeSAndy Grover  * For simplicity, we only have one ack in flight at a time.  This puts
6041e23b3eeSAndy Grover  * pressure on senders to have deep enough send queues to absorb the latency of
6051e23b3eeSAndy Grover  * a single ack frame being in flight.  This might not be good enough.
6061e23b3eeSAndy Grover  *
6071e23b3eeSAndy Grover  * This is implemented by having a long-lived send_wr and sge which point to a
6081e23b3eeSAndy Grover  * statically allocated ack frame.  This ack wr does not fall under the ring
6091e23b3eeSAndy Grover  * accounting that the tx and rx wrs do.  The QP attribute specifically makes
6101e23b3eeSAndy Grover  * room for it beyond the ring size.  Send completion notices its special
6111e23b3eeSAndy Grover  * wr_id and avoids working with the ring in that case.
6121e23b3eeSAndy Grover  */
6138cbd9606SAndy Grover #ifndef KERNEL_HAS_ATOMIC64
614f4f943c9SSantosh Shilimkar void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
6151e23b3eeSAndy Grover {
6168cbd9606SAndy Grover 	unsigned long flags;
6178cbd9606SAndy Grover 
6188cbd9606SAndy Grover 	spin_lock_irqsave(&ic->i_ack_lock, flags);
6198cbd9606SAndy Grover 	ic->i_ack_next = seq;
6208cbd9606SAndy Grover 	if (ack_required)
6218cbd9606SAndy Grover 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
6228cbd9606SAndy Grover 	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
6238cbd9606SAndy Grover }
6248cbd9606SAndy Grover 
6258cbd9606SAndy Grover static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
6268cbd9606SAndy Grover {
6278cbd9606SAndy Grover 	unsigned long flags;
6288cbd9606SAndy Grover 	u64 seq;
6298cbd9606SAndy Grover 
6308cbd9606SAndy Grover 	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
6318cbd9606SAndy Grover 
6328cbd9606SAndy Grover 	spin_lock_irqsave(&ic->i_ack_lock, flags);
6338cbd9606SAndy Grover 	seq = ic->i_ack_next;
6348cbd9606SAndy Grover 	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
6358cbd9606SAndy Grover 
6368cbd9606SAndy Grover 	return seq;
6378cbd9606SAndy Grover }
6388cbd9606SAndy Grover #else
639f4f943c9SSantosh Shilimkar void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
6408cbd9606SAndy Grover {
6418cbd9606SAndy Grover 	atomic64_set(&ic->i_ack_next, seq);
6421e23b3eeSAndy Grover 	if (ack_required) {
6434e857c58SPeter Zijlstra 		smp_mb__before_atomic();
6441e23b3eeSAndy Grover 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
6451e23b3eeSAndy Grover 	}
6461e23b3eeSAndy Grover }
6471e23b3eeSAndy Grover 
6481e23b3eeSAndy Grover static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
6491e23b3eeSAndy Grover {
6501e23b3eeSAndy Grover 	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
6514e857c58SPeter Zijlstra 	smp_mb__after_atomic();
6521e23b3eeSAndy Grover 
6538cbd9606SAndy Grover 	return atomic64_read(&ic->i_ack_next);
6541e23b3eeSAndy Grover }
6558cbd9606SAndy Grover #endif
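
/*
 * Whichever variant is compiled in, the contract is the same:
 *
 *	rds_ib_set_ack(ic, seq, req)	record seq as the newest ack to
 *					send, optionally raising
 *					IB_ACK_REQUESTED
 *	rds_ib_get_ack(ic)		clear IB_ACK_REQUESTED and
 *					return the recorded sequence
 *
 * The only difference is how the 64-bit i_ack_next update is made
 * atomic: with i_ack_lock when the arch lacks atomic64_t, with
 * atomic64_set()/atomic64_read() otherwise.
 */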
6568cbd9606SAndy Grover 
6571e23b3eeSAndy Grover 
6581e23b3eeSAndy Grover static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
6591e23b3eeSAndy Grover {
6601e23b3eeSAndy Grover 	struct rds_header *hdr = ic->i_ack;
6611e23b3eeSAndy Grover 	u64 seq;
6621e23b3eeSAndy Grover 	int ret;
6631e23b3eeSAndy Grover 
6641e23b3eeSAndy Grover 	seq = rds_ib_get_ack(ic);
6651e23b3eeSAndy Grover 
6661e23b3eeSAndy Grover 	rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
66742f2611cSChristoph Hellwig 
66842f2611cSChristoph Hellwig 	ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma,
66942f2611cSChristoph Hellwig 				   sizeof(*hdr), DMA_TO_DEVICE);
6701e23b3eeSAndy Grover 	rds_message_populate_header(hdr, 0, 0, 0);
6711e23b3eeSAndy Grover 	hdr->h_ack = cpu_to_be64(seq);
6721e23b3eeSAndy Grover 	hdr->h_credit = adv_credits;
6731e23b3eeSAndy Grover 	rds_message_make_checksum(hdr);
67442f2611cSChristoph Hellwig 	ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma,
67542f2611cSChristoph Hellwig 				      sizeof(*hdr), DMA_TO_DEVICE);
67642f2611cSChristoph Hellwig 
6771e23b3eeSAndy Grover 	ic->i_ack_queued = jiffies;
6781e23b3eeSAndy Grover 
679f112d53bSBart Van Assche 	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
6801e23b3eeSAndy Grover 	if (unlikely(ret)) {
6811e23b3eeSAndy Grover 		/* Failed to send. Release the WR, and
6821e23b3eeSAndy Grover 		 * force another ACK.
6831e23b3eeSAndy Grover 		 */
6841e23b3eeSAndy Grover 		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
6851e23b3eeSAndy Grover 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
6861e23b3eeSAndy Grover 
6871e23b3eeSAndy Grover 		rds_ib_stats_inc(s_ib_ack_send_failure);
688735f61e6SAndy Grover 
689735f61e6SAndy Grover 		rds_ib_conn_error(ic->conn, "sending ack failed\n");
6901e23b3eeSAndy Grover 	} else
6911e23b3eeSAndy Grover 		rds_ib_stats_inc(s_ib_ack_sent);
6921e23b3eeSAndy Grover }
6931e23b3eeSAndy Grover 
6941e23b3eeSAndy Grover /*
6951e23b3eeSAndy Grover  * There are 3 ways of getting acknowledgements to the peer:
6961e23b3eeSAndy Grover  *  1.	We call rds_ib_attempt_ack from the recv completion handler
6971e23b3eeSAndy Grover  *	to send an ACK-only frame.
6981e23b3eeSAndy Grover  *	However, there can be only one such frame in the send queue
6991e23b3eeSAndy Grover  *	at any time, so we may have to postpone it.
7001e23b3eeSAndy Grover  *  2.	When another (data) packet is transmitted while there's
7011e23b3eeSAndy Grover  *	an ACK in the queue, we piggyback the ACK sequence number
7021e23b3eeSAndy Grover  *	on the data packet.
7031e23b3eeSAndy Grover  *  3.	If the ACK WR is done sending, we get called from the
7041e23b3eeSAndy Grover  *	send queue completion handler, and check whether there's
7051e23b3eeSAndy Grover  *	another ACK pending (postponed because the WR was on the
7061e23b3eeSAndy Grover  *	queue). If so, we transmit it.
7071e23b3eeSAndy Grover  *
7081e23b3eeSAndy Grover  * We maintain 2 variables:
7091e23b3eeSAndy Grover  *  -	i_ack_flags, which keeps track of whether the ACK WR
7101e23b3eeSAndy Grover  *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
7111e23b3eeSAndy Grover  *  -	i_ack_next, which is the last sequence number we received
7121e23b3eeSAndy Grover  *
7131e23b3eeSAndy Grover  * Potentially, send queue and receive queue handlers can run concurrently.
7148cbd9606SAndy Grover  * It would be nice to not have to use a spinlock to synchronize things,
7158cbd9606SAndy Grover  * but the one problem that rules this out is that 64bit updates are
7168cbd9606SAndy Grover  * not atomic on all platforms. Things would be a lot simpler if
7178cbd9606SAndy Grover  * we had atomic64 or maybe cmpxchg64 everywhere.
7181e23b3eeSAndy Grover  *
7191e23b3eeSAndy Grover  * Reconnecting complicates this picture just slightly. When we
7201e23b3eeSAndy Grover  * reconnect, we may be seeing duplicate packets. The peer
7211e23b3eeSAndy Grover  * is retransmitting them, because it hasn't seen an ACK for
7221e23b3eeSAndy Grover  * them. It is important that we ACK these.
7231e23b3eeSAndy Grover  *
7241e23b3eeSAndy Grover  * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
7251e23b3eeSAndy Grover  * this flag set *MUST* be acknowledged immediately.
7261e23b3eeSAndy Grover  */
7271e23b3eeSAndy Grover 
7281e23b3eeSAndy Grover /*
7291e23b3eeSAndy Grover  * When we get here, we're called from the recv queue handler.
7301e23b3eeSAndy Grover  * Check whether we ought to transmit an ACK.
7311e23b3eeSAndy Grover  */
7321e23b3eeSAndy Grover void rds_ib_attempt_ack(struct rds_ib_connection *ic)
7331e23b3eeSAndy Grover {
7341e23b3eeSAndy Grover 	unsigned int adv_credits;
7351e23b3eeSAndy Grover 
7361e23b3eeSAndy Grover 	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
7371e23b3eeSAndy Grover 		return;
7381e23b3eeSAndy Grover 
7391e23b3eeSAndy Grover 	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
7401e23b3eeSAndy Grover 		rds_ib_stats_inc(s_ib_ack_send_delayed);
7411e23b3eeSAndy Grover 		return;
7421e23b3eeSAndy Grover 	}
7431e23b3eeSAndy Grover 
7441e23b3eeSAndy Grover 	/* Can we get a send credit? */
7457b70d033SSteve Wise 	if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
7461e23b3eeSAndy Grover 		rds_ib_stats_inc(s_ib_tx_throttle);
7471e23b3eeSAndy Grover 		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
7481e23b3eeSAndy Grover 		return;
7491e23b3eeSAndy Grover 	}
7501e23b3eeSAndy Grover 
7511e23b3eeSAndy Grover 	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
7521e23b3eeSAndy Grover 	rds_ib_send_ack(ic, adv_credits);
7531e23b3eeSAndy Grover }
7541e23b3eeSAndy Grover 
7551e23b3eeSAndy Grover /*
7561e23b3eeSAndy Grover  * We get here from the send completion handler, when the
7571e23b3eeSAndy Grover  * adapter tells us the ACK frame was sent.
7581e23b3eeSAndy Grover  */
7591e23b3eeSAndy Grover void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
7601e23b3eeSAndy Grover {
7611e23b3eeSAndy Grover 	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
7621e23b3eeSAndy Grover 	rds_ib_attempt_ack(ic);
7631e23b3eeSAndy Grover }
7641e23b3eeSAndy Grover 
7651e23b3eeSAndy Grover /*
7661e23b3eeSAndy Grover  * This is called by the regular xmit code when it wants to piggyback
7671e23b3eeSAndy Grover  * an ACK on an outgoing frame.
7681e23b3eeSAndy Grover  */
7691e23b3eeSAndy Grover u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
7701e23b3eeSAndy Grover {
7711e23b3eeSAndy Grover 	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
7721e23b3eeSAndy Grover 		rds_ib_stats_inc(s_ib_ack_send_piggybacked);
7731e23b3eeSAndy Grover 	return rds_ib_get_ack(ic);
7741e23b3eeSAndy Grover }
7751e23b3eeSAndy Grover 
7761e23b3eeSAndy Grover /*
7771e23b3eeSAndy Grover  * It's kind of lame that we're copying from the posted receive pages into
7781e23b3eeSAndy Grover  * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
7791e23b3eeSAndy Grover  * them.  But receiving new congestion bitmaps should be a *rare* event, so
7801e23b3eeSAndy Grover  * hopefully we won't need to invest that complexity in making it more
7811e23b3eeSAndy Grover  * efficient.  By copying we can share a simpler core with TCP which has to
7821e23b3eeSAndy Grover  * copy.
7831e23b3eeSAndy Grover  */
7841e23b3eeSAndy Grover static void rds_ib_cong_recv(struct rds_connection *conn,
7851e23b3eeSAndy Grover 			      struct rds_ib_incoming *ibinc)
7861e23b3eeSAndy Grover {
7871e23b3eeSAndy Grover 	struct rds_cong_map *map;
7881e23b3eeSAndy Grover 	unsigned int map_off;
7891e23b3eeSAndy Grover 	unsigned int map_page;
7901e23b3eeSAndy Grover 	struct rds_page_frag *frag;
7911e23b3eeSAndy Grover 	unsigned long frag_off;
7921e23b3eeSAndy Grover 	unsigned long to_copy;
7931e23b3eeSAndy Grover 	unsigned long copied;
794f3505745SNicholas Mc Guire 	__le64 uncongested = 0;
7951e23b3eeSAndy Grover 	void *addr;
7961e23b3eeSAndy Grover 
7971e23b3eeSAndy Grover 	/* catch completely corrupt packets */
7981e23b3eeSAndy Grover 	if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
7991e23b3eeSAndy Grover 		return;
8001e23b3eeSAndy Grover 
8011e23b3eeSAndy Grover 	map = conn->c_fcong;
8021e23b3eeSAndy Grover 	map_page = 0;
8031e23b3eeSAndy Grover 	map_off = 0;
8041e23b3eeSAndy Grover 
8051e23b3eeSAndy Grover 	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
8061e23b3eeSAndy Grover 	frag_off = 0;
8071e23b3eeSAndy Grover 
8081e23b3eeSAndy Grover 	copied = 0;
8091e23b3eeSAndy Grover 
8101e23b3eeSAndy Grover 	while (copied < RDS_CONG_MAP_BYTES) {
811f3505745SNicholas Mc Guire 		__le64 *src, *dst;
8121e23b3eeSAndy Grover 		unsigned int k;
8131e23b3eeSAndy Grover 
8141e23b3eeSAndy Grover 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
8151e23b3eeSAndy Grover 		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
8161e23b3eeSAndy Grover 
8176114eab5SCong Wang 		addr = kmap_atomic(sg_page(&frag->f_sg));
8181e23b3eeSAndy Grover 
819579ba855Sshamir rabinovitch 		src = addr + frag->f_sg.offset + frag_off;
8201e23b3eeSAndy Grover 		dst = (void *)map->m_page_addrs[map_page] + map_off;
8211e23b3eeSAndy Grover 		for (k = 0; k < to_copy; k += 8) {
8221e23b3eeSAndy Grover 			/* Record ports that became uncongested, ie
8231e23b3eeSAndy Grover 			 * bits that changed from 0 to 1. */
8241e23b3eeSAndy Grover 			uncongested |= ~(*src) & *dst;
8251e23b3eeSAndy Grover 			*dst++ = *src++;
8261e23b3eeSAndy Grover 		}
8276114eab5SCong Wang 		kunmap_atomic(addr);
8281e23b3eeSAndy Grover 
8291e23b3eeSAndy Grover 		copied += to_copy;
8301e23b3eeSAndy Grover 
8311e23b3eeSAndy Grover 		map_off += to_copy;
8321e23b3eeSAndy Grover 		if (map_off == PAGE_SIZE) {
8331e23b3eeSAndy Grover 			map_off = 0;
8341e23b3eeSAndy Grover 			map_page++;
8351e23b3eeSAndy Grover 		}
8361e23b3eeSAndy Grover 
8371e23b3eeSAndy Grover 		frag_off += to_copy;
8381e23b3eeSAndy Grover 		if (frag_off == RDS_FRAG_SIZE) {
8391e23b3eeSAndy Grover 			frag = list_entry(frag->f_item.next,
8401e23b3eeSAndy Grover 					  struct rds_page_frag, f_item);
8411e23b3eeSAndy Grover 			frag_off = 0;
8421e23b3eeSAndy Grover 		}
8431e23b3eeSAndy Grover 	}
8441e23b3eeSAndy Grover 
8451e23b3eeSAndy Grover 	/* the congestion map is in little endian order */
846f3505745SNicholas Mc Guire 	rds_cong_map_updated(map, le64_to_cpu(uncongested));
8471e23b3eeSAndy Grover }
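
/*
 * For scale: the congestion map is RDS_CONG_MAP_BYTES long (one bit
 * per port), so with 4K pages and frags the loop above walks a couple
 * of pages' worth of data, recomputing to_copy whenever it reaches
 * either a frag boundary or a page boundary of the map.
 */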
8481e23b3eeSAndy Grover 
8491e23b3eeSAndy Grover static void rds_ib_process_recv(struct rds_connection *conn,
850597ddd50SAndy Grover 				struct rds_ib_recv_work *recv, u32 data_len,
8511e23b3eeSAndy Grover 				struct rds_ib_ack_state *state)
8521e23b3eeSAndy Grover {
8531e23b3eeSAndy Grover 	struct rds_ib_connection *ic = conn->c_transport_data;
8541e23b3eeSAndy Grover 	struct rds_ib_incoming *ibinc = ic->i_ibinc;
8551e23b3eeSAndy Grover 	struct rds_header *ihdr, *hdr;
85642f2611cSChristoph Hellwig 	dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
8571e23b3eeSAndy Grover 
8581e23b3eeSAndy Grover 	/* XXX shut down the connection if port 0,0 are seen? */
8591e23b3eeSAndy Grover 
8601e23b3eeSAndy Grover 	rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
861597ddd50SAndy Grover 		 data_len);
8621e23b3eeSAndy Grover 
863597ddd50SAndy Grover 	if (data_len < sizeof(struct rds_header)) {
8641e23b3eeSAndy Grover 		rds_ib_conn_error(conn, "incoming message "
865eee2fa6aSKa-Cheong Poon 		       "from %pI6c didn't include a "
8661e23b3eeSAndy Grover 		       "header, disconnecting and "
8671e23b3eeSAndy Grover 		       "reconnecting\n",
8681e23b3eeSAndy Grover 		       &conn->c_faddr);
8691e23b3eeSAndy Grover 		return;
8701e23b3eeSAndy Grover 	}
871597ddd50SAndy Grover 	data_len -= sizeof(struct rds_header);
8721e23b3eeSAndy Grover 
8739b17f588SKa-Cheong Poon 	ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];
8741e23b3eeSAndy Grover 
87542f2611cSChristoph Hellwig 	ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr,
87642f2611cSChristoph Hellwig 				   sizeof(*ihdr), DMA_FROM_DEVICE);
8771e23b3eeSAndy Grover 	/* Validate the checksum. */
8781e23b3eeSAndy Grover 	if (!rds_message_verify_checksum(ihdr)) {
8791e23b3eeSAndy Grover 		rds_ib_conn_error(conn, "incoming message "
880eee2fa6aSKa-Cheong Poon 		       "from %pI6c has corrupted header - "
8811e23b3eeSAndy Grover 		       "forcing a reconnect\n",
8821e23b3eeSAndy Grover 		       &conn->c_faddr);
8831e23b3eeSAndy Grover 		rds_stats_inc(s_recv_drop_bad_checksum);
88442f2611cSChristoph Hellwig 		goto done;
8851e23b3eeSAndy Grover 	}
8861e23b3eeSAndy Grover 
8871e23b3eeSAndy Grover 	/* Process the ACK sequence which comes with every packet */
8881e23b3eeSAndy Grover 	state->ack_recv = be64_to_cpu(ihdr->h_ack);
8891e23b3eeSAndy Grover 	state->ack_recv_valid = 1;
8901e23b3eeSAndy Grover 
8911e23b3eeSAndy Grover 	/* Process the credits update if there was one */
8921e23b3eeSAndy Grover 	if (ihdr->h_credit)
8931e23b3eeSAndy Grover 		rds_ib_send_add_credits(conn, ihdr->h_credit);
8941e23b3eeSAndy Grover 
895597ddd50SAndy Grover 	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
8961e23b3eeSAndy Grover 		/* This is an ACK-only packet. The reason it gets
8971e23b3eeSAndy Grover 		 * special treatment here is that historically, ACKs
8981e23b3eeSAndy Grover 		 * were rather special beasts.
8991e23b3eeSAndy Grover 		 */
9001e23b3eeSAndy Grover 		rds_ib_stats_inc(s_ib_ack_received);
9011e23b3eeSAndy Grover 
9021e23b3eeSAndy Grover 		/*
9031e23b3eeSAndy Grover 		 * Usually the frags make their way on to incs and are then freed as
9041e23b3eeSAndy Grover 		 * the inc is freed.  We don't go that route, so we have to drop the
9051e23b3eeSAndy Grover 		 * page ref ourselves.  We can't just leave the page on the recv
9061e23b3eeSAndy Grover 		 * because that confuses the dma mapping of pages and each recv's use
9070b088e00SAndy Grover 		 * of a partial page.
9081e23b3eeSAndy Grover 		 *
9091e23b3eeSAndy Grover 		 * FIXME: Fold this into the code path below.
9101e23b3eeSAndy Grover 		 */
91133244125SChris Mason 		rds_ib_frag_free(ic, recv->r_frag);
9120b088e00SAndy Grover 		recv->r_frag = NULL;
91342f2611cSChristoph Hellwig 		goto done;
9141e23b3eeSAndy Grover 	}
9151e23b3eeSAndy Grover 
9161e23b3eeSAndy Grover 	/*
9171e23b3eeSAndy Grover 	 * If we don't already have an inc on the connection then this
9181e23b3eeSAndy Grover 	 * fragment has a header and starts a message.. copy its header
9191e23b3eeSAndy Grover 	 * into the inc and save the inc so we can hang upcoming fragments
9201e23b3eeSAndy Grover 	 * off its list.
9211e23b3eeSAndy Grover 	 */
9228690bfa1SAndy Grover 	if (!ibinc) {
9231e23b3eeSAndy Grover 		ibinc = recv->r_ibinc;
9241e23b3eeSAndy Grover 		recv->r_ibinc = NULL;
9251e23b3eeSAndy Grover 		ic->i_ibinc = ibinc;
9261e23b3eeSAndy Grover 
9271e23b3eeSAndy Grover 		hdr = &ibinc->ii_inc.i_hdr;
9283289025aSSantosh Shilimkar 		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
9293289025aSSantosh Shilimkar 				local_clock();
9301e23b3eeSAndy Grover 		memcpy(hdr, ihdr, sizeof(*hdr));
9311e23b3eeSAndy Grover 		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
9323289025aSSantosh Shilimkar 		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
9333289025aSSantosh Shilimkar 				local_clock();
9341e23b3eeSAndy Grover 
9351e23b3eeSAndy Grover 		rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
9361e23b3eeSAndy Grover 			 ic->i_recv_data_rem, hdr->h_flags);
9371e23b3eeSAndy Grover 	} else {
9381e23b3eeSAndy Grover 		hdr = &ibinc->ii_inc.i_hdr;
9391e23b3eeSAndy Grover 		/* We can't just use memcmp here; fragments of a
9401e23b3eeSAndy Grover 		 * single message may carry different ACKs */
941f64f9e71SJoe Perches 		if (hdr->h_sequence != ihdr->h_sequence ||
942f64f9e71SJoe Perches 		    hdr->h_len != ihdr->h_len ||
943f64f9e71SJoe Perches 		    hdr->h_sport != ihdr->h_sport ||
944f64f9e71SJoe Perches 		    hdr->h_dport != ihdr->h_dport) {
9451e23b3eeSAndy Grover 			rds_ib_conn_error(conn,
9461e23b3eeSAndy Grover 				"fragment header mismatch; forcing reconnect\n");
94742f2611cSChristoph Hellwig 			goto done;
9481e23b3eeSAndy Grover 		}
9491e23b3eeSAndy Grover 	}
9501e23b3eeSAndy Grover 
9511e23b3eeSAndy Grover 	list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
9521e23b3eeSAndy Grover 	recv->r_frag = NULL;
9531e23b3eeSAndy Grover 
9541e23b3eeSAndy Grover 	if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
9551e23b3eeSAndy Grover 		ic->i_recv_data_rem -= RDS_FRAG_SIZE;
9561e23b3eeSAndy Grover 	else {
9571e23b3eeSAndy Grover 		ic->i_recv_data_rem = 0;
9581e23b3eeSAndy Grover 		ic->i_ibinc = NULL;
9591e23b3eeSAndy Grover 
960eee2fa6aSKa-Cheong Poon 		if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
9611e23b3eeSAndy Grover 			rds_ib_cong_recv(conn, ibinc);
962eee2fa6aSKa-Cheong Poon 		} else {
963eee2fa6aSKa-Cheong Poon 			rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
9646114eab5SCong Wang 					  &ibinc->ii_inc, GFP_ATOMIC);
9651e23b3eeSAndy Grover 			state->ack_next = be64_to_cpu(hdr->h_sequence);
9661e23b3eeSAndy Grover 			state->ack_next_valid = 1;
9671e23b3eeSAndy Grover 		}
9681e23b3eeSAndy Grover 
9691e23b3eeSAndy Grover 		/* Evaluate the ACK_REQUIRED flag *after* we received
9701e23b3eeSAndy Grover 		 * the complete frame, and after bumping the next_rx
9711e23b3eeSAndy Grover 		 * sequence. */
9721e23b3eeSAndy Grover 		if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
9731e23b3eeSAndy Grover 			rds_stats_inc(s_recv_ack_required);
9741e23b3eeSAndy Grover 			state->ack_required = 1;
9751e23b3eeSAndy Grover 		}
9761e23b3eeSAndy Grover 
9771e23b3eeSAndy Grover 		rds_inc_put(&ibinc->ii_inc);
9781e23b3eeSAndy Grover 	}
97942f2611cSChristoph Hellwig done:
98042f2611cSChristoph Hellwig 	ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr,
98142f2611cSChristoph Hellwig 				      sizeof(*ihdr), DMA_FROM_DEVICE);
9821e23b3eeSAndy Grover }
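
/*
 * In short, reassembly works like this: the first fragment of a
 * message donates its preallocated r_ibinc, whose header is copied
 * from the receive header; every fragment (including the first) then
 * chains its r_frag onto ii_frags; and once i_recv_data_rem reaches
 * zero the completed inc is either parsed by rds_ib_cong_recv() (for
 * a congestion bitmap update) or handed up via rds_recv_incoming().
 */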
9831e23b3eeSAndy Grover 
984f4f943c9SSantosh Shilimkar void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
985f4f943c9SSantosh Shilimkar 			     struct ib_wc *wc,
986d521b63bSAndy Grover 			     struct rds_ib_ack_state *state)
987d521b63bSAndy Grover {
988d521b63bSAndy Grover 	struct rds_connection *conn = ic->conn;
989d521b63bSAndy Grover 	struct rds_ib_recv_work *recv;
990d521b63bSAndy Grover 
99159f740a6SZach Brown 	rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
992f4f943c9SSantosh Shilimkar 		 (unsigned long long)wc->wr_id, wc->status,
993f4f943c9SSantosh Shilimkar 		 ib_wc_status_msg(wc->status), wc->byte_len,
994f4f943c9SSantosh Shilimkar 		 be32_to_cpu(wc->ex.imm_data));
995f4f943c9SSantosh Shilimkar 
9961e23b3eeSAndy Grover 	rds_ib_stats_inc(s_ib_rx_cq_event);
9971e23b3eeSAndy Grover 	recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
998f4f943c9SSantosh Shilimkar 	ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
999f4f943c9SSantosh Shilimkar 			DMA_FROM_DEVICE);
10001e23b3eeSAndy Grover 
1001f4f943c9SSantosh Shilimkar 	/* Also process recvs in connecting state because it is possible
10021e23b3eeSAndy Grover 	 * to get a recv completion _before_ the rdmacm ESTABLISHED
10031e23b3eeSAndy Grover 	 * event is processed.
10041e23b3eeSAndy Grover 	 */
1005f4f943c9SSantosh Shilimkar 	if (wc->status == IB_WC_SUCCESS) {
1006f4f943c9SSantosh Shilimkar 		rds_ib_process_recv(conn, recv, wc->byte_len, state);
10071e23b3eeSAndy Grover 	} else {
1008d455ab64SZach Brown 		/* We expect errors as the qp is drained during shutdown */
1009d455ab64SZach Brown 		if (rds_conn_up(conn) || rds_conn_connecting(conn))
1010fab401e1SSudhakar Dindukurti 			rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
1011ff3f19a2SSantosh Shilimkar 					  &conn->c_laddr, &conn->c_faddr,
1012fd261ce6SSantosh Shilimkar 					  conn->c_tos, wc->status,
1013fab401e1SSudhakar Dindukurti 					  ib_wc_status_msg(wc->status),
1014fab401e1SSudhakar Dindukurti 					  wc->vendor_err);
10151e23b3eeSAndy Grover 	}
10161e23b3eeSAndy Grover 
1017f4f943c9SSantosh Shilimkar 	/* rds_ib_process_recv() doesn't always consume the frag, and
101843962dd7Ssantosh.shilimkar@oracle.com 	 * we might not have called it at all if the wc didn't indicate
101943962dd7Ssantosh.shilimkar@oracle.com 	 * success. We already unmapped the frag's pages, though, and
102043962dd7Ssantosh.shilimkar@oracle.com 	 * the following rds_ib_ring_free() call tells the refill path
102143962dd7Ssantosh.shilimkar@oracle.com 	 * that it will not find an allocated frag here. Make sure we
102243962dd7Ssantosh.shilimkar@oracle.com 	 * keep that promise by freeing a frag that's still on the ring.
1023d455ab64SZach Brown 	 */
102443962dd7Ssantosh.shilimkar@oracle.com 	if (recv->r_frag) {
102543962dd7Ssantosh.shilimkar@oracle.com 		rds_ib_frag_free(ic, recv->r_frag);
102643962dd7Ssantosh.shilimkar@oracle.com 		recv->r_frag = NULL;
102743962dd7Ssantosh.shilimkar@oracle.com 	}
10281e23b3eeSAndy Grover 	rds_ib_ring_free(&ic->i_recv_ring, 1);
10291e23b3eeSAndy Grover 
10301e23b3eeSAndy Grover 	/* If we ever end up with a really empty receive ring, we're
10311e23b3eeSAndy Grover 	 * in deep trouble, as the sender will definitely see RNR
10321e23b3eeSAndy Grover 	 * timeouts. */
10331e23b3eeSAndy Grover 	if (rds_ib_ring_empty(&ic->i_recv_ring))
10341e23b3eeSAndy Grover 		rds_ib_stats_inc(s_ib_rx_ring_empty);
10351e23b3eeSAndy Grover 
103605bfd7dbSHåkon Bugge 	if (rds_ib_ring_low(&ic->i_recv_ring)) {
10379f0bb95eSManjunath Patil 		rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
103805bfd7dbSHåkon Bugge 		rds_ib_stats_inc(s_ib_rx_refill_from_cq);
103905bfd7dbSHåkon Bugge 	}
10401e23b3eeSAndy Grover }
10411e23b3eeSAndy Grover 
10422da43c4aSSowmini Varadhan int rds_ib_recv_path(struct rds_conn_path *cp)
10431e23b3eeSAndy Grover {
10442da43c4aSSowmini Varadhan 	struct rds_connection *conn = cp->cp_conn;
10451e23b3eeSAndy Grover 	struct rds_ib_connection *ic = conn->c_transport_data;
10461e23b3eeSAndy Grover 
10471e23b3eeSAndy Grover 	rdsdebug("conn %p\n", conn);
104873ce4317Ssantosh.shilimkar@oracle.com 	if (rds_conn_up(conn)) {
10491e23b3eeSAndy Grover 		rds_ib_attempt_ack(ic);
105073ce4317Ssantosh.shilimkar@oracle.com 		rds_ib_recv_refill(conn, 0, GFP_KERNEL);
105105bfd7dbSHåkon Bugge 		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
105273ce4317Ssantosh.shilimkar@oracle.com 	}
10531e23b3eeSAndy Grover 
1054fa52531eSHåkon Bugge 	return 0;
10551e23b3eeSAndy Grover }
10561e23b3eeSAndy Grover 
1057ef87b7eaSZach Brown int rds_ib_recv_init(void)
10581e23b3eeSAndy Grover {
10591e23b3eeSAndy Grover 	struct sysinfo si;
10601e23b3eeSAndy Grover 	int ret = -ENOMEM;
10611e23b3eeSAndy Grover 
10621e23b3eeSAndy Grover 	/* Default to roughly a third of all available RAM for recv memory */
10631e23b3eeSAndy Grover 	si_meminfo(&si);
10641e23b3eeSAndy Grover 	rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
10651e23b3eeSAndy Grover 
1066bf1867dbSDag Moxnes 	rds_ib_incoming_slab =
1067bf1867dbSDag Moxnes 		kmem_cache_create_usercopy("rds_ib_incoming",
10681e23b3eeSAndy Grover 					   sizeof(struct rds_ib_incoming),
1069bf1867dbSDag Moxnes 					   0, SLAB_HWCACHE_ALIGN,
1070bf1867dbSDag Moxnes 					   offsetof(struct rds_ib_incoming,
1071bf1867dbSDag Moxnes 						    ii_inc.i_usercopy),
1072bf1867dbSDag Moxnes 					   sizeof(struct rds_inc_usercopy),
1073bf1867dbSDag Moxnes 					   NULL);
10748690bfa1SAndy Grover 	if (!rds_ib_incoming_slab)
10751e23b3eeSAndy Grover 		goto out;
10761e23b3eeSAndy Grover 
10771e23b3eeSAndy Grover 	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
10781e23b3eeSAndy Grover 					sizeof(struct rds_page_frag),
1079c20f5b96SAndy Grover 					0, SLAB_HWCACHE_ALIGN, NULL);
1080ba54d3ceSsantosh.shilimkar@oracle.com 	if (!rds_ib_frag_slab) {
10811e23b3eeSAndy Grover 		kmem_cache_destroy(rds_ib_incoming_slab);
1082ba54d3ceSsantosh.shilimkar@oracle.com 		rds_ib_incoming_slab = NULL;
1083ba54d3ceSsantosh.shilimkar@oracle.com 	} else
10841e23b3eeSAndy Grover 		ret = 0;
10851e23b3eeSAndy Grover out:
10861e23b3eeSAndy Grover 	return ret;
10871e23b3eeSAndy Grover }
10881e23b3eeSAndy Grover 
10891e23b3eeSAndy Grover void rds_ib_recv_exit(void)
10901e23b3eeSAndy Grover {
1091b50e0587SZhu Yanjun 	WARN_ON(atomic_read(&rds_ib_allocation));
1092b50e0587SZhu Yanjun 
10931e23b3eeSAndy Grover 	kmem_cache_destroy(rds_ib_incoming_slab);
10941e23b3eeSAndy Grover 	kmem_cache_destroy(rds_ib_frag_slab);
10951e23b3eeSAndy Grover }