xref: /linux/net/smc/smc_core.c (revision b04df400c30235fa347313c9e2a0695549bd2c8e)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  Basic Transport Functions exploiting Infiniband API
6  *
7  *  Copyright IBM Corp. 2016
8  *
9  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
10  */
11 
12 #include <linux/socket.h>
13 #include <linux/if_vlan.h>
14 #include <linux/random.h>
15 #include <linux/workqueue.h>
16 #include <net/tcp.h>
17 #include <net/sock.h>
18 #include <rdma/ib_verbs.h>
19 
20 #include "smc.h"
21 #include "smc_clc.h"
22 #include "smc_core.h"
23 #include "smc_ib.h"
24 #include "smc_wr.h"
25 #include "smc_llc.h"
26 #include "smc_cdc.h"
27 #include "smc_close.h"
28 
29 #define SMC_LGR_NUM_INCR		256
30 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
31 #define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10)
32 
33 static u32 smc_lgr_num;			/* unique link group number */
34 
35 static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
36 			 bool is_rmb);
37 
38 static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
39 {
40 	/* client link group creation always follows the server link group
41 	 * creation. For the client, use a somewhat higher removal delay time;
42 	 * otherwise there is a risk of out-of-sync link groups.
43 	 */
44 	mod_delayed_work(system_wq, &lgr->free_work,
45 			 lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
46 						 SMC_LGR_FREE_DELAY_SERV);
47 }
48 
49 /* Register connection's alert token in our lookup structure.
50  * To use rbtrees we have to implement our own insert core.
51  * Requires @conns_lock
52  * @conn		connection to register
53  * Note: the function has no return value; insertion cannot fail.
54  */
55 static void smc_lgr_add_alert_token(struct smc_connection *conn)
56 {
57 	struct rb_node **link, *parent = NULL;
58 	u32 token = conn->alert_token_local;
59 
60 	link = &conn->lgr->conns_all.rb_node;
61 	while (*link) {
62 		struct smc_connection *cur = rb_entry(*link,
63 					struct smc_connection, alert_node);
64 
65 		parent = *link;
66 		if (cur->alert_token_local > token)
67 			link = &parent->rb_left;
68 		else
69 			link = &parent->rb_right;
70 	}
71 	/* Put the new node there */
72 	rb_link_node(&conn->alert_node, parent, link);
73 	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
74 }
75 
76 /* Register connection in link group by assigning an alert token
77  * registered in a search tree.
78  * Requires @conns_lock
79  * Note that '0' is a reserved value and not assigned.
80  */
81 static void smc_lgr_register_conn(struct smc_connection *conn)
82 {
83 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
84 	static atomic_t nexttoken = ATOMIC_INIT(0);
85 
86 	/* find a new alert_token_local value not yet used by some connection
87 	 * in this link group
88 	 */
89 	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
90 	while (!conn->alert_token_local) {
91 		conn->alert_token_local = atomic_inc_return(&nexttoken);
92 		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
93 			conn->alert_token_local = 0;
94 	}
95 	smc_lgr_add_alert_token(conn);
96 	conn->lgr->conns_num++;
97 }
98 
99 /* Unregister connection and reset the alert token of the given connection.
100  */
101 static void __smc_lgr_unregister_conn(struct smc_connection *conn)
102 {
103 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
104 	struct smc_link_group *lgr = conn->lgr;
105 
106 	rb_erase(&conn->alert_node, &lgr->conns_all);
107 	lgr->conns_num--;
108 	conn->alert_token_local = 0;
109 	conn->lgr = NULL;
110 	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
111 }
112 
113 /* Unregister connection and trigger lgr freeing if applicable
114  */
115 static void smc_lgr_unregister_conn(struct smc_connection *conn)
116 {
117 	struct smc_link_group *lgr = conn->lgr;
118 	int reduced = 0;
119 
120 	write_lock_bh(&lgr->conns_lock);
121 	if (conn->alert_token_local) {
122 		reduced = 1;
123 		__smc_lgr_unregister_conn(conn);
124 	}
125 	write_unlock_bh(&lgr->conns_lock);
126 	if (!reduced || lgr->conns_num)
127 		return;
128 	smc_lgr_schedule_free_work(lgr);
129 }
130 
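/* delayed work to free a link group: once the link group has no more
 * connections, remove it from the global list and free it, unless a new
 * connection showed up or the free work got rescheduled in the meantime
 */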
131 static void smc_lgr_free_work(struct work_struct *work)
132 {
133 	struct smc_link_group *lgr = container_of(to_delayed_work(work),
134 						  struct smc_link_group,
135 						  free_work);
136 	bool conns;
137 
138 	spin_lock_bh(&smc_lgr_list.lock);
139 	if (list_empty(&lgr->list))
140 		goto free;
141 	read_lock_bh(&lgr->conns_lock);
142 	conns = RB_EMPTY_ROOT(&lgr->conns_all);
143 	read_unlock_bh(&lgr->conns_lock);
144 	if (!conns) { /* number of lgr connections is no longer zero */
145 		spin_unlock_bh(&smc_lgr_list.lock);
146 		return;
147 	}
148 	list_del_init(&lgr->list); /* remove from smc_lgr_list */
149 free:
150 	spin_unlock_bh(&smc_lgr_list.lock);
151 	if (!delayed_work_pending(&lgr->free_work)) {
152 		if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
153 			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
154 		smc_lgr_free(lgr);
155 	}
156 }
157 
158 /* create a new SMC link group */
159 static int smc_lgr_create(struct smc_sock *smc,
160 			  struct smc_ib_device *smcibdev, u8 ibport,
161 			  char *peer_systemid, unsigned short vlan_id)
162 {
163 	struct smc_link_group *lgr;
164 	struct smc_link *lnk;
165 	u8 rndvec[3];
166 	int rc = 0;
167 	int i;
168 
169 	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
170 	if (!lgr) {
171 		rc = -ENOMEM;
172 		goto out;
173 	}
174 	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
175 	lgr->sync_err = 0;
176 	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
177 	lgr->vlan_id = vlan_id;
178 	rwlock_init(&lgr->sndbufs_lock);
179 	rwlock_init(&lgr->rmbs_lock);
180 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
181 		INIT_LIST_HEAD(&lgr->sndbufs[i]);
182 		INIT_LIST_HEAD(&lgr->rmbs[i]);
183 	}
184 	smc_lgr_num += SMC_LGR_NUM_INCR;
185 	memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
186 	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
187 	lgr->conns_all = RB_ROOT;
188 
189 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
190 	/* initialize link */
191 	lnk->state = SMC_LNK_ACTIVATING;
192 	lnk->link_id = SMC_SINGLE_LINK;
193 	lnk->smcibdev = smcibdev;
194 	lnk->ibport = ibport;
195 	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
196 	if (!smcibdev->initialized)
197 		smc_ib_setup_per_ibdev(smcibdev);
198 	get_random_bytes(rndvec, sizeof(rndvec));
199 	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
200 	rc = smc_llc_link_init(lnk);
201 	if (rc)
202 		goto free_lgr;
203 	rc = smc_wr_alloc_link_mem(lnk);
204 	if (rc)
205 		goto clear_llc_lnk;
206 	rc = smc_ib_create_protection_domain(lnk);
207 	if (rc)
208 		goto free_link_mem;
209 	rc = smc_ib_create_queue_pair(lnk);
210 	if (rc)
211 		goto dealloc_pd;
212 	rc = smc_wr_create_link(lnk);
213 	if (rc)
214 		goto destroy_qp;
215 
216 	smc->conn.lgr = lgr;
217 	rwlock_init(&lgr->conns_lock);
218 	spin_lock_bh(&smc_lgr_list.lock);
219 	list_add(&lgr->list, &smc_lgr_list.list);
220 	spin_unlock_bh(&smc_lgr_list.lock);
221 	return 0;
222 
223 destroy_qp:
224 	smc_ib_destroy_queue_pair(lnk);
225 dealloc_pd:
226 	smc_ib_dealloc_protection_domain(lnk);
227 free_link_mem:
228 	smc_wr_free_link_mem(lnk);
229 clear_llc_lnk:
230 	smc_llc_link_clear(lnk);
231 free_lgr:
232 	kfree(lgr);
233 out:
234 	return rc;
235 }
236 
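/* release a connection's send and receive buffers for reuse within the
 * link group; an RMB whose registration failed cannot be reused and is
 * freed immediately
 */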
237 static void smc_buf_unuse(struct smc_connection *conn)
238 {
239 	if (conn->sndbuf_desc) {
240 		conn->sndbuf_desc->used = 0;
241 		conn->sndbuf_size = 0;
242 	}
243 	if (conn->rmb_desc) {
244 		if (!conn->rmb_desc->regerr) {
245 			conn->rmb_desc->reused = 1;
246 			conn->rmb_desc->used = 0;
247 			conn->rmbe_size = 0;
248 		} else {
249 			/* buf registration failed, reuse not possible */
250 			struct smc_link_group *lgr = conn->lgr;
251 			struct smc_link *lnk;
252 
253 			write_lock_bh(&lgr->rmbs_lock);
254 			list_del(&conn->rmb_desc->list);
255 			write_unlock_bh(&lgr->rmbs_lock);
256 
257 			lnk = &lgr->lnk[SMC_SINGLE_LINK];
258 			smc_buf_free(conn->rmb_desc, lnk, true);
259 		}
260 	}
261 }
262 
263 /* remove a finished connection from its link group */
264 void smc_conn_free(struct smc_connection *conn)
265 {
266 	if (!conn->lgr)
267 		return;
268 	smc_cdc_tx_dismiss_slots(conn);
269 	smc_lgr_unregister_conn(conn);
270 	smc_buf_unuse(conn);
271 }
272 
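/* tear down a link: clear LLC state, reset and destroy the queue pair,
 * and release protection domain and work request resources
 */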
273 static void smc_link_clear(struct smc_link *lnk)
274 {
275 	lnk->peer_qpn = 0;
276 	smc_llc_link_clear(lnk);
277 	smc_ib_modify_qp_reset(lnk);
278 	smc_wr_free_link(lnk);
279 	smc_ib_destroy_queue_pair(lnk);
280 	smc_ib_dealloc_protection_domain(lnk);
281 	smc_wr_free_link_mem(lnk);
282 }
283 
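/* unmap a sndbuf or RMB from the IB device and free its memory region
 * (RMB only), scatterlist, pages and descriptor
 */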
284 static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
285 			 bool is_rmb)
286 {
287 	if (is_rmb) {
288 		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
289 			smc_ib_put_memory_region(
290 					buf_desc->mr_rx[SMC_SINGLE_LINK]);
291 		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
292 				    DMA_FROM_DEVICE);
293 	} else {
294 		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
295 				    DMA_TO_DEVICE);
296 	}
297 	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
298 	if (buf_desc->pages)
299 		__free_pages(buf_desc->pages, buf_desc->order);
300 	kfree(buf_desc);
301 }
302 
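/* free all send buffers or all RMBs of a link group */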
303 static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
304 {
305 	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
306 	struct smc_buf_desc *buf_desc, *bf_desc;
307 	struct list_head *buf_list;
308 	int i;
309 
310 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
311 		if (is_rmb)
312 			buf_list = &lgr->rmbs[i];
313 		else
314 			buf_list = &lgr->sndbufs[i];
315 		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
316 					 list) {
317 			list_del(&buf_desc->list);
318 			smc_buf_free(buf_desc, lnk, is_rmb);
319 		}
320 	}
321 }
322 
323 static void smc_lgr_free_bufs(struct smc_link_group *lgr)
324 {
325 	/* free send buffers */
326 	__smc_lgr_free_bufs(lgr, false);
327 	/* free rmbs */
328 	__smc_lgr_free_bufs(lgr, true);
329 }
330 
331 /* remove a link group */
332 void smc_lgr_free(struct smc_link_group *lgr)
333 {
334 	smc_lgr_free_bufs(lgr);
335 	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
336 	kfree(lgr);
337 }
338 
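/* remove a link group from the global list; it can then no longer be
 * found and reused for new connections
 */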
339 void smc_lgr_forget(struct smc_link_group *lgr)
340 {
341 	spin_lock_bh(&smc_lgr_list.lock);
342 	/* do not use this link group for new connections */
343 	if (!list_empty(&lgr->list))
344 		list_del_init(&lgr->list);
345 	spin_unlock_bh(&smc_lgr_list.lock);
346 }
347 
348 /* terminate link group abnormally */
349 void smc_lgr_terminate(struct smc_link_group *lgr)
350 {
351 	struct smc_connection *conn;
352 	struct smc_sock *smc;
353 	struct rb_node *node;
354 
355 	if (lgr->terminating)
356 		return;	/* lgr already terminating */
357 	lgr->terminating = 1;
358 	smc_lgr_forget(lgr);
359 	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
360 
361 	write_lock_bh(&lgr->conns_lock);
362 	node = rb_first(&lgr->conns_all);
363 	while (node) {
364 		conn = rb_entry(node, struct smc_connection, alert_node);
365 		smc = container_of(conn, struct smc_sock, conn);
366 		sock_hold(&smc->sk); /* sock_put in close work */
367 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
368 		__smc_lgr_unregister_conn(conn);
369 		write_unlock_bh(&lgr->conns_lock);
370 		if (!schedule_work(&conn->close_work))
371 			sock_put(&smc->sk);
372 		write_lock_bh(&lgr->conns_lock);
373 		node = rb_first(&lgr->conns_all);
374 	}
375 	write_unlock_bh(&lgr->conns_lock);
376 	wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
377 	smc_lgr_schedule_free_work(lgr);
378 }
379 
380 /* Determine the VLAN id of the internal TCP socket.
381  * @vlan_id: address to store the determined VLAN id into
382  */
383 static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
384 {
385 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
386 	struct net_device *ndev;
387 	int i, nest_lvl, rc = 0;
388 
389 	*vlan_id = 0;
390 	if (!dst) {
391 		rc = -ENOTCONN;
392 		goto out;
393 	}
394 	if (!dst->dev) {
395 		rc = -ENODEV;
396 		goto out_rel;
397 	}
398 
399 	ndev = dst->dev;
400 	if (is_vlan_dev(ndev)) {
401 		*vlan_id = vlan_dev_vlan_id(ndev);
402 		goto out_rel;
403 	}
404 
405 	rtnl_lock();
406 	nest_lvl = dev_get_nest_level(ndev);
407 	for (i = 0; i < nest_lvl; i++) {
408 		struct list_head *lower = &ndev->adj_list.lower;
409 
410 		if (list_empty(lower))
411 			break;
412 		lower = lower->next;
413 		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
414 		if (is_vlan_dev(ndev)) {
415 			*vlan_id = vlan_dev_vlan_id(ndev);
416 			break;
417 		}
418 	}
419 	rtnl_unlock();
420 
421 out_rel:
422 	dst_release(dst);
423 out:
424 	return rc;
425 }
426 
427 /* determine the link gid matching the vlan id of the link group */
428 static int smc_link_determine_gid(struct smc_link_group *lgr)
429 {
430 	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
431 	struct ib_gid_attr gattr;
432 	union ib_gid gid;
433 	int i;
434 
435 	if (!lgr->vlan_id) {
436 		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
437 		return 0;
438 	}
439 
440 	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
441 	     i++) {
442 		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
443 				 &gattr))
444 			continue;
445 		if (gattr.ndev) {
446 			if (is_vlan_dev(gattr.ndev) &&
447 			    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
448 				lnk->gid = gid;
449 				dev_put(gattr.ndev);
450 				return 0;
451 			}
452 			dev_put(gattr.ndev);
453 		}
454 	}
455 	return -ENODEV;
456 }
457 
458 /* create a new SMC connection (and a new link group if necessary) */
459 int smc_conn_create(struct smc_sock *smc,
460 		    struct smc_ib_device *smcibdev, u8 ibport,
461 		    struct smc_clc_msg_local *lcl, int srv_first_contact)
462 {
463 	struct smc_connection *conn = &smc->conn;
464 	struct smc_link_group *lgr;
465 	unsigned short vlan_id;
466 	enum smc_lgr_role role;
467 	int local_contact = SMC_FIRST_CONTACT;
468 	int rc = 0;
469 
470 	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
471 	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
472 	if (rc)
473 		return rc;
474 
475 	if ((role == SMC_CLNT) && srv_first_contact)
476 		/* create new link group as well */
477 		goto create;
478 
479 	/* determine if an existing link group can be reused */
480 	spin_lock_bh(&smc_lgr_list.lock);
481 	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
482 		write_lock_bh(&lgr->conns_lock);
483 		if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
484 			    SMC_SYSTEMID_LEN) &&
485 		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
486 			    SMC_GID_SIZE) &&
487 		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
488 			    sizeof(lcl->mac)) &&
489 		    !lgr->sync_err &&
490 		    (lgr->role == role) &&
491 		    (lgr->vlan_id == vlan_id) &&
492 		    ((role == SMC_CLNT) ||
493 		     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
494 			/* link group found */
495 			local_contact = SMC_REUSE_CONTACT;
496 			conn->lgr = lgr;
497 			smc_lgr_register_conn(conn); /* add smc conn to lgr */
498 			write_unlock_bh(&lgr->conns_lock);
499 			break;
500 		}
501 		write_unlock_bh(&lgr->conns_lock);
502 	}
503 	spin_unlock_bh(&smc_lgr_list.lock);
504 
505 	if (role == SMC_CLNT && !srv_first_contact &&
506 	    (local_contact == SMC_FIRST_CONTACT)) {
507 		/* Server reuses a link group, but Client wants to start
508 		 * a new one; send an out_of_sync decline,
509 		 * reason: synchronization error
510 		 */
511 		return -ENOLINK;
512 	}
513 
514 create:
515 	if (local_contact == SMC_FIRST_CONTACT) {
516 		rc = smc_lgr_create(smc, smcibdev, ibport,
517 				    lcl->id_for_peer, vlan_id);
518 		if (rc)
519 			goto out;
520 		smc_lgr_register_conn(conn); /* add smc conn to lgr */
521 		rc = smc_link_determine_gid(conn->lgr);
522 	}
523 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
524 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
525 #ifndef KERNEL_HAS_ATOMIC64
526 	spin_lock_init(&conn->acurs_lock);
527 #endif
528 
529 out:
530 	return rc ? rc : local_contact;
531 }
532 
533 /* try to reuse a sndbuf or rmb description slot for a certain
534  * buffer size; if not available, return NULL
535  */
536 static inline
537 struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr,
538 				      int compressed_bufsize,
539 				      rwlock_t *lock,
540 				      struct list_head *buf_list)
541 {
542 	struct smc_buf_desc *buf_slot;
543 
544 	read_lock_bh(lock);
545 	list_for_each_entry(buf_slot, buf_list, list) {
546 		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
547 			read_unlock_bh(lock);
548 			return buf_slot;
549 		}
550 	}
551 	read_unlock_bh(lock);
552 	return NULL;
553 }
554 
555 /* one of the conditions for announcing a receiver's current window size is
556  * that it "results in a minimum increase in the window size of 10% of the
557  * receive buffer space" [RFC7609]
558  */
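/* e.g. a 65536-byte RMB gives an update limit of min(6553, SOCK_MIN_SNDBUF / 2) bytes */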
559 static inline int smc_rmb_wnd_update_limit(int rmbe_size)
560 {
561 	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
562 }
563 
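/* allocate a new sndbuf or RMB of the given size: get pages, build a
 * single-entry scatterlist, map it for DMA, and for an RMB also register
 * a memory region for remote access
 */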
564 static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
565 					       bool is_rmb, int bufsize)
566 {
567 	struct smc_buf_desc *buf_desc;
568 	struct smc_link *lnk;
569 	int rc;
570 
571 	/* try to alloc a new buffer */
572 	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
573 	if (!buf_desc)
574 		return ERR_PTR(-ENOMEM);
575 
576 	buf_desc->order = get_order(bufsize);
577 	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
578 				      __GFP_NOMEMALLOC | __GFP_COMP |
579 				      __GFP_NORETRY | __GFP_ZERO,
580 				      buf_desc->order);
581 	if (!buf_desc->pages) {
582 		kfree(buf_desc);
583 		return ERR_PTR(-EAGAIN);
584 	}
585 	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
586 
587 	/* build the sg table from the pages */
588 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
589 	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
590 			    GFP_KERNEL);
591 	if (rc) {
592 		smc_buf_free(buf_desc, lnk, is_rmb);
593 		return ERR_PTR(rc);
594 	}
595 	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
596 		   buf_desc->cpu_addr, bufsize);
597 
598 	/* map sg table to DMA address */
599 	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
600 			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
601 	/* SMC protocol depends on mapping to one DMA address only */
602 	if (rc != 1)  {
603 		smc_buf_free(buf_desc, lnk, is_rmb);
604 		return ERR_PTR(-EAGAIN);
605 	}
606 
607 	/* create a new memory region for the RMB */
608 	if (is_rmb) {
609 		rc = smc_ib_get_memory_region(lnk->roce_pd,
610 					      IB_ACCESS_REMOTE_WRITE |
611 					      IB_ACCESS_LOCAL_WRITE,
612 					      buf_desc);
613 		if (rc) {
614 			smc_buf_free(buf_desc, lnk, is_rmb);
615 			return ERR_PTR(rc);
616 		}
617 	}
618 
619 	return buf_desc;
620 }
621 
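/* create a send buffer or RMB for a connection: start with half the socket
 * buffer size, reuse a free slot of the link group if possible, otherwise
 * allocate a new buffer, trying smaller sizes when a larger allocation
 * cannot be satisfied
 */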
622 static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
623 {
624 	struct smc_connection *conn = &smc->conn;
625 	struct smc_link_group *lgr = conn->lgr;
626 	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
627 	struct list_head *buf_list;
628 	int bufsize, bufsize_short;
629 	int sk_buf_size;
630 	rwlock_t *lock;
631 
632 	if (is_rmb)
633 		/* use socket recv buffer size (w/o overhead) as start value */
634 		sk_buf_size = smc->sk.sk_rcvbuf / 2;
635 	else
636 		/* use socket send buffer size (w/o overhead) as start value */
637 		sk_buf_size = smc->sk.sk_sndbuf / 2;
638 
639 	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
640 	     bufsize_short >= 0; bufsize_short--) {
641 
642 		if (is_rmb) {
643 			lock = &lgr->rmbs_lock;
644 			buf_list = &lgr->rmbs[bufsize_short];
645 		} else {
646 			lock = &lgr->sndbufs_lock;
647 			buf_list = &lgr->sndbufs[bufsize_short];
648 		}
649 		bufsize = smc_uncompress_bufsize(bufsize_short);
650 		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
651 			continue;
652 
653 		/* check for reusable slot in the link group */
654 		buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list);
655 		if (buf_desc) {
656 			memset(buf_desc->cpu_addr, 0, bufsize);
657 			break; /* found reusable slot */
658 		}
659 
660 		buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
661 		if (PTR_ERR(buf_desc) == -ENOMEM)
662 			break;
663 		if (IS_ERR(buf_desc))
664 			continue;
665 
666 		buf_desc->used = 1;
667 		write_lock_bh(lock);
668 		list_add(&buf_desc->list, buf_list);
669 		write_unlock_bh(lock);
670 		break; /* found */
671 	}
672 
673 	if (IS_ERR(buf_desc))
674 		return -ENOMEM;
675 
676 	if (is_rmb) {
677 		conn->rmb_desc = buf_desc;
678 		conn->rmbe_size = bufsize;
679 		conn->rmbe_size_short = bufsize_short;
680 		smc->sk.sk_rcvbuf = bufsize * 2;
681 		atomic_set(&conn->bytes_to_rcv, 0);
682 		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
683 	} else {
684 		conn->sndbuf_desc = buf_desc;
685 		conn->sndbuf_size = bufsize;
686 		smc->sk.sk_sndbuf = bufsize * 2;
687 		atomic_set(&conn->sndbuf_space, bufsize);
688 	}
689 	return 0;
690 }
691 
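/* the following helpers sync the sndbuf or RMB DMA mapping for access by
 * the CPU or by the device
 */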
692 void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
693 {
694 	struct smc_link_group *lgr = conn->lgr;
695 
696 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
697 			       conn->sndbuf_desc, DMA_TO_DEVICE);
698 }
699 
700 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
701 {
702 	struct smc_link_group *lgr = conn->lgr;
703 
704 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
705 				  conn->sndbuf_desc, DMA_TO_DEVICE);
706 }
707 
708 void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
709 {
710 	struct smc_link_group *lgr = conn->lgr;
711 
712 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
713 			       conn->rmb_desc, DMA_FROM_DEVICE);
714 }
715 
716 void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
717 {
718 	struct smc_link_group *lgr = conn->lgr;
719 
720 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
721 				  conn->rmb_desc, DMA_FROM_DEVICE);
722 }
723 
724 /* create the send and receive buffer for an SMC socket;
725  * receive buffers are called RMBs;
726  * (even though the SMC protocol allows more than one RMB-element per RMB,
727  * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
728  * extra RMB for every connection in a link group)
729  */
730 int smc_buf_create(struct smc_sock *smc)
731 {
732 	int rc;
733 
734 	/* create send buffer */
735 	rc = __smc_buf_create(smc, false);
736 	if (rc)
737 		return rc;
738 	/* create rmb */
739 	rc = __smc_buf_create(smc, true);
740 	if (rc)
741 		smc_buf_free(smc->conn.sndbuf_desc,
742 			     &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false);
743 	return rc;
744 }
745 
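/* reserve a free slot in the link group's rtoken array */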
746 static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
747 {
748 	int i;
749 
750 	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
751 		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
752 			return i;
753 	}
754 	return -ENOSPC;
755 }
756 
757 /* add a new rtoken from peer */
758 int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
759 {
760 	u64 dma_addr = be64_to_cpu(nw_vaddr);
761 	u32 rkey = ntohl(nw_rkey);
762 	int i;
763 
764 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
765 		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
766 		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
767 		    test_bit(i, lgr->rtokens_used_mask)) {
768 			/* already in list */
769 			return i;
770 		}
771 	}
772 	i = smc_rmb_reserve_rtoken_idx(lgr);
773 	if (i < 0)
774 		return i;
775 	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
776 	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
777 	return i;
778 }
779 
780 /* delete an rtoken */
781 int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
782 {
783 	u32 rkey = ntohl(nw_rkey);
784 	int i;
785 
786 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
787 		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
788 		    test_bit(i, lgr->rtokens_used_mask)) {
789 			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
790 			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;
791 
792 			clear_bit(i, lgr->rtokens_used_mask);
793 			return 0;
794 		}
795 	}
796 	return -ENOENT;
797 }
798 
799 /* save rkey and dma_addr received from peer during clc handshake */
800 int smc_rmb_rtoken_handling(struct smc_connection *conn,
801 			    struct smc_clc_msg_accept_confirm *clc)
802 {
803 	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
804 					  clc->rmb_rkey);
805 	if (conn->rtoken_idx < 0)
806 		return conn->rtoken_idx;
807 	return 0;
808 }
809