xref: /linux/net/smc/smc_core.c (revision b9b77222d4ff6b5bb8f5d87fca20de0910618bb9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  Basic Transport Functions exploiting Infiniband API
6  *
7  *  Copyright IBM Corp. 2016
8  *
9  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
10  */
11 
12 #include <linux/socket.h>
13 #include <linux/if_vlan.h>
14 #include <linux/random.h>
15 #include <linux/workqueue.h>
16 #include <net/tcp.h>
17 #include <net/sock.h>
18 #include <rdma/ib_verbs.h>
19 
20 #include "smc.h"
21 #include "smc_clc.h"
22 #include "smc_core.h"
23 #include "smc_ib.h"
24 #include "smc_wr.h"
25 #include "smc_llc.h"
26 #include "smc_cdc.h"
27 #include "smc_close.h"
28 
29 #define SMC_LGR_NUM_INCR		256
30 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
31 #define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
32 
33 static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
34 	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
35 	.list = LIST_HEAD_INIT(smc_lgr_list.list),
36 	.num = 0,
37 };
38 
39 static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
40 			 struct smc_buf_desc *buf_desc);
41 
42 static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
43 {
44 	/* client link group creation always follows the server link group
45 	 * creation. For client use a somewhat higher removal delay time,
46 	 * otherwise there is a risk of out-of-sync link groups.
47 	 */
48 	mod_delayed_work(system_wq, &lgr->free_work,
49 			 lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
50 						 SMC_LGR_FREE_DELAY_SERV);
51 }
52 
53 /* Register connection's alert token in our lookup structure.
54  * To use rbtrees we have to implement our own insert core.
55  * Requires @conns_lock
56  * @smc		connection to register
57  * Returns 0 on success, != otherwise.
58  */
59 static void smc_lgr_add_alert_token(struct smc_connection *conn)
60 {
61 	struct rb_node **link, *parent = NULL;
62 	u32 token = conn->alert_token_local;
63 
64 	link = &conn->lgr->conns_all.rb_node;
65 	while (*link) {
66 		struct smc_connection *cur = rb_entry(*link,
67 					struct smc_connection, alert_node);
68 
69 		parent = *link;
70 		if (cur->alert_token_local > token)
71 			link = &parent->rb_left;
72 		else
73 			link = &parent->rb_right;
74 	}
75 	/* Put the new node there */
76 	rb_link_node(&conn->alert_node, parent, link);
77 	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
78 }
79 
80 /* Register connection in link group by assigning an alert token
81  * registered in a search tree.
82  * Requires @conns_lock
83  * Note that '0' is a reserved value and not assigned.
84  */
85 static void smc_lgr_register_conn(struct smc_connection *conn)
86 {
87 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
88 	static atomic_t nexttoken = ATOMIC_INIT(0);
89 
90 	/* find a new alert_token_local value not yet used by some connection
91 	 * in this link group
92 	 */
93 	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
94 	while (!conn->alert_token_local) {
95 		conn->alert_token_local = atomic_inc_return(&nexttoken);
96 		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
97 			conn->alert_token_local = 0;
98 	}
99 	smc_lgr_add_alert_token(conn);
100 	conn->lgr->conns_num++;
101 }
102 
103 /* Unregister connection and reset the alert token of the given connection<
104  */
105 static void __smc_lgr_unregister_conn(struct smc_connection *conn)
106 {
107 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
108 	struct smc_link_group *lgr = conn->lgr;
109 
110 	rb_erase(&conn->alert_node, &lgr->conns_all);
111 	lgr->conns_num--;
112 	conn->alert_token_local = 0;
113 	conn->lgr = NULL;
114 	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
115 }
116 
117 /* Unregister connection and trigger lgr freeing if applicable
118  */
119 static void smc_lgr_unregister_conn(struct smc_connection *conn)
120 {
121 	struct smc_link_group *lgr = conn->lgr;
122 	int reduced = 0;
123 
124 	write_lock_bh(&lgr->conns_lock);
125 	if (conn->alert_token_local) {
126 		reduced = 1;
127 		__smc_lgr_unregister_conn(conn);
128 	}
129 	write_unlock_bh(&lgr->conns_lock);
130 	if (!reduced || lgr->conns_num)
131 		return;
132 	smc_lgr_schedule_free_work(lgr);
133 }
134 
135 static void smc_lgr_free_work(struct work_struct *work)
136 {
137 	struct smc_link_group *lgr = container_of(to_delayed_work(work),
138 						  struct smc_link_group,
139 						  free_work);
140 	bool conns;
141 
142 	spin_lock_bh(&smc_lgr_list.lock);
143 	if (list_empty(&lgr->list))
144 		goto free;
145 	read_lock_bh(&lgr->conns_lock);
146 	conns = RB_EMPTY_ROOT(&lgr->conns_all);
147 	read_unlock_bh(&lgr->conns_lock);
148 	if (!conns) { /* number of lgr connections is no longer zero */
149 		spin_unlock_bh(&smc_lgr_list.lock);
150 		return;
151 	}
152 	list_del_init(&lgr->list); /* remove from smc_lgr_list */
153 free:
154 	spin_unlock_bh(&smc_lgr_list.lock);
155 	if (!delayed_work_pending(&lgr->free_work)) {
156 		if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
157 			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
158 		smc_lgr_free(lgr);
159 	}
160 }
161 
162 /* create a new SMC link group */
163 static int smc_lgr_create(struct smc_sock *smc,
164 			  struct smc_ib_device *smcibdev, u8 ibport,
165 			  char *peer_systemid, unsigned short vlan_id)
166 {
167 	struct smc_link_group *lgr;
168 	struct smc_link *lnk;
169 	u8 rndvec[3];
170 	int rc = 0;
171 	int i;
172 
173 	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
174 	if (!lgr) {
175 		rc = -ENOMEM;
176 		goto out;
177 	}
178 	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
179 	lgr->sync_err = 0;
180 	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
181 	lgr->vlan_id = vlan_id;
182 	rwlock_init(&lgr->sndbufs_lock);
183 	rwlock_init(&lgr->rmbs_lock);
184 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
185 		INIT_LIST_HEAD(&lgr->sndbufs[i]);
186 		INIT_LIST_HEAD(&lgr->rmbs[i]);
187 	}
188 	smc_lgr_list.num += SMC_LGR_NUM_INCR;
189 	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
190 	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
191 	lgr->conns_all = RB_ROOT;
192 
193 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
194 	/* initialize link */
195 	lnk->state = SMC_LNK_ACTIVATING;
196 	lnk->link_id = SMC_SINGLE_LINK;
197 	lnk->smcibdev = smcibdev;
198 	lnk->ibport = ibport;
199 	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
200 	if (!smcibdev->initialized)
201 		smc_ib_setup_per_ibdev(smcibdev);
202 	get_random_bytes(rndvec, sizeof(rndvec));
203 	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
204 	rc = smc_llc_link_init(lnk);
205 	if (rc)
206 		goto free_lgr;
207 	rc = smc_wr_alloc_link_mem(lnk);
208 	if (rc)
209 		goto clear_llc_lnk;
210 	rc = smc_ib_create_protection_domain(lnk);
211 	if (rc)
212 		goto free_link_mem;
213 	rc = smc_ib_create_queue_pair(lnk);
214 	if (rc)
215 		goto dealloc_pd;
216 	rc = smc_wr_create_link(lnk);
217 	if (rc)
218 		goto destroy_qp;
219 
220 	smc->conn.lgr = lgr;
221 	rwlock_init(&lgr->conns_lock);
222 	spin_lock_bh(&smc_lgr_list.lock);
223 	list_add(&lgr->list, &smc_lgr_list.list);
224 	spin_unlock_bh(&smc_lgr_list.lock);
225 	return 0;
226 
227 destroy_qp:
228 	smc_ib_destroy_queue_pair(lnk);
229 dealloc_pd:
230 	smc_ib_dealloc_protection_domain(lnk);
231 free_link_mem:
232 	smc_wr_free_link_mem(lnk);
233 clear_llc_lnk:
234 	smc_llc_link_clear(lnk);
235 free_lgr:
236 	kfree(lgr);
237 out:
238 	return rc;
239 }
240 
241 static void smc_buf_unuse(struct smc_connection *conn)
242 {
243 	if (conn->sndbuf_desc)
244 		conn->sndbuf_desc->used = 0;
245 	if (conn->rmb_desc) {
246 		if (!conn->rmb_desc->regerr) {
247 			conn->rmb_desc->reused = 1;
248 			conn->rmb_desc->used = 0;
249 		} else {
250 			/* buf registration failed, reuse not possible */
251 			struct smc_link_group *lgr = conn->lgr;
252 
253 			write_lock_bh(&lgr->rmbs_lock);
254 			list_del(&conn->rmb_desc->list);
255 			write_unlock_bh(&lgr->rmbs_lock);
256 
257 			smc_buf_free(lgr, true, conn->rmb_desc);
258 		}
259 	}
260 }
261 
262 /* remove a finished connection from its link group */
263 void smc_conn_free(struct smc_connection *conn)
264 {
265 	if (!conn->lgr)
266 		return;
267 	smc_cdc_tx_dismiss_slots(conn);
268 	smc_lgr_unregister_conn(conn);
269 	smc_buf_unuse(conn);
270 }
271 
272 static void smc_link_clear(struct smc_link *lnk)
273 {
274 	lnk->peer_qpn = 0;
275 	smc_llc_link_clear(lnk);
276 	smc_ib_modify_qp_reset(lnk);
277 	smc_wr_free_link(lnk);
278 	smc_ib_destroy_queue_pair(lnk);
279 	smc_ib_dealloc_protection_domain(lnk);
280 	smc_wr_free_link_mem(lnk);
281 }
282 
283 static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
284 			 struct smc_buf_desc *buf_desc)
285 {
286 	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
287 
288 	if (is_rmb) {
289 		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
290 			smc_ib_put_memory_region(
291 					buf_desc->mr_rx[SMC_SINGLE_LINK]);
292 		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
293 				    DMA_FROM_DEVICE);
294 	} else {
295 		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
296 				    DMA_TO_DEVICE);
297 	}
298 	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
299 	if (buf_desc->pages)
300 		__free_pages(buf_desc->pages, buf_desc->order);
301 	kfree(buf_desc);
302 }
303 
304 static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
305 {
306 	struct smc_buf_desc *buf_desc, *bf_desc;
307 	struct list_head *buf_list;
308 	int i;
309 
310 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
311 		if (is_rmb)
312 			buf_list = &lgr->rmbs[i];
313 		else
314 			buf_list = &lgr->sndbufs[i];
315 		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
316 					 list) {
317 			list_del(&buf_desc->list);
318 			smc_buf_free(lgr, is_rmb, buf_desc);
319 		}
320 	}
321 }
322 
323 static void smc_lgr_free_bufs(struct smc_link_group *lgr)
324 {
325 	/* free send buffers */
326 	__smc_lgr_free_bufs(lgr, false);
327 	/* free rmbs */
328 	__smc_lgr_free_bufs(lgr, true);
329 }
330 
331 /* remove a link group */
332 void smc_lgr_free(struct smc_link_group *lgr)
333 {
334 	smc_lgr_free_bufs(lgr);
335 	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
336 	kfree(lgr);
337 }
338 
339 void smc_lgr_forget(struct smc_link_group *lgr)
340 {
341 	spin_lock_bh(&smc_lgr_list.lock);
342 	/* do not use this link group for new connections */
343 	if (!list_empty(&lgr->list))
344 		list_del_init(&lgr->list);
345 	spin_unlock_bh(&smc_lgr_list.lock);
346 }
347 
348 /* terminate linkgroup abnormally */
349 static void __smc_lgr_terminate(struct smc_link_group *lgr)
350 {
351 	struct smc_connection *conn;
352 	struct smc_sock *smc;
353 	struct rb_node *node;
354 
355 	if (lgr->terminating)
356 		return;	/* lgr already terminating */
357 	lgr->terminating = 1;
358 	if (!list_empty(&lgr->list)) /* forget lgr */
359 		list_del_init(&lgr->list);
360 	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
361 
362 	write_lock_bh(&lgr->conns_lock);
363 	node = rb_first(&lgr->conns_all);
364 	while (node) {
365 		conn = rb_entry(node, struct smc_connection, alert_node);
366 		smc = container_of(conn, struct smc_sock, conn);
367 		sock_hold(&smc->sk); /* sock_put in close work */
368 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
369 		__smc_lgr_unregister_conn(conn);
370 		write_unlock_bh(&lgr->conns_lock);
371 		if (!schedule_work(&conn->close_work))
372 			sock_put(&smc->sk);
373 		write_lock_bh(&lgr->conns_lock);
374 		node = rb_first(&lgr->conns_all);
375 	}
376 	write_unlock_bh(&lgr->conns_lock);
377 	wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
378 	smc_lgr_schedule_free_work(lgr);
379 }
380 
381 void smc_lgr_terminate(struct smc_link_group *lgr)
382 {
383 	spin_lock_bh(&smc_lgr_list.lock);
384 	__smc_lgr_terminate(lgr);
385 	spin_unlock_bh(&smc_lgr_list.lock);
386 }
387 
388 /* Called when IB port is terminated */
389 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
390 {
391 	struct smc_link_group *lgr, *l;
392 
393 	spin_lock_bh(&smc_lgr_list.lock);
394 	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
395 		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
396 		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
397 			__smc_lgr_terminate(lgr);
398 	}
399 	spin_unlock_bh(&smc_lgr_list.lock);
400 }
401 
402 /* Determine vlan of internal TCP socket.
403  * @vlan_id: address to store the determined vlan id into
404  */
405 static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
406 {
407 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
408 	struct net_device *ndev;
409 	int i, nest_lvl, rc = 0;
410 
411 	*vlan_id = 0;
412 	if (!dst) {
413 		rc = -ENOTCONN;
414 		goto out;
415 	}
416 	if (!dst->dev) {
417 		rc = -ENODEV;
418 		goto out_rel;
419 	}
420 
421 	ndev = dst->dev;
422 	if (is_vlan_dev(ndev)) {
423 		*vlan_id = vlan_dev_vlan_id(ndev);
424 		goto out_rel;
425 	}
426 
427 	rtnl_lock();
428 	nest_lvl = dev_get_nest_level(ndev);
429 	for (i = 0; i < nest_lvl; i++) {
430 		struct list_head *lower = &ndev->adj_list.lower;
431 
432 		if (list_empty(lower))
433 			break;
434 		lower = lower->next;
435 		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
436 		if (is_vlan_dev(ndev)) {
437 			*vlan_id = vlan_dev_vlan_id(ndev);
438 			break;
439 		}
440 	}
441 	rtnl_unlock();
442 
443 out_rel:
444 	dst_release(dst);
445 out:
446 	return rc;
447 }
448 
449 /* determine the link gid matching the vlan id of the link group */
450 static int smc_link_determine_gid(struct smc_link_group *lgr)
451 {
452 	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
453 	struct ib_gid_attr gattr;
454 	union ib_gid gid;
455 	int i;
456 
457 	if (!lgr->vlan_id) {
458 		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
459 		return 0;
460 	}
461 
462 	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
463 	     i++) {
464 		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
465 				 &gattr))
466 			continue;
467 		if (gattr.ndev) {
468 			if (is_vlan_dev(gattr.ndev) &&
469 			    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
470 				lnk->gid = gid;
471 				dev_put(gattr.ndev);
472 				return 0;
473 			}
474 			dev_put(gattr.ndev);
475 		}
476 	}
477 	return -ENODEV;
478 }
479 
480 /* create a new SMC connection (and a new link group if necessary) */
481 int smc_conn_create(struct smc_sock *smc,
482 		    struct smc_ib_device *smcibdev, u8 ibport,
483 		    struct smc_clc_msg_local *lcl, int srv_first_contact)
484 {
485 	struct smc_connection *conn = &smc->conn;
486 	int local_contact = SMC_FIRST_CONTACT;
487 	struct smc_link_group *lgr;
488 	unsigned short vlan_id;
489 	enum smc_lgr_role role;
490 	int rc = 0;
491 
492 	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
493 	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
494 	if (rc)
495 		return rc;
496 
497 	if ((role == SMC_CLNT) && srv_first_contact)
498 		/* create new link group as well */
499 		goto create;
500 
501 	/* determine if an existing link group can be reused */
502 	spin_lock_bh(&smc_lgr_list.lock);
503 	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
504 		write_lock_bh(&lgr->conns_lock);
505 		if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
506 			    SMC_SYSTEMID_LEN) &&
507 		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
508 			    SMC_GID_SIZE) &&
509 		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
510 			    sizeof(lcl->mac)) &&
511 		    !lgr->sync_err &&
512 		    (lgr->role == role) &&
513 		    (lgr->vlan_id == vlan_id) &&
514 		    ((role == SMC_CLNT) ||
515 		     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
516 			/* link group found */
517 			local_contact = SMC_REUSE_CONTACT;
518 			conn->lgr = lgr;
519 			smc_lgr_register_conn(conn); /* add smc conn to lgr */
520 			write_unlock_bh(&lgr->conns_lock);
521 			break;
522 		}
523 		write_unlock_bh(&lgr->conns_lock);
524 	}
525 	spin_unlock_bh(&smc_lgr_list.lock);
526 
527 	if (role == SMC_CLNT && !srv_first_contact &&
528 	    (local_contact == SMC_FIRST_CONTACT)) {
529 		/* Server reuses a link group, but Client wants to start
530 		 * a new one
531 		 * send out_of_sync decline, reason synchr. error
532 		 */
533 		return -ENOLINK;
534 	}
535 
536 create:
537 	if (local_contact == SMC_FIRST_CONTACT) {
538 		rc = smc_lgr_create(smc, smcibdev, ibport,
539 				    lcl->id_for_peer, vlan_id);
540 		if (rc)
541 			goto out;
542 		smc_lgr_register_conn(conn); /* add smc conn to lgr */
543 		rc = smc_link_determine_gid(conn->lgr);
544 	}
545 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
546 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
547 	conn->urg_state = SMC_URG_READ;
548 #ifndef KERNEL_HAS_ATOMIC64
549 	spin_lock_init(&conn->acurs_lock);
550 #endif
551 
552 out:
553 	return rc ? rc : local_contact;
554 }
555 
556 /* convert the RMB size into the compressed notation - minimum 16K.
557  * In contrast to plain ilog2, this rounds towards the next power of 2,
558  * so the socket application gets at least its desired sndbuf / rcvbuf size.
559  */
560 static u8 smc_compress_bufsize(int size)
561 {
562 	u8 compressed;
563 
564 	if (size <= SMC_BUF_MIN_SIZE)
565 		return 0;
566 
567 	size = (size - 1) >> 14;
568 	compressed = ilog2(size) + 1;
569 	if (compressed >= SMC_RMBE_SIZES)
570 		compressed = SMC_RMBE_SIZES - 1;
571 	return compressed;
572 }
573 
574 /* convert the RMB size from compressed notation into integer */
575 int smc_uncompress_bufsize(u8 compressed)
576 {
577 	u32 size;
578 
579 	size = 0x00000001 << (((int)compressed) + 14);
580 	return (int)size;
581 }
582 
583 /* try to reuse a sndbuf or rmb description slot for a certain
584  * buffer size; if not available, return NULL
585  */
586 static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
587 					     rwlock_t *lock,
588 					     struct list_head *buf_list)
589 {
590 	struct smc_buf_desc *buf_slot;
591 
592 	read_lock_bh(lock);
593 	list_for_each_entry(buf_slot, buf_list, list) {
594 		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
595 			read_unlock_bh(lock);
596 			return buf_slot;
597 		}
598 	}
599 	read_unlock_bh(lock);
600 	return NULL;
601 }
602 
603 /* one of the conditions for announcing a receiver's current window size is
604  * that it "results in a minimum increase in the window size of 10% of the
605  * receive buffer space" [RFC7609]
606  */
607 static inline int smc_rmb_wnd_update_limit(int rmbe_size)
608 {
609 	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
610 }
611 
612 static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
613 					       bool is_rmb, int bufsize)
614 {
615 	struct smc_buf_desc *buf_desc;
616 	struct smc_link *lnk;
617 	int rc;
618 
619 	/* try to alloc a new buffer */
620 	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
621 	if (!buf_desc)
622 		return ERR_PTR(-ENOMEM);
623 
624 	buf_desc->order = get_order(bufsize);
625 	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
626 				      __GFP_NOMEMALLOC | __GFP_COMP |
627 				      __GFP_NORETRY | __GFP_ZERO,
628 				      buf_desc->order);
629 	if (!buf_desc->pages) {
630 		kfree(buf_desc);
631 		return ERR_PTR(-EAGAIN);
632 	}
633 	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
634 
635 	/* build the sg table from the pages */
636 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
637 	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
638 			    GFP_KERNEL);
639 	if (rc) {
640 		smc_buf_free(lgr, is_rmb, buf_desc);
641 		return ERR_PTR(rc);
642 	}
643 	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
644 		   buf_desc->cpu_addr, bufsize);
645 
646 	/* map sg table to DMA address */
647 	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
648 			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
649 	/* SMC protocol depends on mapping to one DMA address only */
650 	if (rc != 1)  {
651 		smc_buf_free(lgr, is_rmb, buf_desc);
652 		return ERR_PTR(-EAGAIN);
653 	}
654 
655 	/* create a new memory region for the RMB */
656 	if (is_rmb) {
657 		rc = smc_ib_get_memory_region(lnk->roce_pd,
658 					      IB_ACCESS_REMOTE_WRITE |
659 					      IB_ACCESS_LOCAL_WRITE,
660 					      buf_desc);
661 		if (rc) {
662 			smc_buf_free(lgr, is_rmb, buf_desc);
663 			return ERR_PTR(rc);
664 		}
665 	}
666 
667 	buf_desc->len = bufsize;
668 	return buf_desc;
669 }
670 
671 static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
672 {
673 	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
674 	struct smc_connection *conn = &smc->conn;
675 	struct smc_link_group *lgr = conn->lgr;
676 	struct list_head *buf_list;
677 	int bufsize, bufsize_short;
678 	int sk_buf_size;
679 	rwlock_t *lock;
680 
681 	if (is_rmb)
682 		/* use socket recv buffer size (w/o overhead) as start value */
683 		sk_buf_size = smc->sk.sk_rcvbuf / 2;
684 	else
685 		/* use socket send buffer size (w/o overhead) as start value */
686 		sk_buf_size = smc->sk.sk_sndbuf / 2;
687 
688 	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
689 	     bufsize_short >= 0; bufsize_short--) {
690 
691 		if (is_rmb) {
692 			lock = &lgr->rmbs_lock;
693 			buf_list = &lgr->rmbs[bufsize_short];
694 		} else {
695 			lock = &lgr->sndbufs_lock;
696 			buf_list = &lgr->sndbufs[bufsize_short];
697 		}
698 		bufsize = smc_uncompress_bufsize(bufsize_short);
699 		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
700 			continue;
701 
702 		/* check for reusable slot in the link group */
703 		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
704 		if (buf_desc) {
705 			memset(buf_desc->cpu_addr, 0, bufsize);
706 			break; /* found reusable slot */
707 		}
708 
709 		buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
710 		if (PTR_ERR(buf_desc) == -ENOMEM)
711 			break;
712 		if (IS_ERR(buf_desc))
713 			continue;
714 
715 		buf_desc->used = 1;
716 		write_lock_bh(lock);
717 		list_add(&buf_desc->list, buf_list);
718 		write_unlock_bh(lock);
719 		break; /* found */
720 	}
721 
722 	if (IS_ERR(buf_desc))
723 		return -ENOMEM;
724 
725 	if (is_rmb) {
726 		conn->rmb_desc = buf_desc;
727 		conn->rmbe_size_short = bufsize_short;
728 		smc->sk.sk_rcvbuf = bufsize * 2;
729 		atomic_set(&conn->bytes_to_rcv, 0);
730 		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
731 	} else {
732 		conn->sndbuf_desc = buf_desc;
733 		smc->sk.sk_sndbuf = bufsize * 2;
734 		atomic_set(&conn->sndbuf_space, bufsize);
735 	}
736 	return 0;
737 }
738 
739 void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
740 {
741 	struct smc_link_group *lgr = conn->lgr;
742 
743 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
744 			       conn->sndbuf_desc, DMA_TO_DEVICE);
745 }
746 
747 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
748 {
749 	struct smc_link_group *lgr = conn->lgr;
750 
751 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
752 				  conn->sndbuf_desc, DMA_TO_DEVICE);
753 }
754 
755 void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
756 {
757 	struct smc_link_group *lgr = conn->lgr;
758 
759 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
760 			       conn->rmb_desc, DMA_FROM_DEVICE);
761 }
762 
763 void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
764 {
765 	struct smc_link_group *lgr = conn->lgr;
766 
767 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
768 				  conn->rmb_desc, DMA_FROM_DEVICE);
769 }
770 
771 /* create the send and receive buffer for an SMC socket;
772  * receive buffers are called RMBs;
773  * (even though the SMC protocol allows more than one RMB-element per RMB,
774  * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
775  * extra RMB for every connection in a link group
776  */
777 int smc_buf_create(struct smc_sock *smc)
778 {
779 	int rc;
780 
781 	/* create send buffer */
782 	rc = __smc_buf_create(smc, false);
783 	if (rc)
784 		return rc;
785 	/* create rmb */
786 	rc = __smc_buf_create(smc, true);
787 	if (rc)
788 		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
789 	return rc;
790 }
791 
792 static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
793 {
794 	int i;
795 
796 	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
797 		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
798 			return i;
799 	}
800 	return -ENOSPC;
801 }
802 
803 /* add a new rtoken from peer */
804 int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
805 {
806 	u64 dma_addr = be64_to_cpu(nw_vaddr);
807 	u32 rkey = ntohl(nw_rkey);
808 	int i;
809 
810 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
811 		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
812 		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
813 		    test_bit(i, lgr->rtokens_used_mask)) {
814 			/* already in list */
815 			return i;
816 		}
817 	}
818 	i = smc_rmb_reserve_rtoken_idx(lgr);
819 	if (i < 0)
820 		return i;
821 	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
822 	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
823 	return i;
824 }
825 
826 /* delete an rtoken */
827 int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
828 {
829 	u32 rkey = ntohl(nw_rkey);
830 	int i;
831 
832 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
833 		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
834 		    test_bit(i, lgr->rtokens_used_mask)) {
835 			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
836 			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;
837 
838 			clear_bit(i, lgr->rtokens_used_mask);
839 			return 0;
840 		}
841 	}
842 	return -ENOENT;
843 }
844 
845 /* save rkey and dma_addr received from peer during clc handshake */
846 int smc_rmb_rtoken_handling(struct smc_connection *conn,
847 			    struct smc_clc_msg_accept_confirm *clc)
848 {
849 	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
850 					  clc->rmb_rkey);
851 	if (conn->rtoken_idx < 0)
852 		return conn->rtoken_idx;
853 	return 0;
854 }
855 
856 /* Called (from smc_exit) when module is removed */
857 void smc_core_exit(void)
858 {
859 	struct smc_link_group *lgr, *lg;
860 	LIST_HEAD(lgr_freeing_list);
861 
862 	spin_lock_bh(&smc_lgr_list.lock);
863 	if (!list_empty(&smc_lgr_list.list))
864 		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
865 	spin_unlock_bh(&smc_lgr_list.lock);
866 	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
867 		list_del_init(&lgr->list);
868 		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
869 		cancel_delayed_work_sync(&lgr->free_work);
870 		smc_lgr_free(lgr); /* free link group */
871 	}
872 }
873