// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Basic Transport Functions exploiting Infiniband API
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/reboot.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR		256	/* link group id increment */
#define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)	/* 10 minutes */
#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
#define SMC_LGR_FREE_DELAY_FAST		(8 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);
static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);

/* return head of link group list and its lock for a given link group */
static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
						  spinlock_t **lgr_lock)
{
	if (lgr->is_smcd) {
		*lgr_lock = &lgr->smcd->lgr_lock;
		return &lgr->smcd->lgr_list;
	}

	*lgr_lock = &smc_lgr_list.lock;
	return &smc_lgr_list.list;
}

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	if (!lgr->freeing && !lgr->freefast) {
		mod_delayed_work(system_wq, &lgr->free_work,
				 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
						SMC_LGR_FREE_DELAY_CLNT :
						SMC_LGR_FREE_DELAY_SERV);
	}
}

void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
	if (!lgr->freeing && !lgr->freefast) {
		lgr->freefast = 1;
		mod_delayed_work(system_wq, &lgr->free_work,
				 SMC_LGR_FREE_DELAY_FAST);
	}
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}
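
/* Illustrative sketch (hypothetical, not the verbatim implementation): the
 * lookup that pairs with the insert above, as smc_lgr_find_conn() used in
 * smc_lgr_register_conn() below can be expected to perform it:
 *
 *	struct rb_node *node = lgr->conns_all.rb_node;
 *
 *	while (node) {
 *		struct smc_connection *cur =
 *			rb_entry(node, struct smc_connection, alert_node);
 *
 *		if (cur->alert_token_local > token)
 *			node = node->rb_left;
 *		else if (cur->alert_token_local < token)
 *			node = node->rb_right;
 *		else
 *			return cur;	(token found)
 *	}
 *	return NULL;	(token unknown in this link group)
 *
 * Duplicate keys would fall to the right in the insert above, but tokens
 * are unique within a link group by construction, see
 * smc_lgr_register_conn().
 */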

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static int smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);

	/* assign the new connection to a link */
	if (!conn->lgr->is_smcd) {
		struct smc_link *lnk;
		int i;

		/* tbd - link balancing */
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			lnk = &conn->lgr->lnk[i];
			if (lnk->state == SMC_LNK_ACTIVATING ||
			    lnk->state == SMC_LNK_ACTIVE)
				conn->lnk = lnk;
		}
		if (!conn->lnk)
			return SMC_CLC_DECL_NOACTLINK;
	}
	conn->lgr->conns_num++;
	return 0;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection from lgr
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	conn->lgr = NULL;
}

void smc_lgr_cleanup_early(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;

	smc_conn_free(conn);
	smc_lgr_forget(lgr);
	smc_lgr_schedule_free_work_fast(lgr);
}

/* Send delete link, either as client to request the initiation
 * of the DELETE LINK sequence from server; or as server to
 * initiate the delete processing. See smc_llc_rx_delete_link().
 */
static int smcr_link_send_delete(struct smc_link *lnk, bool orderly)
{
	if (lnk->state == SMC_LNK_ACTIVE &&
	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) {
		return 0;
	}
	return -ENOTCONN;
}

static void smc_lgr_free(struct smc_link_group *lgr);

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	spinlock_t *lgr_lock;
	bool conns;
	int i;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	if (lgr->freeing) {
		spin_unlock_bh(lgr_lock);
		return;
	}
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(lgr_lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */

	if (!lgr->is_smcd && !lgr->terminating) {
		bool do_wait = false;

		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *lnk = &lgr->lnk[i];
			/* try to send del link msg, on err free immediately */
			if (lnk->state == SMC_LNK_ACTIVE &&
			    !smcr_link_send_delete(lnk, true)) {
				/* reschedule in case we never receive a resp */
				smc_lgr_schedule_free_work(lgr);
				do_wait = true;
			}
		}
		if (do_wait) {
			spin_unlock_bh(lgr_lock);
			return; /* wait for resp, see smc_llc_rx_delete_link */
		}
	}
	lgr->freeing = 1; /* this instance does the freeing, no new schedule */
	spin_unlock_bh(lgr_lock);
	cancel_delayed_work(&lgr->free_work);

	if (lgr->is_smcd && !lgr->terminating)
		smc_ism_signal_shutdown(lgr);
	if (!lgr->is_smcd) {
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *lnk = &lgr->lnk[i];

			if (smc_link_usable(lnk))
				lnk->state = SMC_LNK_INACTIVE;
		}
		wake_up_interruptible_all(&lgr->llc_waiter);
	}
	smc_lgr_free(lgr);
}

static void smc_lgr_terminate_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(work, struct smc_link_group,
						  terminate_work);

	__smc_lgr_terminate(lgr, true);
}

/* return next unique link id for the lgr */
static u8 smcr_next_link_id(struct smc_link_group *lgr)
{
	u8 link_id;
	int i;

	while (1) {
again:
		link_id = ++lgr->next_link_id;
		if (!link_id)	/* skip zero as link_id */
			link_id = ++lgr->next_link_id;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (smc_link_usable(&lgr->lnk[i]) &&
			    lgr->lnk[i].link_id == link_id)
				/* id in use; a plain continue would only
				 * advance the inner loop and hand out the
				 * duplicate, so restart the search instead
				 */
				goto again;
		}
		break;
	}
	return link_id;
}
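
/* Example of the id assignment above (assuming next_link_id is a u8, as the
 * return type suggests): at 255 the increment wraps to 0, the skip-zero test
 * bumps it to 1, and 1 is handed out unless a usable link in this lgr
 * already carries it, in which case the search restarts at 2.
 */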

static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
			  u8 link_idx, struct smc_init_info *ini)
{
	u8 rndvec[3];
	int rc;

	get_device(&ini->ib_dev->ibdev->dev);
	atomic_inc(&ini->ib_dev->lnk_cnt);
	lnk->state = SMC_LNK_ACTIVATING;
	lnk->link_id = smcr_next_link_id(lgr);
	lnk->lgr = lgr;
	lnk->link_idx = link_idx;
	lnk->smcibdev = ini->ib_dev;
	lnk->ibport = ini->ib_port;
	lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
	if (!ini->ib_dev->initialized) {
		rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev);
		if (rc)
			goto out;
	}
	get_random_bytes(rndvec, sizeof(rndvec));
	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
		(rndvec[2] << 16);
	rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
				  ini->vlan_id, lnk->gid, &lnk->sgid_index);
	if (rc)
		goto out;
	rc = smc_llc_link_init(lnk);
	if (rc)
		goto out;
	rc = smc_wr_alloc_link_mem(lnk);
	if (rc)
		goto clear_llc_lnk;
	rc = smc_ib_create_protection_domain(lnk);
	if (rc)
		goto free_link_mem;
	rc = smc_ib_create_queue_pair(lnk);
	if (rc)
		goto dealloc_pd;
	rc = smc_wr_create_link(lnk);
	if (rc)
		goto destroy_qp;
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
out:
	put_device(&ini->ib_dev->ibdev->dev);
	memset(lnk, 0, sizeof(struct smc_link));
	lnk->state = SMC_LNK_UNUSED;
	if (!atomic_dec_return(&ini->ib_dev->lnk_cnt))
		wake_up(&ini->ib_dev->lnks_deleted);
	return rc;
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_link_group *lgr;
	struct list_head *lgr_list;
	struct smc_link *lnk;
	spinlock_t *lgr_lock;
	u8 link_idx;
	int rc = 0;
	int i;

	if (ini->is_smcd && ini->vlan_id) {
		if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
			rc = SMC_CLC_DECL_ISMVLANERR;
			goto out;
		}
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = SMC_CLC_DECL_MEM;
		goto ism_put_vlan;
	}
	lgr->is_smcd = ini->is_smcd;
	lgr->sync_err = 0;
	lgr->terminating = 0;
	lgr->freefast = 0;
	lgr->freeing = 0;
	lgr->vlan_id = ini->vlan_id;
	mutex_init(&lgr->sndbufs_lock);
	mutex_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	lgr->next_link_id = 0;
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
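	/* Illustration (assuming SMC_LGR_NUM_INCR of 256, SMC_LGR_ID_SIZE of
	 * 4 and a little-endian host): the first lgr gets num 0x100, i.e.
	 * the id bytes 00 01 00 00, the next one 00 02 00 00, and so on;
	 * the low-order byte stays zero because the counter advances in
	 * steps of 256.
	 */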
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
	lgr->conns_all = RB_ROOT;
	if (ini->is_smcd) {
		/* SMC-D specific settings */
		get_device(&ini->ism_dev->dev);
		lgr->peer_gid = ini->ism_gid;
		lgr->smcd = ini->ism_dev;
		lgr_list = &ini->ism_dev->lgr_list;
		lgr_lock = &lgr->smcd->lgr_lock;
		lgr->peer_shutdown = 0;
		atomic_inc(&ini->ism_dev->lgr_cnt);
	} else {
		/* SMC-R specific settings */
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
		       SMC_SYSTEMID_LEN);
		smc_llc_lgr_init(lgr, smc);

		link_idx = SMC_SINGLE_LINK;
		lnk = &lgr->lnk[link_idx];
		rc = smcr_link_init(lgr, lnk, link_idx, ini);
		if (rc)
			goto free_lgr;
		lgr_list = &smc_lgr_list.list;
		lgr_lock = &smc_lgr_list.lock;
		atomic_inc(&lgr_cnt);
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(lgr_lock);
	list_add(&lgr->list, lgr_list);
	spin_unlock_bh(lgr_lock);
	return 0;

free_lgr:
	kfree(lgr);
ism_put_vlan:
	if (ini->is_smcd && ini->vlan_id)
		smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
out:
	if (rc < 0) {
		if (rc == -ENOMEM)
			rc = SMC_CLC_DECL_MEM;
		else
			rc = SMC_CLC_DECL_INTERR;
	}
	return rc;
}

static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc,
			   struct smc_link_group *lgr)
{
	if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) {
		/* unregister rmb with peer */
		smc_llc_do_delete_rkey(lgr, rmb_desc);
		rmb_desc->is_conf_rkey = false;
	}
	if (rmb_desc->is_reg_err) {
		/* buf registration failed, reuse not possible */
		mutex_lock(&lgr->rmbs_lock);
		list_del(&rmb_desc->list);
		mutex_unlock(&lgr->rmbs_lock);

		smc_buf_free(lgr, true, rmb_desc);
	} else {
		rmb_desc->used = 0;
	}
}

static void smc_buf_unuse(struct smc_connection *conn,
			  struct smc_link_group *lgr)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc && lgr->is_smcd)
		conn->rmb_desc->used = 0;
	else if (conn->rmb_desc)
		smcr_buf_unuse(conn->rmb_desc, lgr);
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	if (lgr->is_smcd) {
		if (!list_empty(&lgr->list))
			smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	if (!list_empty(&lgr->list)) {
		smc_lgr_unregister_conn(conn);
		smc_buf_unuse(conn, lgr); /* allow buffer reuse */
	}

	if (!lgr->conns_num)
		smc_lgr_schedule_free_work(lgr);
}

static void smcr_link_clear(struct smc_link *lnk)
{
	struct smc_ib_device *smcibdev;

	if (lnk->peer_qpn == 0)
		return;
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
	put_device(&lnk->smcibdev->ibdev->dev);
	smcibdev = lnk->smcibdev;
	memset(lnk, 0, sizeof(struct smc_link));
	lnk->state = SMC_LNK_UNUSED;
	if (!atomic_dec_return(&smcibdev->lnk_cnt))
		wake_up(&smcibdev->lnks_deleted);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk;
	int i;

	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		lnk = &lgr->lnk[i];
		if (!buf_desc->is_map_ib[lnk->link_idx])
			continue;
		if (is_rmb) {
			if (buf_desc->mr_rx[lnk->link_idx])
				smc_ib_put_memory_region(
						buf_desc->mr_rx[lnk->link_idx]);
			smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE);
		} else {
			smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE);
		}
		sg_free_table(&buf_desc->sgt[lnk->link_idx]);
	}

	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
static void smc_lgr_free(struct smc_link_group *lgr)
{
	int i;

	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd) {
		if (!lgr->terminating) {
			smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
			put_device(&lgr->smcd->dev);
		}
		if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
			wake_up(&lgr->smcd->lgrs_deleted);
	} else {
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state != SMC_LNK_UNUSED)
				smcr_link_clear(&lgr->lnk[i]);
		}
		smc_llc_lgr_clear(lgr);
		if (!atomic_dec_return(&lgr_cnt))
			wake_up(&lgrs_deleted);
	}
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	struct list_head *lgr_list;
	spinlock_t *lgr_lock;

	lgr_list = smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	/* do not use this link group for new connections */
	if (!list_empty(lgr_list))
		list_del_init(lgr_list);
	spin_unlock_bh(lgr_lock);
}

static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
{
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		struct smc_buf_desc *buf_desc;

		list_for_each_entry(buf_desc, &lgr->rmbs[i], list) {
			buf_desc->len += sizeof(struct smcd_cdc_msg);
			smc_ism_unregister_dmb(lgr->smcd, buf_desc);
		}
	}
}

static void smc_sk_wake_ups(struct smc_sock *smc)
{
	smc->sk.sk_write_space(&smc->sk);
	smc->sk.sk_data_ready(&smc->sk);
	smc->sk.sk_state_change(&smc->sk);
}

/* kill a connection */
static void smc_conn_kill(struct smc_connection *conn, bool soft)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);

	if (conn->lgr->is_smcd && conn->lgr->peer_shutdown)
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
	else
		smc_close_abort(conn);
	conn->killed = 1;
	smc->sk.sk_err = ECONNABORTED;
	smc_sk_wake_ups(smc);
	if (conn->lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		if (soft)
			tasklet_kill(&conn->rx_tsklet);
		else
			tasklet_unlock_wait(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);
	smc_close_active_abort(smc);
}

static void smc_lgr_cleanup(struct smc_link_group *lgr)
{
	int i;

	if (lgr->is_smcd) {
		smc_ism_signal_shutdown(lgr);
		smcd_unregister_all_dmbs(lgr);
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
		put_device(&lgr->smcd->dev);
	} else {
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			struct smc_link *lnk = &lgr->lnk[i];

			if (smc_link_usable(lnk))
				lnk->state = SMC_LNK_INACTIVE;
		}
		wake_up_interruptible_all(&lgr->llc_waiter);
	}
}

/* terminate link group
 * @soft: true if link group shutdown can take its time
 *	  false if immediate link group shutdown is required
 */
static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	if (!soft)
		cancel_delayed_work_sync(&lgr->free_work);
	lgr->terminating = 1;

	/* kill remaining link group connections */
	read_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		read_unlock_bh(&lgr->conns_lock);
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put below */
		lock_sock(&smc->sk);
		smc_conn_kill(conn, soft);
		release_sock(&smc->sk);
		sock_put(&smc->sk); /* sock_hold above */
		read_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	read_unlock_bh(&lgr->conns_lock);
	smc_lgr_cleanup(lgr);
	if (soft)
		smc_lgr_schedule_free_work_fast(lgr);
	else
		smc_lgr_free(lgr);
}

/* unlink link group and schedule termination */
void smc_lgr_terminate_sched(struct smc_link_group *lgr)
{
	spinlock_t *lgr_lock;

	smc_lgr_list_head(lgr, &lgr_lock);
	spin_lock_bh(lgr_lock);
	if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) {
		spin_unlock_bh(lgr_lock);
		return;	/* lgr already terminating */
	}
	list_del_init(&lgr->list);
	spin_unlock_bh(lgr_lock);
	schedule_work(&lgr->terminate_work);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (lgr->is_smcd)
			continue;
		/* tbd - terminate only when no more links are active */
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (!smc_link_usable(&lgr->lnk[i]))
				continue;
			if (lgr->lnk[i].smcibdev == smcibdev &&
			    lgr->lnk[i].ibport == ibport) {
				list_move(&lgr->list, &lgr_free_list);
				lgr->freeing = 1;
			}
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr, false);
	}
}

/* Called when peer lgr shutdown (normally or abnormally) is received */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&dev->lgr_lock);
	list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
		if ((!peer_gid || lgr->peer_gid == peer_gid) &&
		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
			if (peer_gid) /* peer triggered termination */
				lgr->peer_shutdown = 1;
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&dev->lgr_lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		schedule_work(&lgr->terminate_work);
	}
}

/* Called when an SMCD device is removed or the smc module is unloaded */
void smc_smcd_terminate_all(struct smcd_dev *smcd)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_free_list);

	spin_lock_bh(&smcd->lgr_lock);
	list_splice_init(&smcd->lgr_list, &lgr_free_list);
	list_for_each_entry(lgr, &lgr_free_list, list)
		lgr->freeing = 1;
	spin_unlock_bh(&smcd->lgr_lock);

	list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr, false);
	}

	if (atomic_read(&smcd->lgr_cnt))
		wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
}

/* Called when an SMCR device is removed or the smc module is unloaded.
 * If smcibdev is given, all SMCR link groups using this device are terminated.
 * If smcibdev is NULL, all SMCR link groups are terminated.
 */
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_free_list);
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	if (!smcibdev) {
		list_splice_init(&smc_lgr_list.list, &lgr_free_list);
		list_for_each_entry(lgr, &lgr_free_list, list)
			lgr->freeing = 1;
	} else {
		list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
			for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
				if (lgr->lnk[i].smcibdev == smcibdev) {
					list_move(&lgr->list, &lgr_free_list);
					lgr->freeing = 1;
					break;
				}
			}
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		__smc_lgr_terminate(lgr, false);
	}

	if (smcibdev) {
		if (atomic_read(&smcibdev->lnk_cnt))
			wait_event(smcibdev->lnks_deleted,
				   !atomic_read(&smcibdev->lnk_cnt));
	} else {
		if (atomic_read(&lgr_cnt))
			wait_event(lgrs_deleted, !atomic_read(&lgr_cnt));
	}
}

/* Determine vlan of internal TCP socket.
 * @ini: init info to store the determined vlan id into (ini->vlan_id)
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	ini->vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		ini->vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = ndev->lower_level;
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			ini->vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role, u32 clcqpn)
{
	int i;

	if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) ||
	    lgr->role != role)
		return false;

	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (lgr->lnk[i].state != SMC_LNK_ACTIVE)
			continue;
		if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) &&
		    !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) &&
		    !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac)))
			return true;
	}
	return false;
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
	struct smc_connection *conn = &smc->conn;
	struct list_head *lgr_list;
	struct smc_link_group *lgr;
	enum smc_lgr_role role;
	spinlock_t *lgr_lock;
	int rc = 0;

	lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
	lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
	ini->cln_first_contact = SMC_FIRST_CONTACT;
	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	if (role == SMC_CLNT && ini->srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(lgr_lock);
	list_for_each_entry(lgr, lgr_list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((ini->is_smcd ?
		     smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == ini->vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			ini->cln_first_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			rc = smc_lgr_register_conn(conn); /* add conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			if (!rc && delayed_work_pending(&lgr->free_work))
				cancel_delayed_work(&lgr->free_work);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(lgr_lock);
	if (rc)
		return rc;

	if (role == SMC_CLNT && !ini->srv_first_contact &&
	    ini->cln_first_contact == SMC_FIRST_CONTACT) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return SMC_CLC_DECL_SYNCERR;
	}

create:
	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, ini);
		if (rc)
			goto out;
		lgr = conn->lgr;
		write_lock_bh(&lgr->conns_lock);
		rc = smc_lgr_register_conn(conn); /* add smc conn to lgr */
		write_unlock_bh(&lgr->conns_lock);
		if (rc)
			goto out;
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (ini->is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}
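
/* Worked round trip of the two helpers above (assuming SMC_BUF_MIN_SIZE is
 * 16KB and SMC_RMBE_SIZES is 16, as defined elsewhere):
 *
 *	smc_compress_bufsize(65536): (65536 - 1) >> 14 = 3, ilog2(3) + 1 = 2
 *	smc_uncompress_bufsize(2):   1 << (2 + 14) = 65536
 *
 *	smc_compress_bufsize(70000): (70000 - 1) >> 14 = 4, ilog2(4) + 1 = 3
 *	smc_uncompress_bufsize(3):   1 << (3 + 14) = 131072 >= 70000
 *
 * i.e. sizes that are not powers of 2 are rounded up, never down.
 */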

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     struct mutex *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	mutex_lock(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			mutex_unlock(lock);
			return buf_slot;
		}
	}
	mutex_unlock(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
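
/* Example: for a 64KB RMB the 10% rule alone would allow an update every
 * 6553 bytes, but the result is capped at SOCK_MIN_SNDBUF / 2 (roughly 2KB;
 * the exact value depends on the build's sk_buff size), so the cap wins
 * here.
 */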

/* map an rmb buf to a link */
static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
			     struct smc_link *lnk)
{
	int rc;

	if (buf_desc->is_map_ib[lnk->link_idx])
		return 0;

	rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL);
	if (rc)
		return rc;
	sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl,
		   buf_desc->cpu_addr, buf_desc->len);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		rc = -EAGAIN;
		goto free_table;
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc, lnk->link_idx);
		if (rc)
			goto buf_unmap;
		smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE);
	}
	buf_desc->is_map_ib[lnk->link_idx] = true;
	return 0;

buf_unmap:
	smc_ib_buf_unmap_sg(lnk, buf_desc,
			    is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
free_table:
	sg_free_table(&buf_desc->sgt[lnk->link_idx]);
	return rc;
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
	buf_desc->len = bufsize;
	return buf_desc;
}

/* map buf_desc on all usable links,
 * unused buffers stay mapped as long as the link is up
 */
static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
				     struct smc_buf_desc *buf_desc, bool is_rmb)
{
	int i, rc = 0;

	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		struct smc_link *lnk = &lgr->lnk[i];

		if (!smc_link_usable(lnk))
			continue;
		if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) {
			rc = -ENOMEM;
			goto out;
		}
	}
out:
	return rc;
}

#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}
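
/* Note on the DMB length bookkeeping above: the DMB is registered with the
 * full bufsize, but buf_desc->len is reduced by sizeof(struct smcd_cdc_msg)
 * because the CDC header lives inside the buffer; smcd_buf_free() adds the
 * same amount back before unregistering, so the ISM device always sees the
 * original size.
 */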

static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	struct mutex *lock;	/* lock buffer list */
	int sk_buf_size;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		mutex_lock(lock);
		list_add(&buf_desc->list, buf_list);
		mutex_unlock(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (!is_smcd) {
		if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) {
			smcr_buf_unuse(buf_desc, lgr);
			return -ENOMEM;
		}
	}

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}
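
/* Walk-through of the descending search above, under the compression scheme
 * shown earlier: with an sk_rcvbuf of 128KB the start value is 64KB, i.e.
 * bufsize_short = 2. If no 64KB slot can be reused and the allocation fails
 * with -EAGAIN, the loop retries with bufsize_short = 1 (32KB) and finally
 * 0 (16KB); only -ENOMEM aborts the descent early.
 */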

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk))
		return;
	smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk))
		return;
	smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	int i;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (!smc_link_usable(&conn->lgr->lnk[i]))
			continue;
		smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc,
				       DMA_FROM_DEVICE);
	}
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	int i;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
		if (!smc_link_usable(&conn->lgr->lnk[i]))
			continue;
		smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc,
					  DMA_FROM_DEVICE);
	}
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx,
				   u32 rkey)
{
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (test_bit(i, lgr->rtokens_used_mask) &&
		    lgr->rtokens[i][lnk_idx].rkey == rkey)
			return i;
	}
	return -ENOENT;
}

/* set rtoken for a new link to an existing rmb */
void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
		    __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey)
{
	int rtok_idx;

	rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known));
	if (rtok_idx == -ENOENT)
		return;
	lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey);
	lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr);
}

/* set rtoken for a new link whose link_id is given */
void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
		     __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	bool found = false;
	int link_idx;

	for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) {
		if (lgr->lnk[link_idx].link_id == link_id) {
			found = true;
			break;
		}
	}
	if (!found)
		return;
	lgr->rtokens[rtok_idx][link_idx].rkey = rkey;
	lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
		    lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][lnk->link_idx].rkey = rkey;
	lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken from all links */
int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	u32 rkey = ntohl(nw_rkey);
	int i, j;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) {
				lgr->rtokens[i][j].rkey = 0;
				lgr->rtokens[i][j].dma_addr = 0;
			}
			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_link *lnk,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

static void smc_core_going_away(void)
{
	struct smc_ib_device *smcibdev;
	struct smcd_dev *smcd;

	spin_lock(&smc_ib_devices.lock);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		int i;

		for (i = 0; i < SMC_MAX_PORTS; i++)
			set_bit(i, smcibdev->ports_going_away);
	}
	spin_unlock(&smc_ib_devices.lock);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
		smcd->going_away = 1;
	}
	spin_unlock(&smcd_dev_list.lock);
}

/* Clean up all SMC link groups */
static void smc_lgrs_shutdown(void)
{
	struct smcd_dev *smcd;

	smc_core_going_away();

	smc_smcr_terminate_all(NULL);

	spin_lock(&smcd_dev_list.lock);
	list_for_each_entry(smcd, &smcd_dev_list.list, list)
		smc_smcd_terminate_all(smcd);
	spin_unlock(&smcd_dev_list.lock);
}

static int smc_core_reboot_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
{
	smc_lgrs_shutdown();
	smc_ib_unregister_client();
	return 0;
}

static struct notifier_block smc_reboot_notifier = {
	.notifier_call = smc_core_reboot_event,
};

int __init smc_core_init(void)
{
	return register_reboot_notifier(&smc_reboot_notifier);
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	unregister_reboot_notifier(&smc_reboot_notifier);
	smc_lgrs_shutdown();
}