xref: /linux/net/smc/smc_core.c (revision 2c63221cd9e5c0dad0424029aeb1c40faada8330)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  Basic Transport Functions exploiting Infiniband API
6  *
7  *  Copyright IBM Corp. 2016
8  *
9  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
10  */
11 
12 #include <linux/socket.h>
13 #include <linux/if_vlan.h>
14 #include <linux/random.h>
15 #include <linux/workqueue.h>
16 #include <net/tcp.h>
17 #include <net/sock.h>
18 #include <rdma/ib_verbs.h>
19 #include <rdma/ib_cache.h>
20 
21 #include "smc.h"
22 #include "smc_clc.h"
23 #include "smc_core.h"
24 #include "smc_ib.h"
25 #include "smc_wr.h"
26 #include "smc_llc.h"
27 #include "smc_cdc.h"
28 #include "smc_close.h"
29 #include "smc_ism.h"
30 
31 #define SMC_LGR_NUM_INCR		256
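/* delays before an unused link group is freed, in jiffies: 10 minutes on the
 * server side, 10 seconds longer on the client side, 8 seconds for the fast path
 */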
32 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
33 #define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
34 #define SMC_LGR_FREE_DELAY_FAST		(8 * HZ)
35 
36 static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
37 	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
38 	.list = LIST_HEAD_INIT(smc_lgr_list.list),
39 	.num = 0,
40 };
41 
42 static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
43 			 struct smc_buf_desc *buf_desc);
44 
45 /* return head of link group list and its lock for a given link group */
46 static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
47 						  spinlock_t **lgr_lock)
48 {
49 	if (lgr->is_smcd) {
50 		*lgr_lock = &lgr->smcd->lgr_lock;
51 		return &lgr->smcd->lgr_list;
52 	}
53 
54 	*lgr_lock = &smc_lgr_list.lock;
55 	return &smc_lgr_list.list;
56 }
57 
58 static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
59 {
60 	/* client link group creation always follows the server link group
61 	 * creation. For client use a somewhat higher removal delay time,
62 	 * otherwise there is a risk of out-of-sync link groups.
63 	 */
64 	if (!lgr->freeing && !lgr->freefast) {
65 		mod_delayed_work(system_wq, &lgr->free_work,
66 				 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
67 						SMC_LGR_FREE_DELAY_CLNT :
68 						SMC_LGR_FREE_DELAY_SERV);
69 	}
70 }
71 
72 void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
73 {
74 	if (!lgr->freeing && !lgr->freefast) {
75 		lgr->freefast = 1;
76 		mod_delayed_work(system_wq, &lgr->free_work,
77 				 SMC_LGR_FREE_DELAY_FAST);
78 	}
79 }
80 
81 /* Register connection's alert token in our lookup structure.
82  * To use rbtrees we have to implement our own insert core.
83  * Requires @conns_lock
84  * @conn		connection to register
86  */
87 static void smc_lgr_add_alert_token(struct smc_connection *conn)
88 {
89 	struct rb_node **link, *parent = NULL;
90 	u32 token = conn->alert_token_local;
91 
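	/* descend the rbtree, which is keyed by alert_token_local, to find the insert position */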
92 	link = &conn->lgr->conns_all.rb_node;
93 	while (*link) {
94 		struct smc_connection *cur = rb_entry(*link,
95 					struct smc_connection, alert_node);
96 
97 		parent = *link;
98 		if (cur->alert_token_local > token)
99 			link = &parent->rb_left;
100 		else
101 			link = &parent->rb_right;
102 	}
103 	/* Put the new node there */
104 	rb_link_node(&conn->alert_node, parent, link);
105 	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
106 }
107 
108 /* Register connection in link group by assigning an alert token
109  * registered in a search tree.
110  * Requires @conns_lock
111  * Note that '0' is a reserved value and not assigned.
112  */
113 static void smc_lgr_register_conn(struct smc_connection *conn)
114 {
115 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
116 	static atomic_t nexttoken = ATOMIC_INIT(0);
117 
118 	/* find a new alert_token_local value not yet used by some connection
119 	 * in this link group
120 	 */
121 	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
122 	while (!conn->alert_token_local) {
123 		conn->alert_token_local = atomic_inc_return(&nexttoken);
124 		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
125 			conn->alert_token_local = 0;
126 	}
127 	smc_lgr_add_alert_token(conn);
128 	conn->lgr->conns_num++;
129 }
130 
131 /* Unregister connection and reset the alert token of the given connection
132  */
133 static void __smc_lgr_unregister_conn(struct smc_connection *conn)
134 {
135 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
136 	struct smc_link_group *lgr = conn->lgr;
137 
138 	rb_erase(&conn->alert_node, &lgr->conns_all);
139 	lgr->conns_num--;
140 	conn->alert_token_local = 0;
141 	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
142 }
143 
144 /* Unregister connection from lgr
145  */
146 static void smc_lgr_unregister_conn(struct smc_connection *conn)
147 {
148 	struct smc_link_group *lgr = conn->lgr;
149 
150 	if (!lgr)
151 		return;
152 	write_lock_bh(&lgr->conns_lock);
153 	if (conn->alert_token_local) {
154 		__smc_lgr_unregister_conn(conn);
155 	}
156 	write_unlock_bh(&lgr->conns_lock);
157 	conn->lgr = NULL;
158 }
159 
160 /* Send delete link, either as client to request the initiation
161  * of the DELETE LINK sequence from server; or as server to
162  * initiate the delete processing. See smc_llc_rx_delete_link().
163  */
164 static int smc_link_send_delete(struct smc_link *lnk)
165 {
166 	if (lnk->state == SMC_LNK_ACTIVE &&
167 	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
168 		smc_llc_link_deleting(lnk);
169 		return 0;
170 	}
171 	return -ENOTCONN;
172 }
173 
174 static void smc_lgr_free(struct smc_link_group *lgr);
175 
176 static void smc_lgr_free_work(struct work_struct *work)
177 {
178 	struct smc_link_group *lgr = container_of(to_delayed_work(work),
179 						  struct smc_link_group,
180 						  free_work);
181 	spinlock_t *lgr_lock;
182 	struct smc_link *lnk;
183 	bool conns;
184 
185 	smc_lgr_list_head(lgr, &lgr_lock);
186 	spin_lock_bh(lgr_lock);
187 	if (lgr->freeing) {
188 		spin_unlock_bh(lgr_lock);
189 		return;
190 	}
191 	read_lock_bh(&lgr->conns_lock);
192 	conns = RB_EMPTY_ROOT(&lgr->conns_all);
193 	read_unlock_bh(&lgr->conns_lock);
194 	if (!conns) { /* number of lgr connections is no longer zero */
195 		spin_unlock_bh(lgr_lock);
196 		return;
197 	}
198 	list_del_init(&lgr->list); /* remove from smc_lgr_list */
199 
200 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
201 	if (!lgr->is_smcd && !lgr->terminating) {
202 		/* try to send del link msg, on error free lgr immediately */
203 		if (lnk->state == SMC_LNK_ACTIVE &&
204 		    !smc_link_send_delete(lnk)) {
205 			/* reschedule in case we never receive a response */
206 			smc_lgr_schedule_free_work(lgr);
207 			spin_unlock_bh(lgr_lock);
208 			return;
209 		}
210 	}
211 	lgr->freeing = 1; /* this instance does the freeing, no new schedule */
212 	spin_unlock_bh(lgr_lock);
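	/* cancel a pending re-schedule; a later instance would see lgr->freeing and bail out */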
213 	cancel_delayed_work(&lgr->free_work);
214 
215 	if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
216 		smc_llc_link_inactive(lnk);
217 	if (lgr->is_smcd)
218 		smc_ism_signal_shutdown(lgr);
219 	smc_lgr_free(lgr);
220 }
221 
222 static void smc_lgr_terminate_work(struct work_struct *work)
223 {
224 	struct smc_link_group *lgr = container_of(work, struct smc_link_group,
225 						  terminate_work);
226 
227 	smc_lgr_terminate(lgr);
228 }
229 
230 /* create a new SMC link group */
231 static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
232 {
233 	struct smc_link_group *lgr;
234 	struct list_head *lgr_list;
235 	struct smc_link *lnk;
236 	spinlock_t *lgr_lock;
237 	u8 rndvec[3];
238 	int rc = 0;
239 	int i;
240 
241 	if (ini->is_smcd && ini->vlan_id) {
242 		if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
243 			rc = SMC_CLC_DECL_ISMVLANERR;
244 			goto out;
245 		}
246 	}
247 
248 	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
249 	if (!lgr) {
250 		rc = SMC_CLC_DECL_MEM;
251 		goto ism_put_vlan;
252 	}
253 	lgr->is_smcd = ini->is_smcd;
254 	lgr->sync_err = 0;
255 	lgr->terminating = 0;
256 	lgr->freefast = 0;
257 	lgr->freeing = 0;
258 	lgr->vlan_id = ini->vlan_id;
259 	rwlock_init(&lgr->sndbufs_lock);
260 	rwlock_init(&lgr->rmbs_lock);
261 	rwlock_init(&lgr->conns_lock);
262 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
263 		INIT_LIST_HEAD(&lgr->sndbufs[i]);
264 		INIT_LIST_HEAD(&lgr->rmbs[i]);
265 	}
266 	smc_lgr_list.num += SMC_LGR_NUM_INCR;
267 	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
268 	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
269 	INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
270 	lgr->conns_all = RB_ROOT;
271 	if (ini->is_smcd) {
272 		/* SMC-D specific settings */
273 		get_device(&ini->ism_dev->dev);
274 		lgr->peer_gid = ini->ism_gid;
275 		lgr->smcd = ini->ism_dev;
276 		lgr_list = &ini->ism_dev->lgr_list;
277 		lgr_lock = &lgr->smcd->lgr_lock;
278 	} else {
279 		/* SMC-R specific settings */
280 		get_device(&ini->ib_dev->ibdev->dev);
281 		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
282 		memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
283 		       SMC_SYSTEMID_LEN);
284 
285 		lnk = &lgr->lnk[SMC_SINGLE_LINK];
286 		/* initialize link */
287 		lnk->state = SMC_LNK_ACTIVATING;
288 		lnk->link_id = SMC_SINGLE_LINK;
289 		lnk->smcibdev = ini->ib_dev;
290 		lnk->ibport = ini->ib_port;
291 		lgr_list = &smc_lgr_list.list;
292 		lgr_lock = &smc_lgr_list.lock;
293 		lnk->path_mtu =
294 			ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
295 		if (!ini->ib_dev->initialized)
296 			smc_ib_setup_per_ibdev(ini->ib_dev);
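		/* derive a random 24-bit initial packet sequence number for the QP */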
297 		get_random_bytes(rndvec, sizeof(rndvec));
298 		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
299 			(rndvec[2] << 16);
300 		rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
301 					  ini->vlan_id, lnk->gid,
302 					  &lnk->sgid_index);
303 		if (rc)
304 			goto free_lgr;
305 		rc = smc_llc_link_init(lnk);
306 		if (rc)
307 			goto free_lgr;
308 		rc = smc_wr_alloc_link_mem(lnk);
309 		if (rc)
310 			goto clear_llc_lnk;
311 		rc = smc_ib_create_protection_domain(lnk);
312 		if (rc)
313 			goto free_link_mem;
314 		rc = smc_ib_create_queue_pair(lnk);
315 		if (rc)
316 			goto dealloc_pd;
317 		rc = smc_wr_create_link(lnk);
318 		if (rc)
319 			goto destroy_qp;
320 	}
321 	smc->conn.lgr = lgr;
322 	spin_lock_bh(lgr_lock);
323 	list_add(&lgr->list, lgr_list);
324 	spin_unlock_bh(lgr_lock);
325 	return 0;
326 
327 destroy_qp:
328 	smc_ib_destroy_queue_pair(lnk);
329 dealloc_pd:
330 	smc_ib_dealloc_protection_domain(lnk);
331 free_link_mem:
332 	smc_wr_free_link_mem(lnk);
333 clear_llc_lnk:
334 	smc_llc_link_clear(lnk);
335 free_lgr:
336 	kfree(lgr);
337 ism_put_vlan:
338 	if (ini->is_smcd && ini->vlan_id)
339 		smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
340 out:
341 	if (rc < 0) {
342 		if (rc == -ENOMEM)
343 			rc = SMC_CLC_DECL_MEM;
344 		else
345 			rc = SMC_CLC_DECL_INTERR;
346 	}
347 	return rc;
348 }
349 
350 static void smc_buf_unuse(struct smc_connection *conn,
351 			  struct smc_link_group *lgr)
352 {
353 	if (conn->sndbuf_desc)
354 		conn->sndbuf_desc->used = 0;
355 	if (conn->rmb_desc) {
356 		if (!conn->rmb_desc->regerr) {
357 			if (!lgr->is_smcd && !list_empty(&lgr->list)) {
358 				/* unregister rmb with peer */
359 				smc_llc_do_delete_rkey(
360 						&lgr->lnk[SMC_SINGLE_LINK],
361 						conn->rmb_desc);
362 			}
363 			conn->rmb_desc->used = 0;
364 		} else {
365 			/* buf registration failed, reuse not possible */
366 			write_lock_bh(&lgr->rmbs_lock);
367 			list_del(&conn->rmb_desc->list);
368 			write_unlock_bh(&lgr->rmbs_lock);
369 
370 			smc_buf_free(lgr, true, conn->rmb_desc);
371 		}
372 	}
373 }
374 
375 /* remove a finished connection from its link group */
376 void smc_conn_free(struct smc_connection *conn)
377 {
378 	struct smc_link_group *lgr = conn->lgr;
379 
380 	if (!lgr)
381 		return;
382 	if (lgr->is_smcd) {
383 		smc_ism_unset_conn(conn);
384 		tasklet_kill(&conn->rx_tsklet);
385 	} else {
386 		smc_cdc_tx_dismiss_slots(conn);
387 	}
388 	if (!list_empty(&lgr->list)) {
389 		smc_lgr_unregister_conn(conn);
390 		smc_buf_unuse(conn, lgr); /* allow buffer reuse */
391 	}
392 
393 	if (!lgr->conns_num)
394 		smc_lgr_schedule_free_work(lgr);
395 }
396 
397 static void smc_link_clear(struct smc_link *lnk)
398 {
399 	lnk->peer_qpn = 0;
400 	smc_llc_link_clear(lnk);
401 	smc_ib_modify_qp_reset(lnk);
402 	smc_wr_free_link(lnk);
403 	smc_ib_destroy_queue_pair(lnk);
404 	smc_ib_dealloc_protection_domain(lnk);
405 	smc_wr_free_link_mem(lnk);
406 }
407 
408 static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
409 			  struct smc_buf_desc *buf_desc)
410 {
411 	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
412 
413 	if (is_rmb) {
414 		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
415 			smc_ib_put_memory_region(
416 					buf_desc->mr_rx[SMC_SINGLE_LINK]);
417 		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
418 				    DMA_FROM_DEVICE);
419 	} else {
420 		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
421 				    DMA_TO_DEVICE);
422 	}
423 	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
424 	if (buf_desc->pages)
425 		__free_pages(buf_desc->pages, buf_desc->order);
426 	kfree(buf_desc);
427 }
428 
429 static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
430 			  struct smc_buf_desc *buf_desc)
431 {
432 	if (is_dmb) {
433 		/* restore original buf len */
434 		buf_desc->len += sizeof(struct smcd_cdc_msg);
435 		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
436 	} else {
437 		kfree(buf_desc->cpu_addr);
438 	}
439 	kfree(buf_desc);
440 }
441 
442 static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
443 			 struct smc_buf_desc *buf_desc)
444 {
445 	if (lgr->is_smcd)
446 		smcd_buf_free(lgr, is_rmb, buf_desc);
447 	else
448 		smcr_buf_free(lgr, is_rmb, buf_desc);
449 }
450 
451 static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
452 {
453 	struct smc_buf_desc *buf_desc, *bf_desc;
454 	struct list_head *buf_list;
455 	int i;
456 
457 	for (i = 0; i < SMC_RMBE_SIZES; i++) {
458 		if (is_rmb)
459 			buf_list = &lgr->rmbs[i];
460 		else
461 			buf_list = &lgr->sndbufs[i];
462 		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
463 					 list) {
464 			list_del(&buf_desc->list);
465 			smc_buf_free(lgr, is_rmb, buf_desc);
466 		}
467 	}
468 }
469 
470 static void smc_lgr_free_bufs(struct smc_link_group *lgr)
471 {
472 	/* free send buffers */
473 	__smc_lgr_free_bufs(lgr, false);
474 	/* free rmbs */
475 	__smc_lgr_free_bufs(lgr, true);
476 }
477 
478 /* remove a link group */
479 static void smc_lgr_free(struct smc_link_group *lgr)
480 {
481 	smc_lgr_free_bufs(lgr);
482 	if (lgr->is_smcd) {
483 		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
484 		put_device(&lgr->smcd->dev);
485 	} else {
486 		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
487 		put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
488 	}
489 	kfree(lgr);
490 }
491 
492 void smc_lgr_forget(struct smc_link_group *lgr)
493 {
494 	struct list_head *lgr_list;
495 	spinlock_t *lgr_lock;
496 
497 	lgr_list = smc_lgr_list_head(lgr, &lgr_lock);
498 	spin_lock_bh(lgr_lock);
499 	/* do not use this link group for new connections */
500 	if (!list_empty(lgr_list))
501 		list_del_init(lgr_list);
502 	spin_unlock_bh(lgr_lock);
503 }
504 
505 static void smc_sk_wake_ups(struct smc_sock *smc)
506 {
507 	smc->sk.sk_write_space(&smc->sk);
508 	smc->sk.sk_data_ready(&smc->sk);
509 	smc->sk.sk_state_change(&smc->sk);
510 }
511 
512 /* kill a connection */
513 static void smc_conn_kill(struct smc_connection *conn)
514 {
515 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
516 
517 	smc_close_abort(conn);
518 	conn->killed = 1;
519 	smc_sk_wake_ups(smc);
520 	smc_lgr_unregister_conn(conn);
521 	smc->sk.sk_err = ECONNABORTED;
522 	smc_close_active_abort(smc);
523 }
524 
525 /* terminate link group */
526 static void __smc_lgr_terminate(struct smc_link_group *lgr)
527 {
528 	struct smc_connection *conn;
529 	struct smc_sock *smc;
530 	struct rb_node *node;
531 
532 	if (lgr->terminating)
533 		return;	/* lgr already terminating */
534 	lgr->terminating = 1;
535 	if (!lgr->is_smcd)
536 		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
537 
538 	/* kill remaining link group connections */
539 	read_lock_bh(&lgr->conns_lock);
540 	node = rb_first(&lgr->conns_all);
541 	while (node) {
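		/* conns_lock is dropped while killing the connection; smc_conn_kill()
		 * removes the node from the tree, so rb_first() is re-read afterwards
		 */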
542 		read_unlock_bh(&lgr->conns_lock);
543 		conn = rb_entry(node, struct smc_connection, alert_node);
544 		smc = container_of(conn, struct smc_sock, conn);
545 		sock_hold(&smc->sk); /* sock_put below */
546 		lock_sock(&smc->sk);
547 		smc_conn_kill(conn);
548 		release_sock(&smc->sk);
549 		sock_put(&smc->sk); /* sock_hold above */
550 		read_lock_bh(&lgr->conns_lock);
551 		node = rb_first(&lgr->conns_all);
552 	}
553 	read_unlock_bh(&lgr->conns_lock);
554 	if (!lgr->is_smcd)
555 		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
556 	smc_lgr_schedule_free_work_fast(lgr);
557 }
558 
559 /* unlink and terminate link group */
560 void smc_lgr_terminate(struct smc_link_group *lgr)
561 {
562 	spinlock_t *lgr_lock;
563 
564 	smc_lgr_list_head(lgr, &lgr_lock);
565 	spin_lock_bh(lgr_lock);
566 	if (lgr->terminating) {
567 		spin_unlock_bh(lgr_lock);
568 		return;	/* lgr already terminating */
569 	}
570 	list_del_init(&lgr->list);
571 	spin_unlock_bh(lgr_lock);
572 	__smc_lgr_terminate(lgr);
573 }
574 
575 /* Called when IB port is terminated */
576 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
577 {
578 	struct smc_link_group *lgr, *l;
579 	LIST_HEAD(lgr_free_list);
580 
581 	spin_lock_bh(&smc_lgr_list.lock);
582 	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
583 		if (!lgr->is_smcd &&
584 		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
585 		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
586 			list_move(&lgr->list, &lgr_free_list);
587 	}
588 	spin_unlock_bh(&smc_lgr_list.lock);
589 
590 	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
591 		list_del_init(&lgr->list);
592 		__smc_lgr_terminate(lgr);
593 	}
594 }
595 
596 /* Called when SMC-D device is terminated or peer is lost */
597 void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
598 {
599 	struct smc_link_group *lgr, *l;
600 	LIST_HEAD(lgr_free_list);
601 
602 	/* run common cleanup function and build free list */
603 	spin_lock_bh(&dev->lgr_lock);
604 	list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
605 		if ((!peer_gid || lgr->peer_gid == peer_gid) &&
606 		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
607 			list_move(&lgr->list, &lgr_free_list);
608 		}
609 	}
610 	spin_unlock_bh(&dev->lgr_lock);
611 
612 	/* cancel the regular free workers and actually free lgrs */
613 	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
614 		list_del_init(&lgr->list);
615 		__smc_lgr_terminate(lgr);
616 		cancel_delayed_work_sync(&lgr->free_work);
617 		if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */
618 			smc_ism_signal_shutdown(lgr);
619 		smc_lgr_free(lgr);
620 	}
621 }
622 
623 /* Determine vlan of internal TCP socket.
624  * @ini: the determined vlan id is stored into ini->vlan_id (0 if none found)
625  */
626 int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
627 {
628 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
629 	struct net_device *ndev;
630 	int i, nest_lvl, rc = 0;
631 
632 	ini->vlan_id = 0;
633 	if (!dst) {
634 		rc = -ENOTCONN;
635 		goto out;
636 	}
637 	if (!dst->dev) {
638 		rc = -ENODEV;
639 		goto out_rel;
640 	}
641 
642 	ndev = dst->dev;
643 	if (is_vlan_dev(ndev)) {
644 		ini->vlan_id = vlan_dev_vlan_id(ndev);
645 		goto out_rel;
646 	}
647 
648 	rtnl_lock();
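	/* not a VLAN device itself; walk the lower devices in case a
	 * VLAN device sits further down the stack
	 */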
649 	nest_lvl = ndev->lower_level;
650 	for (i = 0; i < nest_lvl; i++) {
651 		struct list_head *lower = &ndev->adj_list.lower;
652 
653 		if (list_empty(lower))
654 			break;
655 		lower = lower->next;
656 		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
657 		if (is_vlan_dev(ndev)) {
658 			ini->vlan_id = vlan_dev_vlan_id(ndev);
659 			break;
660 		}
661 	}
662 	rtnl_unlock();
663 
664 out_rel:
665 	dst_release(dst);
666 out:
667 	return rc;
668 }
669 
670 static bool smcr_lgr_match(struct smc_link_group *lgr,
671 			   struct smc_clc_msg_local *lcl,
672 			   enum smc_lgr_role role, u32 clcqpn)
673 {
674 	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
675 		       SMC_SYSTEMID_LEN) &&
676 		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
677 			SMC_GID_SIZE) &&
678 		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
679 			sizeof(lcl->mac)) &&
680 		lgr->role == role &&
681 		(lgr->role == SMC_SERV ||
682 		 lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn);
683 }
684 
685 static bool smcd_lgr_match(struct smc_link_group *lgr,
686 			   struct smcd_dev *smcismdev, u64 peer_gid)
687 {
688 	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
689 }
690 
691 /* create a new SMC connection (and a new link group if necessary) */
692 int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
693 {
694 	struct smc_connection *conn = &smc->conn;
695 	struct list_head *lgr_list;
696 	struct smc_link_group *lgr;
697 	enum smc_lgr_role role;
698 	spinlock_t *lgr_lock;
699 	int rc = 0;
700 
701 	lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
702 	lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
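	/* assume first contact; changed below if a reusable link group is found */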
703 	ini->cln_first_contact = SMC_FIRST_CONTACT;
704 	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
705 	if (role == SMC_CLNT && ini->srv_first_contact)
706 		/* create new link group as well */
707 		goto create;
708 
709 	/* determine if an existing link group can be reused */
710 	spin_lock_bh(lgr_lock);
711 	list_for_each_entry(lgr, lgr_list, list) {
712 		write_lock_bh(&lgr->conns_lock);
713 		if ((ini->is_smcd ?
714 		     smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
715 		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
716 		    !lgr->sync_err &&
717 		    lgr->vlan_id == ini->vlan_id &&
718 		    (role == SMC_CLNT ||
719 		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
720 			/* link group found */
721 			ini->cln_first_contact = SMC_REUSE_CONTACT;
722 			conn->lgr = lgr;
723 			smc_lgr_register_conn(conn); /* add smc conn to lgr */
724 			if (delayed_work_pending(&lgr->free_work))
725 				cancel_delayed_work(&lgr->free_work);
726 			write_unlock_bh(&lgr->conns_lock);
727 			break;
728 		}
729 		write_unlock_bh(&lgr->conns_lock);
730 	}
731 	spin_unlock_bh(lgr_lock);
732 
733 	if (role == SMC_CLNT && !ini->srv_first_contact &&
734 	    ini->cln_first_contact == SMC_FIRST_CONTACT) {
735 		/* Server reuses a link group, but client wants to start
736 		 * a new one: send an out_of_sync decline with reason
737 		 * 'synchronization error'
738 		 */
739 		return SMC_CLC_DECL_SYNCERR;
740 	}
741 
742 create:
743 	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
744 		rc = smc_lgr_create(smc, ini);
745 		if (rc)
746 			goto out;
747 		lgr = conn->lgr;
748 		write_lock_bh(&lgr->conns_lock);
749 		smc_lgr_register_conn(conn); /* add smc conn to lgr */
750 		write_unlock_bh(&lgr->conns_lock);
751 	}
752 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
753 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
754 	conn->urg_state = SMC_URG_READ;
755 	if (ini->is_smcd) {
756 		conn->rx_off = sizeof(struct smcd_cdc_msg);
757 		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
758 	}
759 #ifndef KERNEL_HAS_ATOMIC64
760 	spin_lock_init(&conn->acurs_lock);
761 #endif
762 
763 out:
764 	return rc;
765 }
766 
767 /* convert the RMB size into the compressed notation - minimum 16K.
768  * In contrast to plain ilog2, this rounds towards the next power of 2,
769  * so the socket application gets at least its desired sndbuf / rcvbuf size.
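 * E.g. a request for 40000 bytes compresses to 2, which maps back to 64KB.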
770  */
771 static u8 smc_compress_bufsize(int size)
772 {
773 	u8 compressed;
774 
775 	if (size <= SMC_BUF_MIN_SIZE)
776 		return 0;
777 
778 	size = (size - 1) >> 14;
779 	compressed = ilog2(size) + 1;
780 	if (compressed >= SMC_RMBE_SIZES)
781 		compressed = SMC_RMBE_SIZES - 1;
782 	return compressed;
783 }
784 
785 /* convert the RMB size from compressed notation into integer */
786 int smc_uncompress_bufsize(u8 compressed)
787 {
788 	u32 size;
789 
790 	size = 0x00000001 << (((int)compressed) + 14);
791 	return (int)size;
792 }
793 
794 /* try to reuse a sndbuf or rmb description slot for a certain
795  * buffer size; if not available, return NULL
796  */
797 static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
798 					     rwlock_t *lock,
799 					     struct list_head *buf_list)
800 {
801 	struct smc_buf_desc *buf_slot;
802 
803 	read_lock_bh(lock);
804 	list_for_each_entry(buf_slot, buf_list, list) {
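		/* claim the slot atomically: used goes from 0 (free) to 1 (in use) */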
805 		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
806 			read_unlock_bh(lock);
807 			return buf_slot;
808 		}
809 	}
810 	read_unlock_bh(lock);
811 	return NULL;
812 }
813 
814 /* one of the conditions for announcing a receiver's current window size is
815  * that it "results in a minimum increase in the window size of 10% of the
816  * receive buffer space" [RFC7609]
817  */
818 static inline int smc_rmb_wnd_update_limit(int rmbe_size)
819 {
820 	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
821 }
822 
823 static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
824 						bool is_rmb, int bufsize)
825 {
826 	struct smc_buf_desc *buf_desc;
827 	struct smc_link *lnk;
828 	int rc;
829 
830 	/* try to alloc a new buffer */
831 	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
832 	if (!buf_desc)
833 		return ERR_PTR(-ENOMEM);
834 
835 	buf_desc->order = get_order(bufsize);
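	/* __GFP_NOWARN | __GFP_NORETRY: a failed high-order allocation is not an
	 * error here, the caller simply retries with a smaller buffer size
	 */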
836 	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
837 				      __GFP_NOMEMALLOC | __GFP_COMP |
838 				      __GFP_NORETRY | __GFP_ZERO,
839 				      buf_desc->order);
840 	if (!buf_desc->pages) {
841 		kfree(buf_desc);
842 		return ERR_PTR(-EAGAIN);
843 	}
844 	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
845 
846 	/* build the sg table from the pages */
847 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
848 	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
849 			    GFP_KERNEL);
850 	if (rc) {
851 		smc_buf_free(lgr, is_rmb, buf_desc);
852 		return ERR_PTR(rc);
853 	}
854 	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
855 		   buf_desc->cpu_addr, bufsize);
856 
857 	/* map sg table to DMA address */
858 	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
859 			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
860 	/* SMC protocol depends on mapping to one DMA address only */
861 	if (rc != 1)  {
862 		smc_buf_free(lgr, is_rmb, buf_desc);
863 		return ERR_PTR(-EAGAIN);
864 	}
865 
866 	/* create a new memory region for the RMB */
867 	if (is_rmb) {
868 		rc = smc_ib_get_memory_region(lnk->roce_pd,
869 					      IB_ACCESS_REMOTE_WRITE |
870 					      IB_ACCESS_LOCAL_WRITE,
871 					      buf_desc);
872 		if (rc) {
873 			smc_buf_free(lgr, is_rmb, buf_desc);
874 			return ERR_PTR(rc);
875 		}
876 	}
877 
878 	buf_desc->len = bufsize;
879 	return buf_desc;
880 }
881 
882 #define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
883 
884 static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
885 						bool is_dmb, int bufsize)
886 {
887 	struct smc_buf_desc *buf_desc;
888 	int rc;
889 
890 	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
891 		return ERR_PTR(-EAGAIN);
892 
893 	/* try to alloc a new DMB */
894 	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
895 	if (!buf_desc)
896 		return ERR_PTR(-ENOMEM);
897 	if (is_dmb) {
898 		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
899 		if (rc) {
900 			kfree(buf_desc);
901 			return ERR_PTR(-EAGAIN);
902 		}
903 		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
904 		/* the CDC header occupies the start of the DMB, so report a smaller usable length */
905 		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
906 	} else {
907 		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
908 					     __GFP_NOWARN | __GFP_NORETRY |
909 					     __GFP_NOMEMALLOC);
910 		if (!buf_desc->cpu_addr) {
911 			kfree(buf_desc);
912 			return ERR_PTR(-EAGAIN);
913 		}
914 		buf_desc->len = bufsize;
915 	}
916 	return buf_desc;
917 }
918 
919 static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
920 {
921 	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
922 	struct smc_connection *conn = &smc->conn;
923 	struct smc_link_group *lgr = conn->lgr;
924 	struct list_head *buf_list;
925 	int bufsize, bufsize_short;
926 	int sk_buf_size;
927 	rwlock_t *lock;
928 
929 	if (is_rmb)
930 		/* use socket recv buffer size (w/o overhead) as start value */
931 		sk_buf_size = smc->sk.sk_rcvbuf / 2;
932 	else
933 		/* use socket send buffer size (w/o overhead) as start value */
934 		sk_buf_size = smc->sk.sk_sndbuf / 2;
935 
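	/* try sizes starting at the compressed equivalent of the socket buffer
	 * size and fall back to smaller sizes if allocation fails
	 */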
936 	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
937 	     bufsize_short >= 0; bufsize_short--) {
938 
939 		if (is_rmb) {
940 			lock = &lgr->rmbs_lock;
941 			buf_list = &lgr->rmbs[bufsize_short];
942 		} else {
943 			lock = &lgr->sndbufs_lock;
944 			buf_list = &lgr->sndbufs[bufsize_short];
945 		}
946 		bufsize = smc_uncompress_bufsize(bufsize_short);
947 		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
948 			continue;
949 
950 		/* check for reusable slot in the link group */
951 		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
952 		if (buf_desc) {
953 			memset(buf_desc->cpu_addr, 0, bufsize);
954 			break; /* found reusable slot */
955 		}
956 
957 		if (is_smcd)
958 			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
959 		else
960 			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
961 
962 		if (PTR_ERR(buf_desc) == -ENOMEM)
963 			break;
964 		if (IS_ERR(buf_desc))
965 			continue;
966 
967 		buf_desc->used = 1;
968 		write_lock_bh(lock);
969 		list_add(&buf_desc->list, buf_list);
970 		write_unlock_bh(lock);
971 		break; /* found */
972 	}
973 
974 	if (IS_ERR(buf_desc))
975 		return -ENOMEM;
976 
977 	if (is_rmb) {
978 		conn->rmb_desc = buf_desc;
979 		conn->rmbe_size_short = bufsize_short;
980 		smc->sk.sk_rcvbuf = bufsize * 2;
981 		atomic_set(&conn->bytes_to_rcv, 0);
982 		conn->rmbe_update_limit =
983 			smc_rmb_wnd_update_limit(buf_desc->len);
984 		if (is_smcd)
985 			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
986 	} else {
987 		conn->sndbuf_desc = buf_desc;
988 		smc->sk.sk_sndbuf = bufsize * 2;
989 		atomic_set(&conn->sndbuf_space, bufsize);
990 	}
991 	return 0;
992 }
993 
994 void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
995 {
996 	struct smc_link_group *lgr = conn->lgr;
997 
998 	if (!conn->lgr || conn->lgr->is_smcd)
999 		return;
1000 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
1001 			       conn->sndbuf_desc, DMA_TO_DEVICE);
1002 }
1003 
1004 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
1005 {
1006 	struct smc_link_group *lgr = conn->lgr;
1007 
1008 	if (!conn->lgr || conn->lgr->is_smcd)
1009 		return;
1010 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
1011 				  conn->sndbuf_desc, DMA_TO_DEVICE);
1012 }
1013 
1014 void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
1015 {
1016 	struct smc_link_group *lgr = conn->lgr;
1017 
1018 	if (!conn->lgr || conn->lgr->is_smcd)
1019 		return;
1020 	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
1021 			       conn->rmb_desc, DMA_FROM_DEVICE);
1022 }
1023 
1024 void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
1025 {
1026 	struct smc_link_group *lgr = conn->lgr;
1027 
1028 	if (!conn->lgr || conn->lgr->is_smcd)
1029 		return;
1030 	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
1031 				  conn->rmb_desc, DMA_FROM_DEVICE);
1032 }
1033 
1034 /* create the send and receive buffer for an SMC socket;
1035  * receive buffers are called RMBs;
1036  * (even though the SMC protocol allows more than one RMB-element per RMB,
1037  * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
1038  * extra RMB for every connection in a link group)
1039  */
1040 int smc_buf_create(struct smc_sock *smc, bool is_smcd)
1041 {
1042 	int rc;
1043 
1044 	/* create send buffer */
1045 	rc = __smc_buf_create(smc, is_smcd, false);
1046 	if (rc)
1047 		return rc;
1048 	/* create rmb */
1049 	rc = __smc_buf_create(smc, is_smcd, true);
1050 	if (rc)
1051 		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
1052 	return rc;
1053 }
1054 
1055 static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
1056 {
1057 	int i;
1058 
1059 	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
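		/* test_and_set_bit re-checks atomically in case another context claimed the bit meanwhile */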
1060 		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
1061 			return i;
1062 	}
1063 	return -ENOSPC;
1064 }
1065 
1066 /* add a new rtoken from peer */
1067 int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
1068 {
1069 	u64 dma_addr = be64_to_cpu(nw_vaddr);
1070 	u32 rkey = ntohl(nw_rkey);
1071 	int i;
1072 
1073 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
1074 		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
1075 		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
1076 		    test_bit(i, lgr->rtokens_used_mask)) {
1077 			/* already in list */
1078 			return i;
1079 		}
1080 	}
1081 	i = smc_rmb_reserve_rtoken_idx(lgr);
1082 	if (i < 0)
1083 		return i;
1084 	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
1085 	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
1086 	return i;
1087 }
1088 
1089 /* delete an rtoken */
1090 int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
1091 {
1092 	u32 rkey = ntohl(nw_rkey);
1093 	int i;
1094 
1095 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
1096 		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
1097 		    test_bit(i, lgr->rtokens_used_mask)) {
1098 			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
1099 			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;
1100 
1101 			clear_bit(i, lgr->rtokens_used_mask);
1102 			return 0;
1103 		}
1104 	}
1105 	return -ENOENT;
1106 }
1107 
1108 /* save rkey and dma_addr received from peer during clc handshake */
1109 int smc_rmb_rtoken_handling(struct smc_connection *conn,
1110 			    struct smc_clc_msg_accept_confirm *clc)
1111 {
1112 	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
1113 					  clc->rmb_rkey);
1114 	if (conn->rtoken_idx < 0)
1115 		return conn->rtoken_idx;
1116 	return 0;
1117 }
1118 
1119 static void smc_core_going_away(void)
1120 {
1121 	struct smc_ib_device *smcibdev;
1122 	struct smcd_dev *smcd;
1123 
1124 	spin_lock(&smc_ib_devices.lock);
1125 	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
1126 		int i;
1127 
1128 		for (i = 0; i < SMC_MAX_PORTS; i++)
1129 			set_bit(i, smcibdev->ports_going_away);
1130 	}
1131 	spin_unlock(&smc_ib_devices.lock);
1132 
1133 	spin_lock(&smcd_dev_list.lock);
1134 	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
1135 		smcd->going_away = 1;
1136 	}
1137 	spin_unlock(&smcd_dev_list.lock);
1138 }
1139 
1140 /* Called (from smc_exit) when module is removed */
1141 void smc_core_exit(void)
1142 {
1143 	struct smc_link_group *lgr, *lg;
1144 	LIST_HEAD(lgr_freeing_list);
1145 	struct smcd_dev *smcd;
1146 
1147 	smc_core_going_away();
1148 
1149 	spin_lock_bh(&smc_lgr_list.lock);
1150 	list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1151 	spin_unlock_bh(&smc_lgr_list.lock);
1152 
1153 	spin_lock(&smcd_dev_list.lock);
1154 	list_for_each_entry(smcd, &smcd_dev_list.list, list)
1155 		list_splice_init(&smcd->lgr_list, &lgr_freeing_list);
1156 	spin_unlock(&smcd_dev_list.lock);
1157 
1158 	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1159 		list_del_init(&lgr->list);
1160 		if (!lgr->is_smcd) {
1161 			struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
1162 
1163 			if (lnk->state == SMC_LNK_ACTIVE)
1164 				smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
1165 							 false);
1166 			smc_llc_link_inactive(lnk);
1167 		}
1168 		cancel_delayed_work_sync(&lgr->free_work);
1169 		if (lgr->is_smcd)
1170 			smc_ism_signal_shutdown(lgr);
1171 		smc_lgr_free(lgr); /* free link group */
1172 	}
1173 }
1174