xref: /linux/net/mptcp/pm.c (revision 6e7fd890f1d6ac83805409e9c346240de2705584)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2019, Intel Corporation.
5  */
6 #define pr_fmt(fmt) "MPTCP: " fmt
7 
8 #include <linux/kernel.h>
9 #include <net/mptcp.h>
10 #include "protocol.h"
11 
12 #include "mib.h"
13 
14 /* path manager command handlers */
15 
16 int mptcp_pm_announce_addr(struct mptcp_sock *msk,
17 			   const struct mptcp_addr_info *addr,
18 			   bool echo)
19 {
20 	u8 add_addr = READ_ONCE(msk->pm.addr_signal);
21 
22 	pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo);
23 
24 	lockdep_assert_held(&msk->pm.lock);
25 
26 	if (add_addr &
27 	    (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) {
28 		MPTCP_INC_STATS(sock_net((struct sock *)msk),
29 				echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP);
30 		return -EINVAL;
31 	}
32 
33 	if (echo) {
34 		msk->pm.remote = *addr;
35 		add_addr |= BIT(MPTCP_ADD_ADDR_ECHO);
36 	} else {
37 		msk->pm.local = *addr;
38 		add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL);
39 	}
40 	WRITE_ONCE(msk->pm.addr_signal, add_addr);
41 	return 0;
42 }
43 
44 int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
45 {
46 	u8 rm_addr = READ_ONCE(msk->pm.addr_signal);
47 
48 	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
49 
50 	if (rm_addr) {
51 		MPTCP_ADD_STATS(sock_net((struct sock *)msk),
52 				MPTCP_MIB_RMADDRTXDROP, rm_list->nr);
53 		return -EINVAL;
54 	}
55 
56 	msk->pm.rm_list_tx = *rm_list;
57 	rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
58 	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
59 	mptcp_pm_nl_addr_send_ack(msk);
60 	return 0;
61 }
62 
63 int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
64 {
65 	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
66 
67 	spin_lock_bh(&msk->pm.lock);
68 	mptcp_pm_nl_rm_subflow_received(msk, rm_list);
69 	spin_unlock_bh(&msk->pm.lock);
70 	return 0;
71 }
72 
73 /* path manager event handlers */
74 
75 void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
76 {
77 	struct mptcp_pm_data *pm = &msk->pm;
78 
79 	pr_debug("msk=%p, token=%u side=%d", msk, READ_ONCE(msk->token), server_side);
80 
81 	WRITE_ONCE(pm->server_side, server_side);
82 	mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
83 }
84 
85 bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
86 {
87 	struct mptcp_pm_data *pm = &msk->pm;
88 	unsigned int subflows_max;
89 	int ret = 0;
90 
91 	if (mptcp_pm_is_userspace(msk)) {
92 		if (mptcp_userspace_pm_active(msk)) {
93 			spin_lock_bh(&pm->lock);
94 			pm->subflows++;
95 			spin_unlock_bh(&pm->lock);
96 			return true;
97 		}
98 		return false;
99 	}
100 
101 	subflows_max = mptcp_pm_get_subflows_max(msk);
102 
103 	pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
104 		 subflows_max, READ_ONCE(pm->accept_subflow));
105 
106 	/* try to avoid acquiring the lock below */
107 	if (!READ_ONCE(pm->accept_subflow))
108 		return false;
109 
110 	spin_lock_bh(&pm->lock);
111 	if (READ_ONCE(pm->accept_subflow)) {
112 		ret = pm->subflows < subflows_max;
113 		if (ret && ++pm->subflows == subflows_max)
114 			WRITE_ONCE(pm->accept_subflow, false);
115 	}
116 	spin_unlock_bh(&pm->lock);
117 
118 	return ret;
119 }
120 
121 /* return true if the new status bit is currently cleared, that is, this event
122  * can be server, eventually by an already scheduled work
123  */
124 static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
125 				   enum mptcp_pm_status new_status)
126 {
127 	pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
128 		 BIT(new_status));
129 	if (msk->pm.status & BIT(new_status))
130 		return false;
131 
132 	msk->pm.status |= BIT(new_status);
133 	mptcp_schedule_work((struct sock *)msk);
134 	return true;
135 }
136 
137 void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
138 {
139 	struct mptcp_pm_data *pm = &msk->pm;
140 	bool announce = false;
141 
142 	pr_debug("msk=%p", msk);
143 
144 	spin_lock_bh(&pm->lock);
145 
146 	/* mptcp_pm_fully_established() can be invoked by multiple
147 	 * racing paths - accept() and check_fully_established()
148 	 * be sure to serve this event only once.
149 	 */
150 	if (READ_ONCE(pm->work_pending) &&
151 	    !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
152 		mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
153 
154 	if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
155 		announce = true;
156 
157 	msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
158 	spin_unlock_bh(&pm->lock);
159 
160 	if (announce)
161 		mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC);
162 }
163 
/* Connection-closed hook: currently only emits a debug message. */
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
	/* terminate the message: an unterminated record is only flushed when
	 * the next one gets printed
	 */
	pr_debug("msk=%p\n", msk);
}
168 
169 void mptcp_pm_subflow_established(struct mptcp_sock *msk)
170 {
171 	struct mptcp_pm_data *pm = &msk->pm;
172 
173 	pr_debug("msk=%p", msk);
174 
175 	if (!READ_ONCE(pm->work_pending))
176 		return;
177 
178 	spin_lock_bh(&pm->lock);
179 
180 	if (READ_ONCE(pm->work_pending))
181 		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
182 
183 	spin_unlock_bh(&pm->lock);
184 }
185 
186 void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
187 				 const struct mptcp_subflow_context *subflow)
188 {
189 	struct mptcp_pm_data *pm = &msk->pm;
190 	bool update_subflows;
191 
192 	update_subflows = subflow->request_join || subflow->mp_join;
193 	if (mptcp_pm_is_userspace(msk)) {
194 		if (update_subflows) {
195 			spin_lock_bh(&pm->lock);
196 			pm->subflows--;
197 			spin_unlock_bh(&pm->lock);
198 		}
199 		return;
200 	}
201 
202 	if (!READ_ONCE(pm->work_pending) && !update_subflows)
203 		return;
204 
205 	spin_lock_bh(&pm->lock);
206 	if (update_subflows)
207 		__mptcp_pm_close_subflow(msk);
208 
209 	/* Even if this subflow is not really established, tell the PM to try
210 	 * to pick the next ones, if possible.
211 	 */
212 	if (mptcp_pm_nl_check_work_pending(msk))
213 		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
214 
215 	spin_unlock_bh(&pm->lock);
216 }
217 
218 void mptcp_pm_add_addr_received(const struct sock *ssk,
219 				const struct mptcp_addr_info *addr)
220 {
221 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
222 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
223 	struct mptcp_pm_data *pm = &msk->pm;
224 
225 	pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
226 		 READ_ONCE(pm->accept_addr));
227 
228 	mptcp_event_addr_announced(ssk, addr);
229 
230 	spin_lock_bh(&pm->lock);
231 
232 	if (mptcp_pm_is_userspace(msk)) {
233 		if (mptcp_userspace_pm_active(msk)) {
234 			mptcp_pm_announce_addr(msk, addr, true);
235 			mptcp_pm_add_addr_send_ack(msk);
236 		} else {
237 			__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
238 		}
239 	} else if (!READ_ONCE(pm->accept_addr)) {
240 		mptcp_pm_announce_addr(msk, addr, true);
241 		mptcp_pm_add_addr_send_ack(msk);
242 	} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
243 		pm->remote = *addr;
244 	} else {
245 		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
246 	}
247 
248 	spin_unlock_bh(&pm->lock);
249 }
250 
251 void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
252 			      const struct mptcp_addr_info *addr)
253 {
254 	struct mptcp_pm_data *pm = &msk->pm;
255 
256 	pr_debug("msk=%p", msk);
257 
258 	spin_lock_bh(&pm->lock);
259 
260 	if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending))
261 		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
262 
263 	spin_unlock_bh(&pm->lock);
264 }
265 
266 void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
267 {
268 	if (!mptcp_pm_should_add_signal(msk))
269 		return;
270 
271 	mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
272 }
273 
274 void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
275 			       const struct mptcp_rm_list *rm_list)
276 {
277 	struct mptcp_pm_data *pm = &msk->pm;
278 	u8 i;
279 
280 	pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);
281 
282 	for (i = 0; i < rm_list->nr; i++)
283 		mptcp_event_addr_removed(msk, rm_list->ids[i]);
284 
285 	spin_lock_bh(&pm->lock);
286 	if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED))
287 		pm->rm_list_rx = *rm_list;
288 	else
289 		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP);
290 	spin_unlock_bh(&pm->lock);
291 }
292 
293 void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
294 {
295 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
296 	struct sock *sk = subflow->conn;
297 	struct mptcp_sock *msk;
298 
299 	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
300 	msk = mptcp_sk(sk);
301 	if (subflow->backup != bkup)
302 		subflow->backup = bkup;
303 
304 	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
305 }
306 
307 void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
308 {
309 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
310 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
311 
312 	pr_debug("fail_seq=%llu", fail_seq);
313 
314 	if (!READ_ONCE(msk->allow_infinite_fallback))
315 		return;
316 
317 	if (!subflow->fail_tout) {
318 		pr_debug("send MP_FAIL response and infinite map");
319 
320 		subflow->send_mp_fail = 1;
321 		subflow->send_infinite_map = 1;
322 		tcp_send_ack(sk);
323 	} else {
324 		pr_debug("MP_FAIL response received");
325 		WRITE_ONCE(subflow->fail_tout, 0);
326 	}
327 }
328 
329 /* path manager helpers */
330 
331 bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
332 			      unsigned int opt_size, unsigned int remaining,
333 			      struct mptcp_addr_info *addr, bool *echo,
334 			      bool *drop_other_suboptions)
335 {
336 	int ret = false;
337 	u8 add_addr;
338 	u8 family;
339 	bool port;
340 
341 	spin_lock_bh(&msk->pm.lock);
342 
343 	/* double check after the lock is acquired */
344 	if (!mptcp_pm_should_add_signal(msk))
345 		goto out_unlock;
346 
347 	/* always drop every other options for pure ack ADD_ADDR; this is a
348 	 * plain dup-ack from TCP perspective. The other MPTCP-relevant info,
349 	 * if any, will be carried by the 'original' TCP ack
350 	 */
351 	if (skb && skb_is_tcp_pure_ack(skb)) {
352 		remaining += opt_size;
353 		*drop_other_suboptions = true;
354 	}
355 
356 	*echo = mptcp_pm_should_add_signal_echo(msk);
357 	port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
358 
359 	family = *echo ? msk->pm.remote.family : msk->pm.local.family;
360 	if (remaining < mptcp_add_addr_len(family, *echo, port))
361 		goto out_unlock;
362 
363 	if (*echo) {
364 		*addr = msk->pm.remote;
365 		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO);
366 	} else {
367 		*addr = msk->pm.local;
368 		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL);
369 	}
370 	WRITE_ONCE(msk->pm.addr_signal, add_addr);
371 	ret = true;
372 
373 out_unlock:
374 	spin_unlock_bh(&msk->pm.lock);
375 	return ret;
376 }
377 
378 bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
379 			     struct mptcp_rm_list *rm_list)
380 {
381 	int ret = false, len;
382 	u8 rm_addr;
383 
384 	spin_lock_bh(&msk->pm.lock);
385 
386 	/* double check after the lock is acquired */
387 	if (!mptcp_pm_should_rm_signal(msk))
388 		goto out_unlock;
389 
390 	rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL);
391 	len = mptcp_rm_addr_len(&msk->pm.rm_list_tx);
392 	if (len < 0) {
393 		WRITE_ONCE(msk->pm.addr_signal, rm_addr);
394 		goto out_unlock;
395 	}
396 	if (remaining < len)
397 		goto out_unlock;
398 
399 	*rm_list = msk->pm.rm_list_tx;
400 	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
401 	ret = true;
402 
403 out_unlock:
404 	spin_unlock_bh(&msk->pm.lock);
405 	return ret;
406 }
407 
408 int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
409 {
410 	struct mptcp_addr_info skc_local;
411 	struct mptcp_addr_info msk_local;
412 
413 	if (WARN_ON_ONCE(!msk))
414 		return -1;
415 
416 	/* The 0 ID mapping is defined by the first subflow, copied into the msk
417 	 * addr
418 	 */
419 	mptcp_local_address((struct sock_common *)msk, &msk_local);
420 	mptcp_local_address((struct sock_common *)skc, &skc_local);
421 	if (mptcp_addresses_equal(&msk_local, &skc_local, false))
422 		return 0;
423 
424 	if (mptcp_pm_is_userspace(msk))
425 		return mptcp_userspace_pm_get_local_id(msk, &skc_local);
426 	return mptcp_pm_nl_get_local_id(msk, &skc_local);
427 }
428 
429 bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc)
430 {
431 	struct mptcp_addr_info skc_local;
432 
433 	mptcp_local_address((struct sock_common *)skc, &skc_local);
434 
435 	if (mptcp_pm_is_userspace(msk))
436 		return mptcp_userspace_pm_is_backup(msk, &skc_local);
437 
438 	return mptcp_pm_nl_is_backup(msk, &skc_local);
439 }
440 
441 int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
442 					 u8 *flags, int *ifindex)
443 {
444 	*flags = 0;
445 	*ifindex = 0;
446 
447 	if (!id)
448 		return 0;
449 
450 	if (mptcp_pm_is_userspace(msk))
451 		return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
452 	return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
453 }
454 
455 int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info)
456 {
457 	if (info->attrs[MPTCP_PM_ATTR_TOKEN])
458 		return mptcp_userspace_pm_get_addr(skb, info);
459 	return mptcp_pm_nl_get_addr(skb, info);
460 }
461 
462 int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb)
463 {
464 	const struct genl_info *info = genl_info_dump(cb);
465 
466 	if (info->attrs[MPTCP_PM_ATTR_TOKEN])
467 		return mptcp_userspace_pm_dump_addr(msg, cb);
468 	return mptcp_pm_nl_dump_addr(msg, cb);
469 }
470 
471 int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info)
472 {
473 	if (info->attrs[MPTCP_PM_ATTR_TOKEN])
474 		return mptcp_userspace_pm_set_flags(skb, info);
475 	return mptcp_pm_nl_set_flags(skb, info);
476 }
477 
478 void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
479 {
480 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
481 	u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);
482 
483 	/* keep track of rtx periods with no progress */
484 	if (!subflow->stale_count) {
485 		subflow->stale_rcv_tstamp = rcv_tstamp;
486 		subflow->stale_count++;
487 	} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
488 		if (subflow->stale_count < U8_MAX)
489 			subflow->stale_count++;
490 		mptcp_pm_nl_subflow_chk_stale(msk, ssk);
491 	} else {
492 		subflow->stale_count = 0;
493 		mptcp_subflow_set_active(subflow);
494 	}
495 }
496 
497 /* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses,
498  * otherwise allow any matching local/remote pair
499  */
500 bool mptcp_pm_addr_families_match(const struct sock *sk,
501 				  const struct mptcp_addr_info *loc,
502 				  const struct mptcp_addr_info *rem)
503 {
504 	bool mptcp_is_v4 = sk->sk_family == AF_INET;
505 
506 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
507 	bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
508 	bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);
509 
510 	if (mptcp_is_v4)
511 		return loc_is_v4 && rem_is_v4;
512 
513 	if (ipv6_only_sock(sk))
514 		return !loc_is_v4 && !rem_is_v4;
515 
516 	return loc_is_v4 == rem_is_v4;
517 #else
518 	return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
519 #endif
520 }
521 
522 void mptcp_pm_data_reset(struct mptcp_sock *msk)
523 {
524 	u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
525 	struct mptcp_pm_data *pm = &msk->pm;
526 
527 	pm->add_addr_signaled = 0;
528 	pm->add_addr_accepted = 0;
529 	pm->local_addr_used = 0;
530 	pm->subflows = 0;
531 	pm->rm_list_tx.nr = 0;
532 	pm->rm_list_rx.nr = 0;
533 	WRITE_ONCE(pm->pm_type, pm_type);
534 
535 	if (pm_type == MPTCP_PM_TYPE_KERNEL) {
536 		bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
537 
538 		/* pm->work_pending must be only be set to 'true' when
539 		 * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
540 		 */
541 		WRITE_ONCE(pm->work_pending,
542 			   (!!mptcp_pm_get_local_addr_max(msk) &&
543 			    subflows_allowed) ||
544 			   !!mptcp_pm_get_add_addr_signal_max(msk));
545 		WRITE_ONCE(pm->accept_addr,
546 			   !!mptcp_pm_get_add_addr_accept_max(msk) &&
547 			   subflows_allowed);
548 		WRITE_ONCE(pm->accept_subflow, subflows_allowed);
549 	} else {
550 		WRITE_ONCE(pm->work_pending, 0);
551 		WRITE_ONCE(pm->accept_addr, 0);
552 		WRITE_ONCE(pm->accept_subflow, 0);
553 	}
554 
555 	WRITE_ONCE(pm->addr_signal, 0);
556 	WRITE_ONCE(pm->remote_deny_join_id0, false);
557 	pm->status = 0;
558 	bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
559 }
560 
561 void mptcp_pm_data_init(struct mptcp_sock *msk)
562 {
563 	spin_lock_init(&msk->pm.lock);
564 	INIT_LIST_HEAD(&msk->pm.anno_list);
565 	INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list);
566 	mptcp_pm_data_reset(msk);
567 }
568 
569 void __init mptcp_pm_init(void)
570 {
571 	mptcp_pm_nl_init();
572 }
573