xref: /linux/net/ipv4/tcp_cong.c (revision a1ff5a7d78a036d6c2178ee5acd6ba4946243800)
1457c8996SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2317a76f9SStephen Hemminger /*
3b92022f3SFabian Frederick  * Pluggable TCP congestion control support and newReno
4317a76f9SStephen Hemminger  * congestion control.
502582e9bSMasanari Iida  * Based on ideas from I/O scheduler support and Web100.
6317a76f9SStephen Hemminger  *
7317a76f9SStephen Hemminger  * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
8317a76f9SStephen Hemminger  */
9317a76f9SStephen Hemminger 
10afd46503SJoe Perches #define pr_fmt(fmt) "TCP: " fmt
11afd46503SJoe Perches 
12317a76f9SStephen Hemminger #include <linux/module.h>
13317a76f9SStephen Hemminger #include <linux/mm.h>
14317a76f9SStephen Hemminger #include <linux/types.h>
15317a76f9SStephen Hemminger #include <linux/list.h>
165a0e3ad6STejun Heo #include <linux/gfp.h>
17c5c6a8abSDaniel Borkmann #include <linux/jhash.h>
18317a76f9SStephen Hemminger #include <net/tcp.h>
1915fcdf6aSPing Gan #include <trace/events/tcp.h>
20317a76f9SStephen Hemminger 
21317a76f9SStephen Hemminger static DEFINE_SPINLOCK(tcp_cong_list_lock);
22317a76f9SStephen Hemminger static LIST_HEAD(tcp_cong_list);
23317a76f9SStephen Hemminger 
24317a76f9SStephen Hemminger /* Simple linear search, don't expect many entries! */
tcp_ca_find(const char * name)250baf26b0SMartin KaFai Lau struct tcp_congestion_ops *tcp_ca_find(const char *name)
26317a76f9SStephen Hemminger {
27317a76f9SStephen Hemminger 	struct tcp_congestion_ops *e;
28317a76f9SStephen Hemminger 
295f8ef48dSStephen Hemminger 	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
30317a76f9SStephen Hemminger 		if (strcmp(e->name, name) == 0)
31317a76f9SStephen Hemminger 			return e;
32317a76f9SStephen Hemminger 	}
33317a76f9SStephen Hemminger 
34317a76f9SStephen Hemminger 	return NULL;
35317a76f9SStephen Hemminger }
36317a76f9SStephen Hemminger 
/* Record a congestion-avoidance state change on a socket.
 *
 * Fires the tcp_cong_state_set tracepoint and gives the congestion
 * control module's optional set_state() hook a chance to react while
 * icsk_ca_state still holds the previous state, then stores the new one.
 */
void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	trace_tcp_cong_state_set(sk, ca_state);

	/* Notify the CA module before updating icsk_ca_state so the hook
	 * can observe the transition (old state is still visible).
	 */
	if (icsk->icsk_ca_ops->set_state)
		icsk->icsk_ca_ops->set_state(sk, ca_state);
	icsk->icsk_ca_state = ca_state;
}
4715fcdf6aSPing Gan 
/* Must be called with rcu lock held */
static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name)
{
	struct tcp_congestion_ops *ca = tcp_ca_find(name);

#ifdef CONFIG_MODULES
	/* Not registered yet: if the caller is privileged, try loading a
	 * "tcp_<name>" module and look again.  The RCU read lock must be
	 * dropped around request_module() because module loading sleeps.
	 */
	if (!ca && capable(CAP_NET_ADMIN)) {
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		ca = tcp_ca_find(name);
	}
#endif
	return ca;
}
63c5c6a8abSDaniel Borkmann 
64c5c6a8abSDaniel Borkmann /* Simple linear search, not much in here. */
tcp_ca_find_key(u32 key)65c5c6a8abSDaniel Borkmann struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
66c5c6a8abSDaniel Borkmann {
67c5c6a8abSDaniel Borkmann 	struct tcp_congestion_ops *e;
68c5c6a8abSDaniel Borkmann 
69c5c6a8abSDaniel Borkmann 	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
70c5c6a8abSDaniel Borkmann 		if (e->key == key)
71c5c6a8abSDaniel Borkmann 			return e;
72c5c6a8abSDaniel Borkmann 	}
73c5c6a8abSDaniel Borkmann 
74c5c6a8abSDaniel Borkmann 	return NULL;
75c5c6a8abSDaniel Borkmann }
76c5c6a8abSDaniel Borkmann 
tcp_validate_congestion_control(struct tcp_congestion_ops * ca)778fb1a76aSKui-Feng Lee int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
78317a76f9SStephen Hemminger {
79e9799183SFlorian Westphal 	/* all algorithms must implement these */
80e9799183SFlorian Westphal 	if (!ca->ssthresh || !ca->undo_cwnd ||
81e9799183SFlorian Westphal 	    !(ca->cong_avoid || ca->cong_control)) {
82afd46503SJoe Perches 		pr_err("%s does not implement required ops\n", ca->name);
83317a76f9SStephen Hemminger 		return -EINVAL;
84317a76f9SStephen Hemminger 	}
85317a76f9SStephen Hemminger 
868fb1a76aSKui-Feng Lee 	return 0;
878fb1a76aSKui-Feng Lee }
888fb1a76aSKui-Feng Lee 
/* Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int ret;

	ret = tcp_validate_congestion_control(ca);
	if (ret)
		return ret;

	/* Derive a 32-bit lookup key from the name; TCP_CA_UNSPEC is
	 * reserved, so a name hashing to it cannot be registered.
	 */
	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

	spin_lock(&tcp_cong_list_lock);
	if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
		pr_notice("%s already registered or non-unique key\n",
			  ca->name);
		ret = -EEXIST;
	} else {
		/* RCU-safe insert so concurrent readers of the list stay valid. */
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		pr_debug("%s registered\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
116317a76f9SStephen Hemminger 
/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
	spin_lock(&tcp_cong_list_lock);
	list_del_rcu(&ca->list);
	spin_unlock(&tcp_cong_list_lock);

	/* Wait for outstanding readers to complete before the
	 * module gets removed entirely.
	 *
	 * A try_module_get() should fail by now as our module is
	 * in "going" state since no refs are held anymore and
	 * module_exit() handler being called.
	 */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
139317a76f9SStephen Hemminger 
/* Replace a registered old ca with a new one.
 *
 * The new ca must have the same name as the old one, that has been
 * registered.
 */
int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
{
	struct tcp_congestion_ops *existing;
	int ret = 0;

	/* Same name implies the same jhash key as the old registration. */
	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

	spin_lock(&tcp_cong_list_lock);
	existing = tcp_ca_find_key(old_ca->key);
	if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
		pr_notice("%s not registered or non-unique key\n",
			  ca->name);
		ret = -EINVAL;
	} else if (existing != old_ca) {
		pr_notice("invalid old congestion control algorithm to replace\n");
		ret = -EINVAL;
	} else {
		/* Add the new one before removing the old one to keep
		 * one implementation available all the time.
		 */
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		list_del_rcu(&existing->list);
		pr_debug("%s updated\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	/* Wait for outstanding readers to complete before the
	 * module or struct_ops gets removed entirely.
	 */
	if (!ret)
		synchronize_rcu();

	return ret;
}
1798fb1a76aSKui-Feng Lee 
/* Look up a congestion control's key by name, autoloading its module
 * when permitted.  Returns TCP_CA_UNSPEC if no such algorithm exists.
 * On success, *ecn_ca reports whether it has TCP_CONG_NEEDS_ECN set.
 */
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
	const struct tcp_congestion_ops *ca;
	u32 key = TCP_CA_UNSPEC;

	/* tcp_ca_find_autoload() may sleep while loading a module. */
	might_sleep();

	rcu_read_lock();
	ca = tcp_ca_find_autoload(name);
	if (ca) {
		key = ca->key;
		*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
	}
	rcu_read_unlock();

	return key;
}
197c5c6a8abSDaniel Borkmann 
/* Resolve a congestion control key back to its name.  Copies the name
 * into the caller's buffer (TCP_CA_NAME_MAX bytes) and returns it, or
 * returns NULL when the key is unknown.
 */
char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
	const struct tcp_congestion_ops *ca;
	char *name;

	rcu_read_lock();
	ca = tcp_ca_find_key(key);
	if (!ca) {
		name = NULL;
	} else {
		strscpy(buffer, ca->name, TCP_CA_NAME_MAX);
		name = buffer;
	}
	rcu_read_unlock();

	return name;
}
213c5c6a8abSDaniel Borkmann 
/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;

	rcu_read_lock();
	ca = rcu_dereference(net->ipv4.tcp_congestion_control);
	/* If a ref on the netns default cannot be taken (e.g. its module
	 * is going away), fall back to the built-in Reno.
	 */
	if (unlikely(!bpf_try_module_get(ca, ca->owner)))
		ca = &tcp_reno;
	icsk->icsk_ca_ops = ca;
	rcu_read_unlock();

	/* Fresh socket: clear any stale per-CA private state. */
	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
	if (ca->flags & TCP_CONG_NEEDS_ECN)
		INET_ECN_xmit(sk);
	else
		INET_ECN_dontxmit(sk);
}
234317a76f9SStephen Hemminger 
/* Initialize the congestion control algorithm currently attached to the
 * socket: reset undo state, run the CA's init() hook, set ECN marking
 * to match the CA's needs, and flag the CA as initialized.
 */
void tcp_init_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_sk(sk)->prior_ssthresh = 0;
	if (icsk->icsk_ca_ops->init)
		icsk->icsk_ca_ops->init(sk);
	if (tcp_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);
	else
		INET_ECN_dontxmit(sk);
	/* Set only after init() has completed. */
	icsk->icsk_ca_initialized = 1;
}
248317a76f9SStephen Hemminger 
/* Swap the socket's congestion control for @ca: release the old CA,
 * install the new ops with cleared private state, update ECN marking,
 * and re-run initialization unless the socket is closed/listening.
 */
static void tcp_reinit_congestion_control(struct sock *sk,
					  const struct tcp_congestion_ops *ca)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* Drop the old CA's resources/refs before installing the new ops. */
	tcp_cleanup_congestion_control(sk);
	icsk->icsk_ca_ops = ca;
	icsk->icsk_ca_setsockopt = 1;
	/* Private CA scratch area belonged to the old algorithm; wipe it. */
	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));

	if (ca->flags & TCP_CONG_NEEDS_ECN)
		INET_ECN_xmit(sk);
	else
		INET_ECN_dontxmit(sk);

	/* Only initialize on sockets in an active state. */
	if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		tcp_init_congestion_control(sk);
}
26729ba4fffSDaniel Borkmann 
268317a76f9SStephen Hemminger /* Manage refcounts on socket close. */
tcp_cleanup_congestion_control(struct sock * sk)2696687e988SArnaldo Carvalho de Melo void tcp_cleanup_congestion_control(struct sock *sk)
270317a76f9SStephen Hemminger {
2716687e988SArnaldo Carvalho de Melo 	struct inet_connection_sock *icsk = inet_csk(sk);
2726687e988SArnaldo Carvalho de Melo 
2736687e988SArnaldo Carvalho de Melo 	if (icsk->icsk_ca_ops->release)
2746687e988SArnaldo Carvalho de Melo 		icsk->icsk_ca_ops->release(sk);
2750baf26b0SMartin KaFai Lau 	bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
276317a76f9SStephen Hemminger }
277317a76f9SStephen Hemminger 
/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(struct net *net, const char *name)
{
	struct tcp_congestion_ops *ca;
	const struct tcp_congestion_ops *prev;
	int ret;

	rcu_read_lock();
	ca = tcp_ca_find_autoload(name);
	if (!ca) {
		ret = -ENOENT;
	} else if (!bpf_try_module_get(ca, ca->owner)) {
		ret = -EBUSY;
	} else if (!net_eq(net, &init_net) &&
			!(ca->flags & TCP_CONG_NON_RESTRICTED)) {
		/* Only init netns can set default to a restricted algorithm */
		ret = -EPERM;
	} else {
		/* Atomically install the new default and release the ref
		 * held on the previous one.
		 */
		prev = xchg(&net->ipv4.tcp_congestion_control, ca);
		if (prev)
			bpf_module_put(prev, prev->owner);

		/* Being the default makes the CA selectable by everyone. */
		ca->flags |= TCP_CONG_NON_RESTRICTED;
		ret = 0;
	}
	rcu_read_unlock();

	return ret;
}
307317a76f9SStephen Hemminger 
/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
	return tcp_set_default_congestion_control(&init_net,
						  CONFIG_DEFAULT_TCP_CONG);
}
/* late_initcall: runs after built-in CAs have had a chance to register. */
late_initcall(tcp_congestion_default);
315b1736a71SStephen Hemminger 
/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		/* Space-separated names; no leading separator. */
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);

		/* snprintf() returns the would-be length, so offs can reach
		 * or pass maxlen on truncation; stop appending then.
		 */
		if (WARN_ON_ONCE(offs >= maxlen))
			break;
	}
	rcu_read_unlock();
}
3333ff825b2SStephen Hemminger 
/* Get current default congestion control */
void tcp_get_default_congestion_control(struct net *net, char *name)
{
	const struct tcp_congestion_ops *ca;

	rcu_read_lock();
	ca = rcu_dereference(net->ipv4.tcp_congestion_control);
	/* Caller's buffer must hold at least TCP_CA_NAME_MAX bytes. */
	strscpy(name, ca->name, TCP_CA_NAME_MAX);
	rcu_read_unlock();
}
344317a76f9SStephen Hemminger 
/* Built list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	*buf = '\0';
	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		/* Only CAs usable without CAP_NET_ADMIN are listed. */
		if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
			continue;
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);

		/* snprintf() reports the would-be length; offs >= maxlen
		 * means the output was truncated, so stop.
		 */
		if (WARN_ON_ONCE(offs >= maxlen))
			break;
	}
	rcu_read_unlock();
}
365ce7bc3bfSStephen Hemminger 
/* Change list of non-restricted congestion control */
int tcp_set_allowed_congestion_control(char *val)
{
	struct tcp_congestion_ops *ca;
	char *saved_clone, *clone, *name;
	int ret = 0;

	/* Validate against a copy first; strsep() mutates its input and
	 * @val is needed intact for pass 3.
	 */
	saved_clone = clone = kstrdup(val, GFP_USER);
	if (!clone)
		return -ENOMEM;

	spin_lock(&tcp_cong_list_lock);
	/* pass 1 check for bad entries */
	while ((name = strsep(&clone, " ")) && *name) {
		ca = tcp_ca_find(name);
		if (!ca) {
			ret = -ENOENT;
			goto out;
		}
	}

	/* pass 2 clear old values */
	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
		ca->flags &= ~TCP_CONG_NON_RESTRICTED;

	/* pass 3 mark as allowed */
	while ((name = strsep(&val, " ")) && *name) {
		ca = tcp_ca_find(name);
		/* Entries were validated in pass 1 under the same lock. */
		WARN_ON(!ca);
		if (ca)
			ca->flags |= TCP_CONG_NON_RESTRICTED;
	}
out:
	spin_unlock(&tcp_cong_list_lock);
	kfree(saved_clone);

	return ret;
}
404ce7bc3bfSStephen Hemminger 
/* Change congestion control for socket. If load is false, then it is the
 * responsibility of the caller to call tcp_init_congestion_control or
 * tcp_reinit_congestion_control (if the current congestion control was
 * already initialized.
 */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
			       bool cap_net_admin)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;
	int err = 0;

	/* The CA was pinned via destination/route metrics; refuse override. */
	if (icsk->icsk_ca_dst_locked)
		return -EPERM;

	rcu_read_lock();
	if (!load)
		ca = tcp_ca_find(name);
	else
		ca = tcp_ca_find_autoload(name);

	/* No change asking for existing value */
	if (ca == icsk->icsk_ca_ops) {
		icsk->icsk_ca_setsockopt = 1;
		goto out;
	}

	if (!ca)
		err = -ENOENT;
	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
		err = -EPERM;	/* restricted CA and caller lacks privilege */
	else if (!bpf_try_module_get(ca, ca->owner))
		err = -EBUSY;	/* CA module is going away */
	else
		tcp_reinit_congestion_control(sk, ca);
 out:
	rcu_read_unlock();
	return err;
}
4445f8ef48dSStephen Hemminger 
/* Slow start is used when congestion window is no greater than the slow start
 * threshold. We base on RFC2581 and also handle stretch ACKs properly.
 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
 * something better;) a packet is only considered (s)acked in its entirety to
 * defend the ACK attacks described in the RFC. Slow start processes a stretch
 * ACK of degree N as if N acks of degree 1 are received back to back except
 * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
 * returns the leftover acks to adjust cwnd in congestion avoidance mode.
 */
__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
	/* Grow cwnd by one per acked packet, capped at ssthresh. */
	u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);

	/* Acks consumed reaching ssthresh are used up; the remainder is
	 * returned for the caller to apply in congestion avoidance.
	 */
	acked -= cwnd - tcp_snd_cwnd(tp);
	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));

	return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
46440efc6faSStephen Hemminger 
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
 * for every packet that was ACKed.
 */
__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
	/* If credits accumulated at a higher w, apply them gently now. */
	if (tp->snd_cwnd_cnt >= w) {
		tp->snd_cwnd_cnt = 0;
		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
	}

	tp->snd_cwnd_cnt += acked;
	if (tp->snd_cwnd_cnt >= w) {
		u32 delta = tp->snd_cwnd_cnt / w;

		/* Keep the remainder as credit toward the next increase. */
		tp->snd_cwnd_cnt -= delta * w;
		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
	}
	tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
486758ce5c8SIlpo Järvinen 
/*
 * TCP Reno congestion control
 * This is special case used for fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No growth when the sender isn't limited by cwnd. */
	if (!tcp_is_cwnd_limited(sk))
		return;

	/* In "safe" area, increase. */
	if (tcp_in_slow_start(tp)) {
		/* Leftover acks (cwnd reached ssthresh) spill into AI below. */
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	/* In dangerous area, increase slowly. */
	tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
511317a76f9SStephen Hemminger 
512317a76f9SStephen Hemminger /* Slow start threshold is half the congestion window (min 2) */
tcp_reno_ssthresh(struct sock * sk)513400031e0SDavid Vernet __bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
514317a76f9SStephen Hemminger {
5156687e988SArnaldo Carvalho de Melo 	const struct tcp_sock *tp = tcp_sk(sk);
516688d1945Sstephen hemminger 
51740570375SEric Dumazet 	return max(tcp_snd_cwnd(tp) >> 1U, 2U);
518317a76f9SStephen Hemminger }
519317a76f9SStephen Hemminger EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
520317a76f9SStephen Hemminger 
tcp_reno_undo_cwnd(struct sock * sk)521400031e0SDavid Vernet __bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
522e9799183SFlorian Westphal {
523e9799183SFlorian Westphal 	const struct tcp_sock *tp = tcp_sk(sk);
524e9799183SFlorian Westphal 
52540570375SEric Dumazet 	return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
526e9799183SFlorian Westphal }
527e9799183SFlorian Westphal EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
528e9799183SFlorian Westphal 
/* Built-in Reno ops: the registration-time fallback and default.
 * TCP_CONG_NON_RESTRICTED makes it selectable without CAP_NET_ADMIN.
 */
struct tcp_congestion_ops tcp_reno = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "reno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= tcp_reno_undo_cwnd,
};
537