xref: /linux/net/ipv4/tcp_plb.c (revision 7ae9fb1b7ecbb5d85d07857943f677fd1a559b18)
/* Protective Load Balancing (PLB)
 *
 * PLB was designed to reduce link load imbalance across datacenter
 * switches. PLB is a host-based optimization; it leverages congestion
 * signals from the transport layer to randomly change the path of the
 * connection experiencing sustained congestion. PLB prefers to repath
 * after idle periods to minimize packet reordering. It repaths by
 * changing the IPv6 Flow Label on the packets of a connection, which
 * datacenter switches include as part of ECMP/WCMP hashing.
 *
 * PLB is described in detail in:
 *
 *	Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu,
 *	Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson,
 *	David Wetherall, Abdul Kabbani:
 *	"PLB: Congestion Signals are Simple and Effective for
 *	 Network Load Balancing"
 *	In ACM SIGCOMM 2022, Amsterdam, Netherlands.
 *
 */
211a91bb7cSMubashir Adnan Qureshi 
221a91bb7cSMubashir Adnan Qureshi #include <net/tcp.h>
231a91bb7cSMubashir Adnan Qureshi 
241a91bb7cSMubashir Adnan Qureshi /* Called once per round-trip to update PLB state for a connection. */
tcp_plb_update_state(const struct sock * sk,struct tcp_plb_state * plb,const int cong_ratio)251a91bb7cSMubashir Adnan Qureshi void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
261a91bb7cSMubashir Adnan Qureshi 			  const int cong_ratio)
271a91bb7cSMubashir Adnan Qureshi {
281a91bb7cSMubashir Adnan Qureshi 	struct net *net = sock_net(sk);
291a91bb7cSMubashir Adnan Qureshi 
301a91bb7cSMubashir Adnan Qureshi 	if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
311a91bb7cSMubashir Adnan Qureshi 		return;
321a91bb7cSMubashir Adnan Qureshi 
331a91bb7cSMubashir Adnan Qureshi 	if (cong_ratio >= 0) {
341a91bb7cSMubashir Adnan Qureshi 		if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh))
351a91bb7cSMubashir Adnan Qureshi 			plb->consec_cong_rounds = 0;
361a91bb7cSMubashir Adnan Qureshi 		else if (plb->consec_cong_rounds <
371a91bb7cSMubashir Adnan Qureshi 			 READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds))
381a91bb7cSMubashir Adnan Qureshi 			plb->consec_cong_rounds++;
391a91bb7cSMubashir Adnan Qureshi 	}
401a91bb7cSMubashir Adnan Qureshi }
411a91bb7cSMubashir Adnan Qureshi EXPORT_SYMBOL_GPL(tcp_plb_update_state);
421a91bb7cSMubashir Adnan Qureshi 
/* Check whether recent congestion has been persistent enough to warrant
 * a load balancing decision that switches the connection to another path.
 *
 * Two triggers can cause a repath:
 *  - forced: the connection has seen >= sysctl_tcp_plb_rehash_rounds
 *    consecutive congested rounds;
 *  - idle: the sender currently has no packets in flight and has seen
 *    >= sysctl_tcp_plb_idle_rehash_rounds congested rounds (only when
 *    that sysctl is nonzero). Repathing while idle minimizes reordering.
 *
 * Repathing is suppressed while plb->pause_until (set on RTO) is in the
 * future, to avoid re-entering a recently black-holed path.
 */
void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
{
	struct net *net = sock_net(sk);
	u32 max_suspend;
	bool forced_rehash = false, idle_rehash = false;

	if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
		return;

	forced_rehash = plb->consec_cong_rounds >=
			READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds);
	/* If sender goes idle then we check whether to rehash. */
	idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) &&
		      !tcp_sk(sk)->packets_out &&
		      plb->consec_cong_rounds >=
		      READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds);

	if (!forced_rehash && !idle_rehash)
		return;

	/* Note that tcp_jiffies32 can wrap; we detect wraps by checking for
	 * cases where the max suspension end is before the actual suspension
	 * end. We clear pause_until to 0 to indicate there is no recent
	 * RTO event that constrains PLB rehashing.
	 */
	max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
	if (plb->pause_until &&
	    (!before(tcp_jiffies32, plb->pause_until) ||
	     before(tcp_jiffies32 + max_suspend, plb->pause_until)))
		plb->pause_until = 0;

	/* Still inside the post-RTO suspension window: defer repathing. */
	if (plb->pause_until)
		return;

	/* Pick a new tx hash so switches ECMP/WCMP-hash us onto a new path,
	 * then restart the congestion-round count for the new path.
	 */
	sk_rethink_txhash(sk);
	plb->consec_cong_rounds = 0;
	tcp_sk(sk)->plb_rehash++;
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
}
EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
861a91bb7cSMubashir Adnan Qureshi 
871a91bb7cSMubashir Adnan Qureshi /* Upon RTO, disallow load balancing for a while, to avoid having load
881a91bb7cSMubashir Adnan Qureshi  * balancing decisions switch traffic to a black-holed path that was
891a91bb7cSMubashir Adnan Qureshi  * previously avoided with a sk_rethink_txhash() call at RTO time.
901a91bb7cSMubashir Adnan Qureshi  */
tcp_plb_update_state_upon_rto(struct sock * sk,struct tcp_plb_state * plb)911a91bb7cSMubashir Adnan Qureshi void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb)
921a91bb7cSMubashir Adnan Qureshi {
931a91bb7cSMubashir Adnan Qureshi 	struct net *net = sock_net(sk);
941a91bb7cSMubashir Adnan Qureshi 	u32 pause;
951a91bb7cSMubashir Adnan Qureshi 
961a91bb7cSMubashir Adnan Qureshi 	if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
971a91bb7cSMubashir Adnan Qureshi 		return;
981a91bb7cSMubashir Adnan Qureshi 
991a91bb7cSMubashir Adnan Qureshi 	pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
100*3c202d14SJason A. Donenfeld 	pause += get_random_u32_below(pause);
1011a91bb7cSMubashir Adnan Qureshi 	plb->pause_until = tcp_jiffies32 + pause;
1021a91bb7cSMubashir Adnan Qureshi 
1031a91bb7cSMubashir Adnan Qureshi 	/* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call
1041a91bb7cSMubashir Adnan Qureshi 	 * that may switch this connection to a path with completely different
1051a91bb7cSMubashir Adnan Qureshi 	 * congestion characteristics.
1061a91bb7cSMubashir Adnan Qureshi 	 */
1071a91bb7cSMubashir Adnan Qureshi 	plb->consec_cong_rounds = 0;
1081a91bb7cSMubashir Adnan Qureshi }
1091a91bb7cSMubashir Adnan Qureshi EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
110