xref: /linux/net/ipv4/tcp_metrics.c (revision 1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95)
// SPDX-License-Identifier: GPL-2.0
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>
#include <linux/vmalloc.h>

#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/genetlink.h>

static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
						   const struct inetpeer_addr *daddr,
						   struct net *net, unsigned int hash);

struct tcp_fastopen_metrics {
	u16	mss;
	u16	syn_loss:10,		/* Recurring Fast Open SYN losses */
		try_exp:2;		/* Request w/ exp. option (once) */
	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
	struct	tcp_fastopen_cookie	cookie;
};

/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
 * Kernel only stores RTT and RTTVAR in usec resolution
 */
#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)

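/* One cached-metrics entry, keyed by (source, destination, netns) and
 * chained off a tcpm_hash_bucket under RCU.  tcpm_lock is a bitmask of
 * metrics locked read-only by the route; tcpm_stamp ages the entry.
 */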
struct tcp_metrics_block {
	struct tcp_metrics_block __rcu	*tcpm_next;
	struct net			*tcpm_net;
	struct inetpeer_addr		tcpm_saddr;
	struct inetpeer_addr		tcpm_daddr;
	unsigned long			tcpm_stamp;
	u32				tcpm_lock;
	u32				tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
	struct tcp_fastopen_metrics	tcpm_fastopen;

	struct rcu_head			rcu_head;
};

static inline struct net *tm_net(const struct tcp_metrics_block *tm)
{
	/* Paired with the WRITE_ONCE() in tcpm_new() */
	return READ_ONCE(tm->tcpm_net);
}

static bool tcp_metric_locked(struct tcp_metrics_block *tm,
			      enum tcp_metric_index idx)
{
	/* Paired with WRITE_ONCE() in tcpm_suck_dst() */
	return READ_ONCE(tm->tcpm_lock) & (1 << idx);
}

static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
			  enum tcp_metric_index idx)
{
	/* Paired with WRITE_ONCE() in tcp_metric_set() */
	return READ_ONCE(tm->tcpm_vals[idx]);
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
			   enum tcp_metric_index idx,
			   u32 val)
{
	/* Paired with READ_ONCE() in tcp_metric_get() */
	WRITE_ONCE(tm->tcpm_vals[idx], val);
}

static bool addr_same(const struct inetpeer_addr *a,
		      const struct inetpeer_addr *b)
{
	return (a->family == b->family) && !inetpeer_addr_cmp(a, b);
}

struct tcpm_hash_bucket {
	struct tcp_metrics_block __rcu	*chain;
};

static struct tcpm_hash_bucket	*tcp_metrics_hash __read_mostly;
static unsigned int		tcp_metrics_hash_log __read_mostly;

static DEFINE_SPINLOCK(tcp_metrics_lock);
static DEFINE_SEQLOCK(fastopen_seqlock);

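/* (Re)initialise a cache entry from the route: latch which metrics are
 * locked on the dst and copy its current RTT/RTTVAR (converted to usec),
 * ssthresh, cwnd and reordering values.  Optionally clear Fast Open state.
 */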
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
			  const struct dst_entry *dst,
			  bool fastopen_clear)
{
	u32 msval;
	u32 val;

	WRITE_ONCE(tm->tcpm_stamp, jiffies);

	val = 0;
	if (dst_metric_locked(dst, RTAX_RTT))
		val |= 1 << TCP_METRIC_RTT;
	if (dst_metric_locked(dst, RTAX_RTTVAR))
		val |= 1 << TCP_METRIC_RTTVAR;
	if (dst_metric_locked(dst, RTAX_SSTHRESH))
		val |= 1 << TCP_METRIC_SSTHRESH;
	if (dst_metric_locked(dst, RTAX_CWND))
		val |= 1 << TCP_METRIC_CWND;
	if (dst_metric_locked(dst, RTAX_REORDERING))
		val |= 1 << TCP_METRIC_REORDERING;
	/* Paired with READ_ONCE() in tcp_metric_locked() */
	WRITE_ONCE(tm->tcpm_lock, val);

	msval = dst_metric_raw(dst, RTAX_RTT);
	tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC);

	msval = dst_metric_raw(dst, RTAX_RTTVAR);
	tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC);
	tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
		       dst_metric_raw(dst, RTAX_SSTHRESH));
	tcp_metric_set(tm, TCP_METRIC_CWND,
		       dst_metric_raw(dst, RTAX_CWND));
	tcp_metric_set(tm, TCP_METRIC_REORDERING,
		       dst_metric_raw(dst, RTAX_REORDERING));
	if (fastopen_clear) {
		write_seqlock(&fastopen_seqlock);
		tm->tcpm_fastopen.mss = 0;
		tm->tcpm_fastopen.syn_loss = 0;
		tm->tcpm_fastopen.try_exp = 0;
		tm->tcpm_fastopen.cookie.exp = false;
		tm->tcpm_fastopen.cookie.len = 0;
		write_sequnlock(&fastopen_seqlock);
	}
}

#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ)

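/* Re-sync a cache entry with the dst if it has not been refreshed for
 * more than TCP_METRICS_TIMEOUT (one hour), keeping Fast Open state.
 */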
static void tcpm_check_stamp(struct tcp_metrics_block *tm,
			     const struct dst_entry *dst)
{
	unsigned long limit;

	if (!tm)
		return;
	limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
	if (unlikely(time_after(jiffies, limit)))
		tcpm_suck_dst(tm, dst, false);
}

#define TCP_METRICS_RECLAIM_DEPTH	5
#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL

#define deref_locked(p)	\
	rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))

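/* Create a new cache entry for (saddr, daddr), or recycle the oldest
 * entry of an over-long chain when the lookup returned the reclaim
 * marker.  Serialised on tcp_metrics_lock.
 */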
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
					  struct inetpeer_addr *saddr,
					  struct inetpeer_addr *daddr,
					  unsigned int hash)
{
	struct tcp_metrics_block *tm;
	struct net *net;
	bool reclaim = false;

	spin_lock_bh(&tcp_metrics_lock);
	net = dev_net_rcu(dst->dev);

	/* While waiting for the spin-lock, the cache might have been
	 * populated with this entry, so we have to check again.
	 */
	tm = __tcp_get_metrics(saddr, daddr, net, hash);
	if (tm == TCP_METRICS_RECLAIM_PTR) {
		reclaim = true;
		tm = NULL;
	}
	if (tm) {
		tcpm_check_stamp(tm, dst);
		goto out_unlock;
	}

	if (unlikely(reclaim)) {
		struct tcp_metrics_block *oldest;

		oldest = deref_locked(tcp_metrics_hash[hash].chain);
		for (tm = deref_locked(oldest->tcpm_next); tm;
		     tm = deref_locked(tm->tcpm_next)) {
			if (time_before(READ_ONCE(tm->tcpm_stamp),
					READ_ONCE(oldest->tcpm_stamp)))
				oldest = tm;
		}
		tm = oldest;
	} else {
		tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
		if (!tm)
			goto out_unlock;
	}
	/* Paired with the READ_ONCE() in tm_net() */
	WRITE_ONCE(tm->tcpm_net, net);

	tm->tcpm_saddr = *saddr;
	tm->tcpm_daddr = *daddr;

	tcpm_suck_dst(tm, dst, reclaim);

	if (likely(!reclaim)) {
		tm->tcpm_next = tcp_metrics_hash[hash].chain;
		rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
	}

out_unlock:
	spin_unlock_bh(&tcp_metrics_lock);
	return tm;
}

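/* Translate the result of a chain walk: a hit is returned as-is, a miss
 * on a chain deeper than TCP_METRICS_RECLAIM_DEPTH is reported as
 * TCP_METRICS_RECLAIM_PTR so the caller recycles the oldest entry.
 */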
static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
	if (tm)
		return tm;
	if (depth > TCP_METRICS_RECLAIM_DEPTH)
		return TCP_METRICS_RECLAIM_PTR;
	return NULL;
}

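/* RCU lookup of a (saddr, daddr, netns) entry in the given hash bucket. */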
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
						   const struct inetpeer_addr *daddr,
						   struct net *net, unsigned int hash)
{
	struct tcp_metrics_block *tm;
	int depth = 0;

	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_saddr, saddr) &&
		    addr_same(&tm->tcpm_daddr, daddr) &&
		    net_eq(tm_net(tm), net))
			break;
		depth++;
	}
	return tcp_get_encode(tm, depth);
}

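/* Like __tcp_get_metrics(), but the address pair and hash are derived
 * from a request socket; a stale hit is refreshed from the dst.
 */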
static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
						       struct dst_entry *dst)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr saddr, daddr;
	unsigned int hash;
	struct net *net;

	saddr.family = req->rsk_ops->family;
	daddr.family = req->rsk_ops->family;
	switch (daddr.family) {
	case AF_INET:
		inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr);
		inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr);
		hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr);
		inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr);
		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
		break;
#endif
	default:
		return NULL;
	}

	net = dev_net_rcu(dst->dev);
	hash ^= net_hash_mix(net);
	hash = hash_32(hash, tcp_metrics_hash_log);

	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_saddr, &saddr) &&
		    addr_same(&tm->tcpm_daddr, &daddr) &&
		    net_eq(tm_net(tm), net))
			break;
	}
	tcpm_check_stamp(tm, dst);
	return tm;
}

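/* Find the metrics entry for a full socket (IPv4, IPv6 or v4-mapped),
 * optionally creating one; a stale existing entry is refreshed from the dst.
 */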
static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
						 struct dst_entry *dst,
						 bool create)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr saddr, daddr;
	unsigned int hash;
	struct net *net;

	if (sk->sk_family == AF_INET) {
		inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
		inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
		hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (sk->sk_family == AF_INET6) {
		if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
			inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
			inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
			hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
		} else {
			inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr);
			inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr);
			hash = ipv6_addr_hash(&sk->sk_v6_daddr);
		}
	}
#endif
	else
		return NULL;

	net = dev_net_rcu(dst->dev);
	hash ^= net_hash_mix(net);
	hash = hash_32(hash, tcp_metrics_hash_log);

	tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
	if (tm == TCP_METRICS_RECLAIM_PTR)
		tm = NULL;
	if (!tm && create)
		tm = tcpm_new(dst, &saddr, &daddr, hash);
	else
		tcpm_check_stamp(tm, dst);

	return tm;
}

/* Save metrics learned by this TCP session.  This function is called
 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
 * or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct tcp_metrics_block *tm;
	unsigned long rtt;
	u32 val;
	int m;

	sk_dst_confirm(sk);
	if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
		return;

	rcu_read_lock();
	if (icsk->icsk_backoff || !tp->srtt_us) {
		/* This session failed to estimate rtt. Why?
		 * Probably, no packets returned in time.  Reset our
		 * results.
		 */
		tm = tcp_get_metrics(sk, dst, false);
		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
			tcp_metric_set(tm, TCP_METRIC_RTT, 0);
		goto out_unlock;
	} else
		tm = tcp_get_metrics(sk, dst, true);

	if (!tm)
		goto out_unlock;

	rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
	m = rtt - tp->srtt_us;

	/* If the newly calculated rtt is larger than the stored one,
	 * store the new one. Otherwise, use EWMA. Remember, rtt
	 * overestimation is always better than underestimation.
	 */
	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
		if (m <= 0)
			rtt = tp->srtt_us;
		else
			rtt -= (m >> 3);
		tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
	}

	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
		unsigned long var;

		if (m < 0)
			m = -m;

		/* Scale deviation to rttvar fixed point */
		m >>= 1;
		if (m < tp->mdev_us)
			m = tp->mdev_us;

		var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
		if (m >= var)
			var = m;
		else
			var -= (var - m) >> 2;

		tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
	}

	if (tcp_in_initial_slowstart(tp)) {
		/* Slow start still did not finish. */
		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && (tcp_snd_cwnd(tp) >> 1) > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tcp_snd_cwnd(tp) >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			if (tcp_snd_cwnd(tp) > val)
				tcp_metric_set(tm, TCP_METRIC_CWND,
					       tcp_snd_cwnd(tp));
		}
	} else if (!tcp_in_slow_start(tp) &&
		   icsk->icsk_ca_state == TCP_CA_Open) {
		/* Cong. avoidance phase, cwnd is reliable. */
		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
				       max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1);
		}
	} else {
		/* Else slow start did not finish, cwnd is not meaningful
		 * and ssthresh may also be invalid.
		 */
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND,
				       (val + tp->snd_ssthresh) >> 1);
		}
		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && tp->snd_ssthresh > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_ssthresh);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
			if (val < tp->reordering &&
			    tp->reordering !=
			    READ_ONCE(net->ipv4.sysctl_tcp_reordering))
				tcp_metric_set(tm, TCP_METRIC_REORDERING,
					       tp->reordering);
		}
	}
	WRITE_ONCE(tm->tcpm_stamp, jiffies);
out_unlock:
	rcu_read_unlock();
}

/* Initialize metrics on socket. */

void tcp_init_metrics(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct tcp_metrics_block *tm;
	u32 val, crtt = 0; /* cached RTT scaled by 8 */

	sk_dst_confirm(sk);
	/* ssthresh may have been reduced unnecessarily during the
	 * 3WHS. Restore it back to its initial default.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	if (!dst)
		goto reset;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, dst, false);
	if (!tm) {
		rcu_read_unlock();
		goto reset;
	}

	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

	val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
	      0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
	if (val) {
		tp->snd_ssthresh = val;
		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
			tp->snd_ssthresh = tp->snd_cwnd_clamp;
	}
	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
	if (val && tp->reordering != val)
		tp->reordering = val;

	crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
	rcu_read_unlock();
reset:
	/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
	 * to seed the RTO for later data packets because SYN packets are
	 * small. Use the per-dst cached values to seed the RTO but keep
	 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
	 * Later the RTO will be updated immediately upon obtaining the first
	 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
	 * influences the first RTO but not later RTT estimation.
	 *
	 * But if RTT is not available from the SYN (due to retransmits or
	 * syn cookies) or the cache, force a conservative 3secs timeout.
	 *
	 * A bit of theory. RTT is the time that passes after a "normal"
	 * sized packet is sent until it is ACKed. In normal circumstances
	 * sending small packets forces the peer to delay ACKs, so the
	 * calculation is still correct. The algorithm is adaptive and,
	 * provided we follow the specs, it NEVER underestimates RTT. BUT!
	 * If the peer plays clever tricks, e.g. sending "quick acks" for
	 * long enough to drive the RTT down to a low value and then
	 * abruptly starting to delay ACKs again, expect trouble.
	 */
	if (crtt > tp->srtt_us) {
		/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
		crtt /= 8 * USEC_PER_SEC / HZ;
		inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
	} else if (tp->srtt_us == 0) {
		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
		 * 3WHS. This is most likely due to retransmission,
		 * including spurious ones. Reset the RTO back to 3secs
		 * from the more aggressive 1sec to avoid more spurious
		 * retransmission.
		 */
		tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
		tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;

		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
	}
}

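/* A destination is "proven" if we already hold a cached RTT sample for it. */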
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
{
	struct tcp_metrics_block *tm;
	bool ret;

	if (!dst)
		return false;

	rcu_read_lock();
	tm = __tcp_get_metrics_req(req, dst);
	if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
		ret = true;
	else
		ret = false;
	rcu_read_unlock();

	return ret;
}

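/* Read the cached Fast Open MSS and cookie for this destination under
 * the fastopen seqlock.
 */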
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
			    struct tcp_fastopen_cookie *cookie)
{
	struct tcp_metrics_block *tm;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
	if (tm) {
		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
		unsigned int seq;

		do {
			seq = read_seqbegin(&fastopen_seqlock);
			if (tfom->mss)
				*mss = tfom->mss;
			*cookie = tfom->cookie;
			if (cookie->len <= 0 && tfom->try_exp == 1)
				cookie->exp = true;
		} while (read_seqretry(&fastopen_seqlock, seq));
	}
	rcu_read_unlock();
}

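/* Update the cached Fast Open state for this destination: remember the
 * negotiated MSS and cookie and track (or reset) recurring SYN losses.
 */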
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
			    struct tcp_fastopen_cookie *cookie, bool syn_lost,
			    u16 try_exp)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_metrics_block *tm;

	if (!dst)
		return;
	rcu_read_lock();
	tm = tcp_get_metrics(sk, dst, true);
	if (tm) {
		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;

		write_seqlock_bh(&fastopen_seqlock);
		if (mss)
			tfom->mss = mss;
		if (cookie && cookie->len > 0)
			tfom->cookie = *cookie;
		else if (try_exp > tfom->try_exp &&
			 tfom->cookie.len <= 0 && !tfom->cookie.exp)
			tfom->try_exp = try_exp;
		if (syn_lost) {
			++tfom->syn_loss;
			tfom->last_syn_loss = jiffies;
		} else
			tfom->syn_loss = 0;
		write_sequnlock_bh(&fastopen_seqlock);
	}
	rcu_read_unlock();
}

static struct genl_family tcp_metrics_nl_family;

static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
	[TCP_METRICS_ATTR_ADDR_IPV4]	= { .type = NLA_U32, },
	[TCP_METRICS_ATTR_ADDR_IPV6]	=
		NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),

	[TCP_METRICS_ATTR_SADDR_IPV4]	= { .type = NLA_U32, },
	[TCP_METRICS_ATTR_SADDR_IPV6]	=
		NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),

	/* The following attributes are not received for GET/DEL;
	 * we keep them for reference.
	 */
#if 0
	[TCP_METRICS_ATTR_AGE]		= { .type = NLA_MSECS, },
	[TCP_METRICS_ATTR_TW_TSVAL]	= { .type = NLA_U32, },
	[TCP_METRICS_ATTR_TW_TS_STAMP]	= { .type = NLA_S32, },
	[TCP_METRICS_ATTR_VALS]		= { .type = NLA_NESTED, },
	[TCP_METRICS_ATTR_FOPEN_MSS]	= { .type = NLA_U16, },
	[TCP_METRICS_ATTR_FOPEN_SYN_DROPS]	= { .type = NLA_U16, },
	[TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]	= { .type = NLA_MSECS, },
	[TCP_METRICS_ATTR_FOPEN_COOKIE]	= { .type = NLA_BINARY,
					    .len = TCP_FASTOPEN_COOKIE_MAX, },
#endif
};

/* Add attributes, caller cancels its header on failure */
static int tcp_metrics_fill_info(struct sk_buff *msg,
				 struct tcp_metrics_block *tm)
{
	struct nlattr *nest;
	int i;

	switch (tm->tcpm_daddr.family) {
	case AF_INET:
		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
				    inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0)
			goto nla_put_failure;
		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
				    inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0)
			goto nla_put_failure;
		break;
	case AF_INET6:
		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
				     inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0)
			goto nla_put_failure;
		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
				     inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0)
			goto nla_put_failure;
		break;
	default:
		return -EAFNOSUPPORT;
	}

	if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
			  jiffies - READ_ONCE(tm->tcpm_stamp),
			  TCP_METRICS_ATTR_PAD) < 0)
		goto nla_put_failure;

	{
		int n = 0;

		nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
		if (!nest)
			goto nla_put_failure;
		for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
			u32 val = tcp_metric_get(tm, i);

			if (!val)
				continue;
			if (i == TCP_METRIC_RTT) {
				if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
						val) < 0)
					goto nla_put_failure;
				n++;
				val = max(val / 1000, 1U);
			}
			if (i == TCP_METRIC_RTTVAR) {
				if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
						val) < 0)
					goto nla_put_failure;
				n++;
				val = max(val / 1000, 1U);
			}
			if (nla_put_u32(msg, i + 1, val) < 0)
				goto nla_put_failure;
			n++;
		}
		if (n)
			nla_nest_end(msg, nest);
		else
			nla_nest_cancel(msg, nest);
	}

	{
		struct tcp_fastopen_metrics tfom_copy[1], *tfom;
		unsigned int seq;

		do {
			seq = read_seqbegin(&fastopen_seqlock);
			tfom_copy[0] = tm->tcpm_fastopen;
		} while (read_seqretry(&fastopen_seqlock, seq));

		tfom = tfom_copy;
		if (tfom->mss &&
		    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
				tfom->mss) < 0)
			goto nla_put_failure;
		if (tfom->syn_loss &&
		    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
				tfom->syn_loss) < 0 ||
		     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
				jiffies - tfom->last_syn_loss,
				TCP_METRICS_ATTR_PAD) < 0))
			goto nla_put_failure;
		if (tfom->cookie.len > 0 &&
		    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
			    tfom->cookie.len, tfom->cookie.val) < 0)
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static int tcp_metrics_dump_info(struct sk_buff *skb,
				 struct netlink_callback *cb,
				 struct tcp_metrics_block *tm)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &tcp_metrics_nl_family, NLM_F_MULTI,
			  TCP_METRICS_CMD_GET);
	if (!hdr)
		return -EMSGSIZE;

	if (tcp_metrics_fill_info(skb, tm) < 0)
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

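/* Netlink dump: walk every hash bucket, resuming from the row/column
 * stored in cb->args, and emit one message per entry in this netns.
 */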
static int tcp_metrics_nl_dump(struct sk_buff *skb,
			       struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	unsigned int max_rows = 1U << tcp_metrics_hash_log;
	unsigned int row, s_row = cb->args[0];
	int s_col = cb->args[1], col = s_col;
	int res = 0;

	for (row = s_row; row < max_rows; row++, s_col = 0) {
		struct tcp_metrics_block *tm;
		struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;

		rcu_read_lock();
		for (col = 0, tm = rcu_dereference(hb->chain); tm;
		     tm = rcu_dereference(tm->tcpm_next), col++) {
			if (!net_eq(tm_net(tm), net))
				continue;
			if (col < s_col)
				continue;
			res = tcp_metrics_dump_info(skb, cb, tm);
			if (res < 0) {
				rcu_read_unlock();
				goto done;
			}
		}
		rcu_read_unlock();
	}

done:
	cb->args[0] = row;
	cb->args[1] = col;
	return res;
}

static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
			   unsigned int *hash, int optional, int v4, int v6)
{
	struct nlattr *a;

	a = info->attrs[v4];
	if (a) {
		inetpeer_set_addr_v4(addr, nla_get_in_addr(a));
		if (hash)
			*hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
		return 0;
	}
	a = info->attrs[v6];
	if (a) {
		struct in6_addr in6;

		in6 = nla_get_in6_addr(a);
		inetpeer_set_addr_v6(addr, &in6);
		if (hash)
			*hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr));
		return 0;
	}
	return optional ? 1 : -EAFNOSUPPORT;
}

static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
			 unsigned int *hash, int optional)
{
	return __parse_nl_addr(info, addr, hash, optional,
			       TCP_METRICS_ATTR_ADDR_IPV4,
			       TCP_METRICS_ATTR_ADDR_IPV6);
}

static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
{
	return __parse_nl_addr(info, addr, NULL, 0,
			       TCP_METRICS_ATTR_SADDR_IPV4,
			       TCP_METRICS_ATTR_SADDR_IPV6);
}

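/* TCP_METRICS_CMD_GET doit handler: look up one entry by destination
 * (and optional source) address and return it in a new message.
 */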
static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr saddr, daddr;
	unsigned int hash;
	struct sk_buff *msg;
	struct net *net = genl_info_net(info);
	void *reply;
	int ret;
	bool src = true;

	ret = parse_nl_addr(info, &daddr, &hash, 0);
	if (ret < 0)
		return ret;

	ret = parse_nl_saddr(info, &saddr);
	if (ret < 0)
		src = false;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
				  info->genlhdr->cmd);
	if (!reply)
		goto nla_put_failure;

	hash ^= net_hash_mix(net);
	hash = hash_32(hash, tcp_metrics_hash_log);
	ret = -ESRCH;
	rcu_read_lock();
	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_daddr, &daddr) &&
		    (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
		    net_eq(tm_net(tm), net)) {
			ret = tcp_metrics_fill_info(msg, tm);
			break;
		}
	}
	rcu_read_unlock();
	if (ret < 0)
		goto out_free;

	genlmsg_end(msg, reply);
	return genlmsg_reply(msg, info);

nla_put_failure:
	ret = -EMSGSIZE;

out_free:
	nlmsg_free(msg);
	return ret;
}

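/* Remove every entry belonging to @net, or, when @net is NULL, every
 * entry whose netns is already dead (used from the pernet exit path).
 */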
static void tcp_metrics_flush_all(struct net *net)
{
	unsigned int max_rows = 1U << tcp_metrics_hash_log;
	struct tcpm_hash_bucket *hb = tcp_metrics_hash;
	struct tcp_metrics_block *tm;
	unsigned int row;

	for (row = 0; row < max_rows; row++, hb++) {
		struct tcp_metrics_block __rcu **pp = &hb->chain;
		bool match;

		if (!rcu_access_pointer(*pp))
			continue;

		spin_lock_bh(&tcp_metrics_lock);
		for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
			match = net ? net_eq(tm_net(tm), net) :
				!refcount_read(&tm_net(tm)->ns.count);
			if (match) {
				rcu_assign_pointer(*pp, tm->tcpm_next);
				kfree_rcu(tm, rcu_head);
			} else {
				pp = &tm->tcpm_next;
			}
		}
		spin_unlock_bh(&tcp_metrics_lock);
		cond_resched();
	}
}

static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
	struct tcpm_hash_bucket *hb;
	struct tcp_metrics_block *tm;
	struct tcp_metrics_block __rcu **pp;
	struct inetpeer_addr saddr, daddr;
	unsigned int hash;
	struct net *net = genl_info_net(info);
	int ret;
	bool src = true, found = false;

	ret = parse_nl_addr(info, &daddr, &hash, 1);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		tcp_metrics_flush_all(net);
		return 0;
	}
	ret = parse_nl_saddr(info, &saddr);
	if (ret < 0)
		src = false;

	hash ^= net_hash_mix(net);
	hash = hash_32(hash, tcp_metrics_hash_log);
	hb = tcp_metrics_hash + hash;
	pp = &hb->chain;
	spin_lock_bh(&tcp_metrics_lock);
	for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
		if (addr_same(&tm->tcpm_daddr, &daddr) &&
		    (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
		    net_eq(tm_net(tm), net)) {
			rcu_assign_pointer(*pp, tm->tcpm_next);
			kfree_rcu(tm, rcu_head);
			found = true;
		} else {
			pp = &tm->tcpm_next;
		}
	}
	spin_unlock_bh(&tcp_metrics_lock);
	if (!found)
		return -ESRCH;
	return 0;
}

static const struct genl_small_ops tcp_metrics_nl_ops[] = {
	{
		.cmd = TCP_METRICS_CMD_GET,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = tcp_metrics_nl_cmd_get,
		.dumpit = tcp_metrics_nl_dump,
	},
	{
		.cmd = TCP_METRICS_CMD_DEL,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = tcp_metrics_nl_cmd_del,
		.flags = GENL_ADMIN_PERM,
	},
};

static struct genl_family tcp_metrics_nl_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= TCP_METRICS_GENL_NAME,
	.version	= TCP_METRICS_GENL_VERSION,
	.maxattr	= TCP_METRICS_ATTR_MAX,
	.policy = tcp_metrics_nl_policy,
	.netnsok	= true,
	.parallel_ops	= true,
	.module		= THIS_MODULE,
	.small_ops	= tcp_metrics_nl_ops,
	.n_small_ops	= ARRAY_SIZE(tcp_metrics_nl_ops),
	.resv_start_op	= TCP_METRICS_CMD_DEL + 1,
};

static unsigned int tcpmhash_entries __initdata;
static int __init set_tcpmhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtouint(str, 0, &tcpmhash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);

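/* Boot-time allocation of the global hash table; the size comes from the
 * "tcpmhash_entries=" parameter or is scaled from available memory.
 */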
static void __init tcp_metrics_hash_alloc(void)
{
	unsigned int slots = tcpmhash_entries;
	size_t size;

	if (!slots) {
		if (totalram_pages() >= 128 * 1024)
			slots = 16 * 1024;
		else
			slots = 8 * 1024;
	}

	tcp_metrics_hash_log = order_base_2(slots);
	size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;

	tcp_metrics_hash = kvzalloc(size, GFP_KERNEL);
	if (!tcp_metrics_hash)
		panic("Could not allocate the tcp_metrics hash table\n");
}

static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
{
	tcp_metrics_flush_all(NULL);
}

static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
	.exit_batch	=	tcp_net_metrics_exit_batch,
};

void __init tcp_metrics_init(void)
{
	int ret;

	tcp_metrics_hash_alloc();

	ret = register_pernet_subsys(&tcp_net_metrics_ops);
	if (ret < 0)
		panic("Could not register tcp_net_metrics_ops\n");

	ret = genl_register_family(&tcp_metrics_nl_family);
	if (ret < 0)
		panic("Could not register tcp_metrics generic netlink\n");
}