/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The tcp_hostcache moves the tcp-specific cached metrics from the routing
 * table to a dedicated structure indexed by the remote IP address.  It keeps
 * information on the measured TCP parameters of past TCP sessions to allow
 * better initial start values to be used with later connections to/from the
 * same source.  Depending on the network parameters (delay, max MTU,
 * congestion window) between local and remote sites, this can lead to
 * significant speed-ups for new TCP connections after the first one.
 *
 * Due to the tcp_hostcache, all TCP-specific metrics information in the
 * routing table has been removed.  The inpcb no longer keeps a pointer to
 * the routing entry, and protocol-initiated route cloning has been removed
 * as well.  With these changes, the routing table has gone back to being
 * more lightweight and only carries information related to packet forwarding.
 *
 * tcp_hostcache is designed for multiple concurrent access in SMP
 * environments and high contention.  All bucket rows have their own lock and
 * thus multiple lookups and modifications can be done at the same time as
 * long as they are in different bucket rows.  If a request for insertion of
 * a new record can't be satisfied, it simply returns an empty structure.
 * Nobody and nothing outside of tcp_hostcache.c will ever point directly to
 * any entry in the tcp_hostcache.  All communication is done in an
 * object-oriented way and only functions of tcp_hostcache will manipulate
 * hostcache entries.  Otherwise, we are unable to achieve good behaviour in
 * concurrent access situations.  Since tcp_hostcache is only caching
 * information, there are no fatal consequences if we either can't satisfy
 * any particular request or have to drop/overwrite an existing entry because
 * of bucket limit memory constraints.
 */
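/*
 * A minimal sketch of how a consumer uses the external API (hypothetical
 * caller, not part of this file; "inc" stands for the connection's
 * struct in_conninfo):
 *
 *	struct hc_metrics_lite hcml;
 *
 *	tcp_hc_get(&inc, &hcml);
 *	if (hcml.rmx_mtu != 0)
 *		... seed the new connection with the cached path MTU ...
 *
 * Measured values are fed back, typically at connection teardown, with
 * tcp_hc_update(&inc, &hcml).
 */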

/*
 * Many thanks to jlemon for the basic structure of tcp_syncache, which is
 * followed here.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>

#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>

#include <vm/uma.h>

TAILQ_HEAD(hc_qhead, hc_metrics);

struct hc_head {
	struct hc_qhead	hch_bucket;
	u_int		hch_length;
	struct mtx	hch_mtx;
};

struct hc_metrics {
	/* housekeeping */
	TAILQ_ENTRY(hc_metrics) rmx_q;
	struct		hc_head *rmx_head; /* head of bucket tail queue */
	struct		in_addr ip4;	/* IP address */
	struct		in6_addr ip6;	/* IP6 address */
	uint32_t	ip6_zoneid;	/* IPv6 scope zone id */
	/* endpoint specific values for tcp */
	uint32_t	rmx_mtu;	/* MTU for this path */
	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
	uint32_t	rmx_rtt;	/* estimated round trip time */
	uint32_t	rmx_rttvar;	/* estimated rtt variance */
	uint32_t	rmx_cwnd;	/* congestion window */
	uint32_t	rmx_sendpipe;	/* outbound delay-bandwidth product */
	uint32_t	rmx_recvpipe;	/* inbound delay-bandwidth product */
	/* TCP hostcache internal data */
	int		rmx_expire;	/* lifetime for object */
#ifdef	TCP_HC_COUNTERS
	u_long		rmx_hits;	/* number of hits */
	u_long		rmx_updates;	/* number of updates */
#endif
};

struct tcp_hostcache {
	struct hc_head	*hashbase;
	uma_zone_t	zone;
	u_int		hashsize;
	u_int		hashmask;
	u_int		hashsalt;
	u_int		bucket_limit;
	u_int		cache_count;
	u_int		cache_limit;
	int		expire;
	int		prune;
	int		purgeall;
};

/* Arbitrary values */
#define TCP_HOSTCACHE_HASHSIZE		512
#define TCP_HOSTCACHE_BUCKETLIMIT	30
#define TCP_HOSTCACHE_EXPIRE		60*60	/* one hour */
#define TCP_HOSTCACHE_PRUNE		5*60	/* every 5 minutes */
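/*
 * With the defaults above, tcp_hc_init() derives an overall limit of
 * hashsize * bucket_limit = 512 * 30 = 15360 entries.
 */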

VNET_DEFINE_STATIC(struct tcp_hostcache, tcp_hostcache);
#define	V_tcp_hostcache		VNET(tcp_hostcache)

VNET_DEFINE_STATIC(struct callout, tcp_hc_callout);
#define	V_tcp_hc_callout	VNET(tcp_hc_callout)

static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *, bool);
static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
static int sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS);
static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS);
static void tcp_hc_purge_internal(int);
static void tcp_hc_purge(void *);

static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Host cache");

VNET_DEFINE(int, tcp_use_hostcache) = 1;
#define V_tcp_use_hostcache  VNET(tcp_use_hostcache)
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_use_hostcache), 0,
    "Enable the TCP hostcache");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_hostcache.cache_limit), 0,
    "Overall entry limit for hostcache");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_hostcache.hashsize), 0,
    "Size of TCP hostcache hashtable");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
    "Per-bucket hash limit for hostcache");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD,
    &VNET_NAME(tcp_hostcache.cache_count), 0,
    "Current number of entries in hostcache");

SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_hostcache.expire), 0,
    "Expire time of TCP hostcache entries");

SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_hostcache.prune), 0,
    "Time between purge runs");

SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_hostcache.purgeall), 0,
    "Expire all entries on next purge run");

SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
    0, 0, sysctl_tcp_hc_list, "A",
    "List of all hostcache entries");

SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, histo,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
    0, 0, sysctl_tcp_hc_histo, "A",
    "Print a histogram of hostcache hashbucket utilization");

SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, purgenow,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_tcp_hc_purgenow, "I",
    "Immediately purge all entries");

static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");

/* Use jenkins_hash32(), as in other parts of the tcp stack */
#define HOSTCACHE_HASH(ip) \
	(jenkins_hash32((uint32_t *)(ip), 1, V_tcp_hostcache.hashsalt) & \
	 V_tcp_hostcache.hashmask)

#define HOSTCACHE_HASH6(ip6)				\
	(jenkins_hash32((uint32_t *)&((ip6)->s6_addr32[0]), 4, \
	 V_tcp_hostcache.hashsalt) & \
	 V_tcp_hostcache.hashmask)
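/*
 * Example: with the default hashsize of 512, hashmask is 511 (0x1ff), so
 * the salted 32-bit Jenkins hash of the foreign address is masked down to
 * a bucket index in the range [0, 511].
 */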

#define THC_LOCK(h)		mtx_lock(&(h)->hch_mtx)
#define THC_UNLOCK(h)		mtx_unlock(&(h)->hch_mtx)

void
tcp_hc_init(void)
{
	u_int cache_limit;
	int i;

	/*
	 * Initialize hostcache structures.
	 */
	atomic_store_int(&V_tcp_hostcache.cache_count, 0);
	V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
	V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
	V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
	V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
	V_tcp_hostcache.hashsalt = arc4random();

	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
	    &V_tcp_hostcache.hashsize);
	if (!powerof2(V_tcp_hostcache.hashsize)) {
		printf("WARNING: hostcache hash size is not a power of 2.\n");
		V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
	}
	V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;

	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
	    &V_tcp_hostcache.bucket_limit);

	cache_limit = V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
	V_tcp_hostcache.cache_limit = cache_limit;
	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
	    &V_tcp_hostcache.cache_limit);
	if (V_tcp_hostcache.cache_limit > cache_limit)
		V_tcp_hostcache.cache_limit = cache_limit;

	/*
	 * Allocate the hash table.
	 */
	V_tcp_hostcache.hashbase = (struct hc_head *)
	    malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
		   M_HOSTCACHE, M_WAITOK | M_ZERO);

	/*
	 * Initialize the hash buckets.
	 */
	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
		V_tcp_hostcache.hashbase[i].hch_length = 0;
		mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
			  NULL, MTX_DEF);
	}

	/*
	 * Allocate the hostcache entries.
	 */
	V_tcp_hostcache.zone =
	    uma_zcreate("hostcache", sizeof(struct hc_metrics),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);

	/*
	 * Set up periodic cache cleanup.
	 */
	callout_init(&V_tcp_hc_callout, 1);
	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
	    tcp_hc_purge, curvnet);
}

#ifdef VIMAGE
void
tcp_hc_destroy(void)
{
	int i;

	callout_drain(&V_tcp_hc_callout);

	/* Purge all hc entries. */
	tcp_hc_purge_internal(1);

	/* Free the uma zone and the allocated hash table. */
	uma_zdestroy(V_tcp_hostcache.zone);

	for (i = 0; i < V_tcp_hostcache.hashsize; i++)
		mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx);
	free(V_tcp_hostcache.hashbase, M_HOSTCACHE);
}
#endif

/*
 * Internal function: look up an entry in the hostcache or return NULL.
 *
 * If an entry has been returned, the caller becomes responsible for
 * unlocking the bucket row after it is done reading or modifying the entry.
 */
static struct hc_metrics *
tcp_hc_lookup(struct in_conninfo *inc, bool update)
{
	int hash;
	struct hc_head *hc_head;
	struct hc_metrics *hc_entry;

	KASSERT(inc != NULL, ("%s: NULL in_conninfo", __func__));

	/*
	 * Hash the foreign ip address.
	 */
	if (inc->inc_flags & INC_ISIPV6)
		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
	else
		hash = HOSTCACHE_HASH(&inc->inc_faddr);

	hc_head = &V_tcp_hostcache.hashbase[hash];

	/*
	 * Acquire lock for this bucket row; we release the lock if we don't
	 * find an entry, otherwise the caller has to unlock after it is
	 * done.
	 */
	THC_LOCK(hc_head);

	/*
	 * Iterate through entries in bucket row looking for a match.
	 */
	TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
		if (inc->inc_flags & INC_ISIPV6) {
			/* XXX: check ip6_zoneid */
			if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
			    sizeof(inc->inc6_faddr)) == 0)
				goto found;
		} else {
			if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
			    sizeof(inc->inc_faddr)) == 0)
				goto found;
		}
	}

	/*
	 * We were unsuccessful and didn't find anything.
	 */
	THC_UNLOCK(hc_head);
	return (NULL);

found:
#ifdef	TCP_HC_COUNTERS
	if (update)
		hc_entry->rmx_updates++;
	else
		hc_entry->rmx_hits++;
#endif
	hc_entry->rmx_expire = V_tcp_hostcache.expire;

	return (hc_entry);
}
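/*
 * A minimal sketch of the locking contract for internal callers (this is
 * exactly the pattern tcp_hc_get() and tcp_hc_getmtu() follow below):
 *
 *	hc_entry = tcp_hc_lookup(inc, false);
 *	if (hc_entry != NULL) {
 *		... read or modify the entry ...
 *		THC_UNLOCK(hc_entry->rmx_head);
 *	}
 */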

/*
 * Internal function: insert an entry into the hostcache or return NULL if
 * unable to allocate a new one.
 *
 * If an entry has been returned, the caller becomes responsible for
 * unlocking the bucket row after it is done reading or modifying the entry.
 */
static struct hc_metrics *
tcp_hc_insert(struct in_conninfo *inc)
{
	int hash;
	struct hc_head *hc_head;
	struct hc_metrics *hc_entry;

	KASSERT(inc != NULL, ("%s: NULL in_conninfo", __func__));

	/*
	 * Hash the foreign ip address.
	 */
	if (inc->inc_flags & INC_ISIPV6)
		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
	else
		hash = HOSTCACHE_HASH(&inc->inc_faddr);

	hc_head = &V_tcp_hostcache.hashbase[hash];

	/*
	 * Acquire lock for this bucket row; we release the lock if we don't
	 * find an entry, otherwise the caller has to unlock after it is
	 * done.
	 */
	THC_LOCK(hc_head);

	/*
	 * If the bucket limit is reached, reuse the least-used element.
	 */
	if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
	    atomic_load_int(&V_tcp_hostcache.cache_count) >= V_tcp_hostcache.cache_limit) {
		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
		/*
		 * At first we were dropping the last element, just to
		 * reacquire it in the next two lines again, which isn't very
		 * efficient.  Instead just reuse the least used element.
		 * We may drop something that is still "in-use" but we can be
		 * "lossy".
		 * Just give up if this bucket row is empty and we don't have
		 * anything to replace.
		 */
		if (hc_entry == NULL) {
			THC_UNLOCK(hc_head);
			return (NULL);
		}
		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
		KASSERT(V_tcp_hostcache.hashbase[hash].hch_length > 0 &&
		    V_tcp_hostcache.hashbase[hash].hch_length <=
		    V_tcp_hostcache.bucket_limit,
		    ("tcp_hostcache: bucket length range violated at %u: %u",
		    hash, V_tcp_hostcache.hashbase[hash].hch_length));
		V_tcp_hostcache.hashbase[hash].hch_length--;
		atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
		TCPSTAT_INC(tcps_hc_bucketoverflow);
#if 0
		uma_zfree(V_tcp_hostcache.zone, hc_entry);
#endif
	} else {
		/*
		 * Allocate a new entry, or balk if not possible.
		 */
		hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
		if (hc_entry == NULL) {
			THC_UNLOCK(hc_head);
			return (NULL);
		}
	}

	/*
	 * Initialize basic information of hostcache entry.
	 */
	bzero(hc_entry, sizeof(*hc_entry));
	if (inc->inc_flags & INC_ISIPV6) {
		hc_entry->ip6 = inc->inc6_faddr;
		hc_entry->ip6_zoneid = inc->inc6_zoneid;
	} else
		hc_entry->ip4 = inc->inc_faddr;
	hc_entry->rmx_head = hc_head;
	hc_entry->rmx_expire = V_tcp_hostcache.expire;

	/*
	 * Put it upfront.  A full bucket (one that was just trimmed back to
	 * bucket_limit - 1 above) legitimately reaches bucket_limit again
	 * here, so the assertion must allow equality.
	 */
	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
	V_tcp_hostcache.hashbase[hash].hch_length++;
	KASSERT(V_tcp_hostcache.hashbase[hash].hch_length <=
	    V_tcp_hostcache.bucket_limit,
	    ("tcp_hostcache: bucket length too high at %u: %u",
	    hash, V_tcp_hostcache.hashbase[hash].hch_length));
	atomic_add_int(&V_tcp_hostcache.cache_count, 1);
	TCPSTAT_INC(tcps_hc_added);

	return (hc_entry);
}

/*
 * External function: look up an entry in the hostcache and fill out the
 * supplied TCP metrics structure.  The structure is zeroed when no entry
 * is found; individual fields are zero when the value is not cached.
 */
void
tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
{
	struct hc_metrics *hc_entry;

	if (!V_tcp_use_hostcache) {
		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
		return;
	}

	/*
	 * Find the right bucket.
	 */
	hc_entry = tcp_hc_lookup(inc, false);

	/*
	 * If we don't have an existing object.
	 */
	if (hc_entry == NULL) {
		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
		return;
	}

	hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
	hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
	hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
	hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
	hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
	hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
	hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;

	/*
	 * Unlock bucket row.
	 */
	THC_UNLOCK(hc_entry->rmx_head);
}

/*
 * External function: look up an entry in the hostcache and return the
 * discovered path MTU.  Returns 0 if no entry is found or the value is
 * not set.
 */
uint32_t
tcp_hc_getmtu(struct in_conninfo *inc)
{
	struct hc_metrics *hc_entry;
	uint32_t mtu;

	if (!V_tcp_use_hostcache)
		return (0);

	hc_entry = tcp_hc_lookup(inc, false);
	if (hc_entry == NULL) {
		return (0);
	}

	mtu = hc_entry->rmx_mtu;
	THC_UNLOCK(hc_entry->rmx_head);
	return (mtu);
}

/*
 * External function: update the MTU value of an entry in the hostcache.
 * Creates a new entry if none was found.
 */
void
tcp_hc_updatemtu(struct in_conninfo *inc, uint32_t mtu)
{
	struct hc_metrics_lite hcml = { .rmx_mtu = mtu };

	tcp_hc_update(inc, &hcml);
}

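/*
 * A hypothetical caller that has just learned a smaller path MTU, e.g.
 * from an ICMP fragmentation-needed indication, could record it with:
 *
 *	tcp_hc_updatemtu(&inc, (uint32_t)mtu);
 */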
/*
 * External function: update the TCP metrics of an entry in the hostcache.
 * Creates a new entry if none was found.
 */
void
tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
{
	struct hc_metrics *hc_entry;

	if (!V_tcp_use_hostcache)
		return;

	hc_entry = tcp_hc_lookup(inc, true);
	if (hc_entry == NULL) {
		hc_entry = tcp_hc_insert(inc);
		if (hc_entry == NULL)
			return;
	}

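	/*
	 * Each metric below is merged into the cached value with a simple
	 * (old + new) / 2 filter.  For example, a cached RTT of 100 and a
	 * new sample of 60 yield an updated value of (100 + 60) / 2 = 80.
	 */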
	if (hcml->rmx_mtu != 0) {
		hc_entry->rmx_mtu = hcml->rmx_mtu;
	}
	if (hcml->rmx_rtt != 0) {
		if (hc_entry->rmx_rtt == 0)
			hc_entry->rmx_rtt = hcml->rmx_rtt;
		else
			hc_entry->rmx_rtt = ((uint64_t)hc_entry->rmx_rtt +
			    (uint64_t)hcml->rmx_rtt) / 2;
		TCPSTAT_INC(tcps_cachedrtt);
	}
	if (hcml->rmx_rttvar != 0) {
		if (hc_entry->rmx_rttvar == 0)
			hc_entry->rmx_rttvar = hcml->rmx_rttvar;
		else
			hc_entry->rmx_rttvar = ((uint64_t)hc_entry->rmx_rttvar +
			    (uint64_t)hcml->rmx_rttvar) / 2;
		TCPSTAT_INC(tcps_cachedrttvar);
	}
	if (hcml->rmx_ssthresh != 0) {
		if (hc_entry->rmx_ssthresh == 0)
			hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
		else
			hc_entry->rmx_ssthresh =
			    (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
		TCPSTAT_INC(tcps_cachedssthresh);
	}
	if (hcml->rmx_cwnd != 0) {
		if (hc_entry->rmx_cwnd == 0)
			hc_entry->rmx_cwnd = hcml->rmx_cwnd;
		else
			hc_entry->rmx_cwnd = ((uint64_t)hc_entry->rmx_cwnd +
			    (uint64_t)hcml->rmx_cwnd) / 2;
		/* TCPSTAT_INC(tcps_cachedcwnd); */
	}
	if (hcml->rmx_sendpipe != 0) {
		if (hc_entry->rmx_sendpipe == 0)
			hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
		else
			hc_entry->rmx_sendpipe =
			    ((uint64_t)hc_entry->rmx_sendpipe +
			    (uint64_t)hcml->rmx_sendpipe) / 2;
		/* TCPSTAT_INC(tcps_cachedsendpipe); */
	}
	if (hcml->rmx_recvpipe != 0) {
		if (hc_entry->rmx_recvpipe == 0)
			hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
		else
			hc_entry->rmx_recvpipe =
			    ((uint64_t)hc_entry->rmx_recvpipe +
			    (uint64_t)hcml->rmx_recvpipe) / 2;
		/* TCPSTAT_INC(tcps_cachedrecvpipe); */
	}

	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
	THC_UNLOCK(hc_entry->rmx_head);
}

/*
 * Sysctl function: prints the list and values of all hostcache entries in
 * unsorted order.
 */
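/*
 * Usage note: from userland this list is retrieved with
 * "sysctl net.inet.tcp.hostcache.list".
 */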
static int
sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
{
	const int linesize = 128;
	struct sbuf sb;
	int i, error, len;
	struct hc_metrics *hc_entry;
	char ip4buf[INET_ADDRSTRLEN];
#ifdef INET6
	char ip6buf[INET6_ADDRSTRLEN];
#endif

	if (jailed_without_vnet(curthread->td_ucred) != 0)
		return (EPERM);

	/* Optimize the buffer length query from sbin/sysctl. */
	if (req->oldptr == NULL) {
		len = (atomic_load_int(&V_tcp_hostcache.cache_count) + 1) *
			linesize;
		return (SYSCTL_OUT(req, NULL, len));
	}

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0) {
		return (error);
	}

	/* Use a buffer sized for one full bucket */
	sbuf_new_for_sysctl(&sb, NULL, V_tcp_hostcache.bucket_limit *
		linesize, req);

	sbuf_printf(&sb,
		"\nIP address        MTU SSTHRESH      RTT   RTTVAR "
		"    CWND SENDPIPE RECVPIPE "
#ifdef	TCP_HC_COUNTERS
		"HITS  UPD  "
#endif
		"EXP\n");
	sbuf_drain(&sb);

#define msec(u) (((u) + 500) / 1000)
	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		THC_LOCK(&V_tcp_hostcache.hashbase[i]);
		TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
		    rmx_q) {
			sbuf_printf(&sb,
			    "%-15s %5u %8u %6lums %6lums %8u %8u %8u "
#ifdef	TCP_HC_COUNTERS
			    "%4lu %4lu "
#endif
			    "%4i\n",
			    hc_entry->ip4.s_addr ?
			        inet_ntoa_r(hc_entry->ip4, ip4buf) :
#ifdef INET6
				ip6_sprintf(ip6buf, &hc_entry->ip6),
#else
				"IPv6?",
#endif
			    hc_entry->rmx_mtu,
			    hc_entry->rmx_ssthresh,
			    msec((u_long)hc_entry->rmx_rtt *
				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
			    msec((u_long)hc_entry->rmx_rttvar *
				(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))),
			    hc_entry->rmx_cwnd,
			    hc_entry->rmx_sendpipe,
			    hc_entry->rmx_recvpipe,
#ifdef	TCP_HC_COUNTERS
			    hc_entry->rmx_hits,
			    hc_entry->rmx_updates,
#endif
			    hc_entry->rmx_expire);
		}
		THC_UNLOCK(&V_tcp_hostcache.hashbase[i]);
		sbuf_drain(&sb);
	}
#undef msec
	error = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error);
}

/*
 * Sysctl function: prints a histogram of the hostcache hashbucket
 * utilization.
 */
static int
sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS)
{
	const int linesize = 50;
	struct sbuf sb;
	int i, error;
	int *histo;
	u_int hch_length;

	if (jailed_without_vnet(curthread->td_ucred) != 0)
		return (EPERM);

	histo = (int *)malloc(sizeof(int) * (V_tcp_hostcache.bucket_limit + 1),
			M_TEMP, M_NOWAIT | M_ZERO);
	if (histo == NULL)
		return (ENOMEM);

	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		hch_length = V_tcp_hostcache.hashbase[i].hch_length;
		KASSERT(hch_length <= V_tcp_hostcache.bucket_limit,
		    ("tcp_hostcache: bucket limit exceeded at %u: %u",
		    i, hch_length));
		histo[hch_length]++;
	}

	/* Use a buffer for 16 lines */
	sbuf_new_for_sysctl(&sb, NULL, 16 * linesize, req);

	sbuf_printf(&sb, "\nLength\tCount\n");
	for (i = 0; i <= V_tcp_hostcache.bucket_limit; i++) {
		sbuf_printf(&sb, "%u\t%u\n", i, histo[i]);
	}
	error = sbuf_finish(&sb);
	sbuf_delete(&sb);
	free(histo, M_TEMP);
	return (error);
}

/*
 * Caller has to make sure the curvnet is set properly.
 */
static void
tcp_hc_purge_internal(int all)
{
	struct hc_metrics *hc_entry, *hc_next;
	int i;

	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		THC_LOCK(&V_tcp_hostcache.hashbase[i]);
		TAILQ_FOREACH_SAFE(hc_entry,
		    &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
			KASSERT(V_tcp_hostcache.hashbase[i].hch_length > 0 &&
			    V_tcp_hostcache.hashbase[i].hch_length <=
			    V_tcp_hostcache.bucket_limit, ("tcp_hostcache: "
			    "bucket length out of range at %u: %u",
			    i, V_tcp_hostcache.hashbase[i].hch_length));
			if (all || hc_entry->rmx_expire <= 0) {
				TAILQ_REMOVE(
				    &V_tcp_hostcache.hashbase[i].hch_bucket,
				    hc_entry, rmx_q);
				uma_zfree(V_tcp_hostcache.zone, hc_entry);
				V_tcp_hostcache.hashbase[i].hch_length--;
				atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
			} else
				hc_entry->rmx_expire -= V_tcp_hostcache.prune;
		}
		THC_UNLOCK(&V_tcp_hostcache.hashbase[i]);
	}
}

/*
 * Expire and purge (old|all) entries in the tcp_hostcache.  Runs
 * periodically from the callout.
 */
static void
tcp_hc_purge(void *arg)
{
	CURVNET_SET((struct vnet *) arg);
	int all = 0;

	if (V_tcp_hostcache.purgeall) {
		if (V_tcp_hostcache.purgeall == 2)
			V_tcp_hostcache.hashsalt = arc4random();
		all = 1;
		V_tcp_hostcache.purgeall = 0;
	}

	tcp_hc_purge_internal(all);

	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
	    tcp_hc_purge, arg);
	CURVNET_RESTORE();
}

/*
 * Expire and purge all entries in the hostcache immediately.
 */
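/*
 * Usage note: "sysctl net.inet.tcp.hostcache.purgenow=1" flushes the cache
 * immediately; a value of 2 additionally rekeys the hash salt first.
 */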
static int
sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	if (val == 2)
		V_tcp_hostcache.hashsalt = arc4random();
	tcp_hc_purge_internal(1);

	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
	    tcp_hc_purge, curvnet);

	return (0);
}
845