xref: /freebsd/sys/netinet/tcp_hostcache.c (revision 99c2ce7ef12f0852f25155d1d6718beccafbae0e)
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The tcp_hostcache moves the tcp-specific cached metrics from the routing
 * table to a dedicated structure indexed by the remote IP address.  It keeps
 * information on the measured TCP parameters of past TCP sessions to allow
 * better initial start values to be used with later connections to/from the
 * same source.  Depending on the network parameters (delay, max MTU,
 * congestion window) between local and remote sites, this can lead to
 * significant speed-ups for new TCP connections after the first one.
 *
 * Due to the tcp_hostcache, all TCP-specific metrics information in the
 * routing table has been removed.  The inpcb no longer keeps a pointer to
 * the routing entry, and protocol-initiated route cloning has been removed
 * as well.  With these changes, the routing table has gone back to being
 * more lightweight and only carries information related to packet forwarding.
 *
 * tcp_hostcache is designed for multiple concurrent access in SMP
 * environments and high contention.  Each bucket row has its own lock and
 * thus multiple lookups and modifications can be done at the same time as
 * long as they are in different bucket rows.  If a request for insertion of
 * a new record can't be satisfied, it simply returns an empty structure.
 * Nobody and nothing outside of tcp_hostcache.c will ever point directly to
 * any entry in the tcp_hostcache.  All communication is done in an
 * object-oriented way and only functions of tcp_hostcache will manipulate
 * hostcache entries.  Otherwise, we are unable to achieve good behaviour in
 * concurrent access situations.  Since tcp_hostcache is only caching
 * information, there are no fatal consequences if we either can't satisfy
 * any particular request or have to drop/overwrite an existing entry because
 * of bucket limit memory constraints.
 */
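
/*
 * A minimal usage sketch (hypothetical caller, compiled out): consumers
 * read cached metrics through hc_metrics_lite before setting up a new
 * connection and feed measured values back when tearing one down.  The
 * function name below is illustrative only; nothing outside this file
 * ever touches hc_metrics entries directly.
 */
#if 0
static void
example_hostcache_usage(struct in_conninfo *inc)
{
	struct hc_metrics_lite metrics;

	/* Read: all-zero fields mean "no cached value known". */
	tcp_hc_get(inc, &metrics);
	if (metrics.rmx_mtu != 0)
		; /* seed the MSS calculation from the cached path MTU */

	/*
	 * Write back only the fields that were actually measured;
	 * tcp_hc_update() ignores zero-valued sample fields.
	 */
	bzero(&metrics, sizeof(metrics));
	metrics.rmx_ssthresh = 14600;	/* hypothetical measured value */
	tcp_hc_update(inc, &metrics);
}
#endif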

/*
 * Many thanks to jlemon for the basic structure of tcp_syncache, which is
 * being followed here.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>

#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>

#include <vm/uma.h>

TAILQ_HEAD(hc_qhead, hc_metrics);

struct hc_head {
	struct hc_qhead	hch_bucket;
	u_int		hch_length;
	struct mtx	hch_mtx;
};

struct hc_metrics {
	/* housekeeping */
	TAILQ_ENTRY(hc_metrics) rmx_q;
	struct		hc_head *rmx_head; /* head of bucket tail queue */
	struct		in_addr ip4;	/* IP address */
	struct		in6_addr ip6;	/* IP6 address */
	uint32_t	ip6_zoneid;	/* IPv6 scope zone id */
	/* endpoint specific values for tcp */
	uint32_t	rmx_mtu;	/* MTU for this path */
	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
	uint32_t	rmx_rtt;	/* estimated round trip time */
	uint32_t	rmx_rttvar;	/* estimated rtt variance */
	uint32_t	rmx_cwnd;	/* congestion window */
	uint32_t	rmx_sendpipe;	/* outbound delay-bandwidth product */
	uint32_t	rmx_recvpipe;	/* inbound delay-bandwidth product */
	/* TCP hostcache internal data */
	int		rmx_expire;	/* lifetime for object */
#ifdef	TCP_HC_COUNTERS
	u_long		rmx_hits;	/* number of hits */
	u_long		rmx_updates;	/* number of updates */
#endif
};

struct tcp_hostcache {
	struct hc_head	*hashbase;
	uma_zone_t	zone;
	u_int		hashsize;
	u_int		hashmask;
	u_int		hashsalt;
	u_int		bucket_limit;
	u_int		cache_count;
	u_int		cache_limit;
	int		expire;
	int		prune;
	int		purgeall;
};

/* Arbitrary values */
#define TCP_HOSTCACHE_HASHSIZE		512
#define TCP_HOSTCACHE_BUCKETLIMIT	30
#define TCP_HOSTCACHE_EXPIRE		60*60	/* one hour */
#define TCP_HOSTCACHE_PRUNE		5*60	/* every 5 minutes */

VNET_DEFINE_STATIC(struct tcp_hostcache, tcp_hostcache);
#define	V_tcp_hostcache		VNET(tcp_hostcache)

VNET_DEFINE_STATIC(struct callout, tcp_hc_callout);
#define	V_tcp_hc_callout	VNET(tcp_hc_callout)

static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *, bool);
static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
static int sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS);
static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS);
static void tcp_hc_purge_internal(int);
static void tcp_hc_purge(void *);

static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Host cache");

VNET_DEFINE(int, tcp_use_hostcache) = 1;
#define V_tcp_use_hostcache  VNET(tcp_use_hostcache)
SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_use_hostcache), 0,
    "Enable the TCP hostcache");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_hostcache.cache_limit), 0,
    "Overall entry limit for hostcache");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_hostcache.hashsize), 0,
    "Size of TCP hostcache hashtable");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
    "Per-bucket hash limit for hostcache");

SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD,
    &VNET_NAME(tcp_hostcache.cache_count), 0,
    "Current number of entries in hostcache");

SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_hostcache.expire), 0,
    "Expire time of TCP hostcache entries");

SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_hostcache.prune), 0,
    "Time between purge runs");

SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_hostcache.purgeall), 0,
    "Expire all entries on next purge run");

SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
    0, 0, sysctl_tcp_hc_list, "A",
    "List of all hostcache entries");

SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, histo,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
    0, 0, sysctl_tcp_hc_histo, "A",
    "Print a histogram of hostcache hashbucket utilization");

SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, purgenow,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_tcp_hc_purgenow, "I",
    "Immediately purge all entries");

static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");

/* Use jenkins_hash32(), as in other parts of the tcp stack */
#define HOSTCACHE_HASH(ip) \
	(jenkins_hash32((uint32_t *)(ip), 1, V_tcp_hostcache.hashsalt) & \
	 V_tcp_hostcache.hashmask)

#define HOSTCACHE_HASH6(ip6)				\
	(jenkins_hash32((uint32_t *)&((ip6)->s6_addr32[0]), 4, \
	 V_tcp_hostcache.hashsalt) & \
	 V_tcp_hostcache.hashmask)
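
/*
 * Note that masking with hashmask implements the modulo by hashsize and
 * is only correct because tcp_hc_init() forces hashsize to a power of
 * two; e.g. the default hashsize of 512 yields hashmask 0x1ff, so the
 * low nine bits of the Jenkins hash select the bucket row.
 */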

#define THC_LOCK(lp)		mtx_lock(lp)
#define THC_UNLOCK(lp)		mtx_unlock(lp)

void
tcp_hc_init(void)
{
	u_int cache_limit;
	int i;

	/*
	 * Initialize hostcache structures.
	 */
	atomic_store_int(&V_tcp_hostcache.cache_count, 0);
	V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
	V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
	V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
	V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
	V_tcp_hostcache.hashsalt = arc4random();

	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
	    &V_tcp_hostcache.hashsize);
	if (!powerof2(V_tcp_hostcache.hashsize)) {
		printf("WARNING: hostcache hash size is not a power of 2.\n");
		V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
	}
	V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;

	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
	    &V_tcp_hostcache.bucket_limit);

	cache_limit = V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
	V_tcp_hostcache.cache_limit = cache_limit;
	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
	    &V_tcp_hostcache.cache_limit);
	if (V_tcp_hostcache.cache_limit > cache_limit)
		V_tcp_hostcache.cache_limit = cache_limit;
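
	/*
	 * With the defaults above this works out to 512 buckets times 30
	 * entries each, i.e. an overall limit of 15360 entries; a smaller
	 * tunable value is honored, but the limit can never exceed
	 * hashsize * bucket_limit.
	 */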

	/*
	 * Allocate the hash table.
	 */
	V_tcp_hostcache.hashbase = (struct hc_head *)
	    malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
		   M_HOSTCACHE, M_WAITOK | M_ZERO);

	/*
	 * Initialize the hash buckets.
	 */
	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
		V_tcp_hostcache.hashbase[i].hch_length = 0;
		mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
			  NULL, MTX_DEF);
	}

	/*
	 * Allocate the hostcache entries.
	 */
	V_tcp_hostcache.zone =
	    uma_zcreate("hostcache", sizeof(struct hc_metrics),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);

	/*
	 * Set up periodic cache cleanup.
	 */
	callout_init(&V_tcp_hc_callout, 1);
	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
	    tcp_hc_purge, curvnet);
}

#ifdef VIMAGE
void
tcp_hc_destroy(void)
{
	int i;

	callout_drain(&V_tcp_hc_callout);

	/* Purge all hc entries. */
	tcp_hc_purge_internal(1);

	/* Free the uma zone and the allocated hash table. */
	uma_zdestroy(V_tcp_hostcache.zone);

	for (i = 0; i < V_tcp_hostcache.hashsize; i++)
		mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx);
	free(V_tcp_hostcache.hashbase, M_HOSTCACHE);
}
#endif

/*
 * Internal function: look up an entry in the hostcache or return NULL.
 *
 * If an entry has been returned, the caller becomes responsible for
 * unlocking the bucket row after reading or modifying the entry.
 */
static struct hc_metrics *
tcp_hc_lookup(struct in_conninfo *inc, bool update)
{
	int hash;
	struct hc_head *hc_head;
	struct hc_metrics *hc_entry;

	if (!V_tcp_use_hostcache)
		return (NULL);

	KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));

	/*
	 * Hash the foreign ip address.
	 */
	if (inc->inc_flags & INC_ISIPV6)
		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
	else
		hash = HOSTCACHE_HASH(&inc->inc_faddr);

	hc_head = &V_tcp_hostcache.hashbase[hash];

	/*
	 * Acquire lock for this bucket row; we release the lock if we don't
	 * find an entry, otherwise the caller has to unlock when done.
	 */
	THC_LOCK(&hc_head->hch_mtx);

	/*
	 * Iterate through entries in bucket row looking for a match.
	 */
	TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
		if (inc->inc_flags & INC_ISIPV6) {
			/* XXX: check ip6_zoneid */
			if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
			    sizeof(inc->inc6_faddr)) == 0)
				goto found;
		} else {
			if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
			    sizeof(inc->inc_faddr)) == 0)
				goto found;
		}
	}

	/*
	 * No match was found.
	 */
	THC_UNLOCK(&hc_head->hch_mtx);
	return (NULL);

found:
#ifdef	TCP_HC_COUNTERS
	if (update)
		hc_entry->rmx_updates++;
	else
		hc_entry->rmx_hits++;
#endif
	hc_entry->rmx_expire = V_tcp_hostcache.expire;

	return (hc_entry);
}

/*
 * Internal function: insert an entry into the hostcache or return NULL if
 * unable to allocate a new one.
 *
 * If an entry has been returned, the caller becomes responsible for
 * unlocking the bucket row after reading or modifying the entry.
 */
static struct hc_metrics *
tcp_hc_insert(struct in_conninfo *inc)
{
	int hash;
	struct hc_head *hc_head;
	struct hc_metrics *hc_entry;

	if (!V_tcp_use_hostcache)
		return (NULL);

	KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));

	/*
	 * Hash the foreign ip address.
	 */
	if (inc->inc_flags & INC_ISIPV6)
		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
	else
		hash = HOSTCACHE_HASH(&inc->inc_faddr);

	hc_head = &V_tcp_hostcache.hashbase[hash];

	/*
	 * Acquire lock for this bucket row; we release the lock if we don't
	 * find an entry, otherwise the caller has to unlock when done.
	 */
	THC_LOCK(&hc_head->hch_mtx);

	/*
	 * If the bucket limit is reached, reuse the least-used element.
	 */
	if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
	    atomic_load_int(&V_tcp_hostcache.cache_count) >=
	    V_tcp_hostcache.cache_limit) {
		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
		/*
		 * At first we were dropping the last element, just to
		 * reacquire it in the next two lines again, which isn't very
		 * efficient.  Instead just reuse the least used element.
		 * We may drop something that is still "in-use" but we can be
		 * "lossy".
		 * Just give up if this bucket row is empty and we don't have
		 * anything to replace.
		 */
		if (hc_entry == NULL) {
			THC_UNLOCK(&hc_head->hch_mtx);
			return (NULL);
		}
		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
		KASSERT(V_tcp_hostcache.hashbase[hash].hch_length > 0 &&
			V_tcp_hostcache.hashbase[hash].hch_length <=
			V_tcp_hostcache.bucket_limit,
			("tcp_hostcache: bucket length range violated at %u: %u",
			hash, V_tcp_hostcache.hashbase[hash].hch_length));
		V_tcp_hostcache.hashbase[hash].hch_length--;
		atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
		TCPSTAT_INC(tcps_hc_bucketoverflow);
#if 0
		uma_zfree(V_tcp_hostcache.zone, hc_entry);
#endif
	} else {
		/*
		 * Allocate a new entry, or balk if not possible.
		 */
		hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
		if (hc_entry == NULL) {
			THC_UNLOCK(&hc_head->hch_mtx);
			return (NULL);
		}
	}

	/*
	 * Initialize basic information of hostcache entry.
	 */
	bzero(hc_entry, sizeof(*hc_entry));
	if (inc->inc_flags & INC_ISIPV6) {
		hc_entry->ip6 = inc->inc6_faddr;
		hc_entry->ip6_zoneid = inc->inc6_zoneid;
	} else
		hc_entry->ip4 = inc->inc_faddr;
	hc_entry->rmx_head = hc_head;
	hc_entry->rmx_expire = V_tcp_hostcache.expire;

	/*
	 * Put it upfront.
	 */
	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
	V_tcp_hostcache.hashbase[hash].hch_length++;
	KASSERT(V_tcp_hostcache.hashbase[hash].hch_length <=
		V_tcp_hostcache.bucket_limit,
		("tcp_hostcache: bucket length too high at %u: %u",
		hash, V_tcp_hostcache.hashbase[hash].hch_length));
	atomic_add_int(&V_tcp_hostcache.cache_count, 1);
	TCPSTAT_INC(tcps_hc_added);

	return (hc_entry);
}

/*
 * External function: look up an entry in the hostcache and fill out the
 * supplied TCP metrics structure.  The structure is zeroed when no entry
 * is found or a value is not set.
 */
void
tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
{
	struct hc_metrics *hc_entry;

	if (!V_tcp_use_hostcache) {
		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
		return;
	}

	/*
	 * Find the right bucket.
	 */
	hc_entry = tcp_hc_lookup(inc, false);

	/*
	 * If we don't have an existing object, return zeroed metrics.
	 */
	if (hc_entry == NULL) {
		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
		return;
	}

	hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
	hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
	hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
	hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
	hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
	hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
	hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;

	/*
	 * Unlock bucket row.
	 */
	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}

/*
 * External function: look up an entry in the hostcache and return the
 * discovered path MTU.  Returns 0 if no entry is found or the value is
 * not set.
 */
uint32_t
tcp_hc_getmtu(struct in_conninfo *inc)
{
	struct hc_metrics *hc_entry;
	uint32_t mtu;

	if (!V_tcp_use_hostcache)
		return (0);

	hc_entry = tcp_hc_lookup(inc, false);
	if (hc_entry == NULL) {
		return (0);
	}

	mtu = hc_entry->rmx_mtu;
	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
	return (mtu);
}

/*
 * External function: update the MTU value of an entry in the hostcache.
 * Creates a new entry if none was found.
 */
void
tcp_hc_updatemtu(struct in_conninfo *inc, uint32_t mtu)
{
	struct hc_metrics *hc_entry;

	if (!V_tcp_use_hostcache)
		return;

	/*
	 * Find the right bucket.
	 */
	hc_entry = tcp_hc_lookup(inc, true);

	/*
	 * If we don't have an existing object, try to insert a new one.
	 */
	if (hc_entry == NULL) {
		hc_entry = tcp_hc_insert(inc);
		if (hc_entry == NULL)
			return;
	}

	hc_entry->rmx_mtu = mtu;

	/*
	 * Put it upfront so we find it faster next time.
	 */
	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);

	/*
	 * Unlock bucket row.
	 */
	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
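
/*
 * A minimal sketch (hypothetical caller, compiled out) of how the two MTU
 * functions cooperate with path MTU discovery: a handler for an ICMP
 * "fragmentation needed" message would store the reported MTU here, and
 * the next connection to the same destination would pick it up.
 */
#if 0
static void
example_pmtu_flow(struct in_conninfo *inc, uint32_t icmp_mtu)
{
	uint32_t mtu;

	/* Remember the shrunken path MTU reported by the router. */
	tcp_hc_updatemtu(inc, icmp_mtu);

	/* Later, when a new connection to the same host is set up: */
	mtu = tcp_hc_getmtu(inc);
	if (mtu != 0)
		; /* clamp the connection's MSS according to this MTU */
}
#endif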

/*
 * External function: update the TCP metrics of an entry in the hostcache.
 * Creates a new entry if none was found.
 */
void
tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
{
	struct hc_metrics *hc_entry;

	if (!V_tcp_use_hostcache)
		return;

	hc_entry = tcp_hc_lookup(inc, true);
	if (hc_entry == NULL) {
		hc_entry = tcp_hc_insert(inc);
		if (hc_entry == NULL)
			return;
	}

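	/*
	 * Each nonzero sample below is folded into the cached value as
	 * new = (cached + sample) / 2, an exponentially weighted moving
	 * average with weight 1/2; zero-valued fields in hcml leave the
	 * cached metric untouched.
	 */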
	if (hcml->rmx_rtt != 0) {
		if (hc_entry->rmx_rtt == 0)
			hc_entry->rmx_rtt = hcml->rmx_rtt;
		else
			hc_entry->rmx_rtt = ((uint64_t)hc_entry->rmx_rtt +
			    (uint64_t)hcml->rmx_rtt) / 2;
		TCPSTAT_INC(tcps_cachedrtt);
	}
	if (hcml->rmx_rttvar != 0) {
		if (hc_entry->rmx_rttvar == 0)
			hc_entry->rmx_rttvar = hcml->rmx_rttvar;
		else
			hc_entry->rmx_rttvar = ((uint64_t)hc_entry->rmx_rttvar +
			    (uint64_t)hcml->rmx_rttvar) / 2;
		TCPSTAT_INC(tcps_cachedrttvar);
	}
	if (hcml->rmx_ssthresh != 0) {
		if (hc_entry->rmx_ssthresh == 0)
			hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
		else
			hc_entry->rmx_ssthresh =
			    (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
		TCPSTAT_INC(tcps_cachedssthresh);
	}
	if (hcml->rmx_cwnd != 0) {
		if (hc_entry->rmx_cwnd == 0)
			hc_entry->rmx_cwnd = hcml->rmx_cwnd;
		else
			hc_entry->rmx_cwnd = ((uint64_t)hc_entry->rmx_cwnd +
			    (uint64_t)hcml->rmx_cwnd) / 2;
		/* TCPSTAT_INC(tcps_cachedcwnd); */
	}
	if (hcml->rmx_sendpipe != 0) {
		if (hc_entry->rmx_sendpipe == 0)
			hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
		else
			hc_entry->rmx_sendpipe =
			    ((uint64_t)hc_entry->rmx_sendpipe +
			    (uint64_t)hcml->rmx_sendpipe) / 2;
		/* TCPSTAT_INC(tcps_cachedsendpipe); */
	}
	if (hcml->rmx_recvpipe != 0) {
		if (hc_entry->rmx_recvpipe == 0)
			hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
		else
			hc_entry->rmx_recvpipe =
			    ((uint64_t)hc_entry->rmx_recvpipe +
			    (uint64_t)hcml->rmx_recvpipe) / 2;
		/* TCPSTAT_INC(tcps_cachedrecvpipe); */
	}

	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}

/*
 * Sysctl function: prints the list and values of all hostcache entries in
 * unsorted order.
 */
static int
sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
{
	const int linesize = 128;
	struct sbuf sb;
	int i, error, len;
	struct hc_metrics *hc_entry;
	char ip4buf[INET_ADDRSTRLEN];
#ifdef INET6
	char ip6buf[INET6_ADDRSTRLEN];
#endif

	if (jailed_without_vnet(curthread->td_ucred) != 0)
		return (EPERM);

	/* Optimize buffer length query by sbin/sysctl */
	if (req->oldptr == NULL) {
		len = (atomic_load_int(&V_tcp_hostcache.cache_count) + 1) *
			linesize;
		return (SYSCTL_OUT(req, NULL, len));
	}

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0) {
		return (error);
	}

	/* Use a buffer sized for one full bucket */
	sbuf_new_for_sysctl(&sb, NULL, V_tcp_hostcache.bucket_limit *
		linesize, req);

	sbuf_printf(&sb,
		"\nIP address        MTU SSTHRESH      RTT   RTTVAR "
		"    CWND SENDPIPE RECVPIPE "
#ifdef	TCP_HC_COUNTERS
		"HITS  UPD  "
#endif
		"EXP\n");
	sbuf_drain(&sb);

#define msec(u) (((u) + 500) / 1000)
	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
		TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
			      rmx_q) {
			sbuf_printf(&sb,
			    "%-15s %5u %8u %6lums %6lums %8u %8u %8u "
#ifdef	TCP_HC_COUNTERS
			    "%4lu %4lu "
#endif
			    "%4i\n",
			    hc_entry->ip4.s_addr ?
			        inet_ntoa_r(hc_entry->ip4, ip4buf) :
#ifdef INET6
				ip6_sprintf(ip6buf, &hc_entry->ip6),
#else
				"IPv6?",
#endif
			    hc_entry->rmx_mtu,
			    hc_entry->rmx_ssthresh,
			    msec((u_long)hc_entry->rmx_rtt *
				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
			    msec((u_long)hc_entry->rmx_rttvar *
				(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))),
			    hc_entry->rmx_cwnd,
			    hc_entry->rmx_sendpipe,
			    hc_entry->rmx_recvpipe,
#ifdef	TCP_HC_COUNTERS
			    hc_entry->rmx_hits,
			    hc_entry->rmx_updates,
#endif
			    hc_entry->rmx_expire);
		}
		THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
		sbuf_drain(&sb);
	}
#undef msec
	error = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error);
}

/*
 * Sysctl function: prints a histogram of the hostcache hashbucket
 * utilization.
 */
static int
sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS)
{
	const int linesize = 50;
	struct sbuf sb;
	int i, error;
	int *histo;
	u_int hch_length;

	if (jailed_without_vnet(curthread->td_ucred) != 0)
		return (EPERM);

	histo = (int *)malloc(sizeof(int) * (V_tcp_hostcache.bucket_limit + 1),
			M_TEMP, M_NOWAIT | M_ZERO);
	if (histo == NULL)
		return (ENOMEM);

	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		hch_length = V_tcp_hostcache.hashbase[i].hch_length;
		KASSERT(hch_length <= V_tcp_hostcache.bucket_limit,
			("tcp_hostcache: bucket limit exceeded at %u: %u",
			i, hch_length));
		histo[hch_length]++;
	}

	/* Use a buffer for 16 lines */
	sbuf_new_for_sysctl(&sb, NULL, 16 * linesize, req);

	sbuf_printf(&sb, "\nLength\tCount\n");
	for (i = 0; i <= V_tcp_hostcache.bucket_limit; i++) {
		sbuf_printf(&sb, "%u\t%u\n", i, histo[i]);
	}
	error = sbuf_finish(&sb);
	sbuf_delete(&sb);
	free(histo, M_TEMP);
	return (error);
}

/*
 * Walk all buckets and either free expired entries (or all entries when
 * 'all' is set) or decrease each entry's remaining lifetime by the prune
 * interval.  Caller has to make sure the curvnet is set properly.
 */
static void
tcp_hc_purge_internal(int all)
{
	struct hc_metrics *hc_entry, *hc_next;
	int i;

	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
		THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
		TAILQ_FOREACH_SAFE(hc_entry,
		    &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
			KASSERT(V_tcp_hostcache.hashbase[i].hch_length > 0 &&
				V_tcp_hostcache.hashbase[i].hch_length <=
				V_tcp_hostcache.bucket_limit,
				("tcp_hostcache: bucket length out of range at %u: %u",
				i, V_tcp_hostcache.hashbase[i].hch_length));
			if (all || hc_entry->rmx_expire <= 0) {
				TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket,
					      hc_entry, rmx_q);
				uma_zfree(V_tcp_hostcache.zone, hc_entry);
				V_tcp_hostcache.hashbase[i].hch_length--;
				atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
			} else
				hc_entry->rmx_expire -= V_tcp_hostcache.prune;
		}
		THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
	}
}

/*
 * Expire and purge (old|all) entries in the tcp_hostcache.  Runs
 * periodically from the callout.
 */
static void
tcp_hc_purge(void *arg)
{
	CURVNET_SET((struct vnet *) arg);
	int all = 0;

	if (V_tcp_hostcache.purgeall) {
		if (V_tcp_hostcache.purgeall == 2)
			V_tcp_hostcache.hashsalt = arc4random();
		all = 1;
		V_tcp_hostcache.purgeall = 0;
	}

	tcp_hc_purge_internal(all);

	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
	    tcp_hc_purge, arg);
	CURVNET_RESTORE();
}
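
/*
 * Administrative note (a sketch of the user-visible knobs defined above):
 * setting net.inet.tcp.hostcache.purge to 1 expires all entries on the
 * next periodic run, while 2 additionally re-seeds the hash salt; the
 * purgenow sysctl handled below does the same immediately, e.g.
 *	# sysctl net.inet.tcp.hostcache.purgenow=1
 */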

/*
 * Expire and purge all entries in hostcache immediately.
 */
static int
sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	if (val == 2)
		V_tcp_hostcache.hashsalt = arc4random();
	tcp_hc_purge_internal(1);

	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
	    tcp_hc_purge, curvnet);

	return (0);
}
875