xref: /freebsd/sys/netinet/tcp_pcap.c (revision 2e3507c25e42292b45a5482e116d278f5515d04d)
1 /*-
2  * Copyright (c) 2015
3  *	Jonathan Looney. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/queue.h>
28 #include <sys/param.h>
29 #include <sys/types.h>
30 #include <sys/socket.h>
31 #include <sys/socketvar.h>
32 #include <sys/sysctl.h>
33 #include <sys/systm.h>
34 #include <sys/mbuf.h>
35 #include <sys/eventhandler.h>
36 #include <machine/atomic.h>
37 #include <netinet/in.h>
38 #include <netinet/in_pcb.h>
39 #include <netinet/tcp_var.h>
40 #include <netinet/tcp_pcap.h>
41 
/*
 * Number of bytes between the start of an mbuf's data area (M_START)
 * and its current data pointer (m_data).
 */
#define	M_LEADINGSPACE_NOWRITE(m)	((m)->m_data - M_START(m))
44 
45 int tcp_pcap_aggressive_free = 1;
46 static int tcp_pcap_clusters_referenced_cur = 0;
47 static int tcp_pcap_clusters_referenced_max = 0;
48 
49 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
50 	CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
51 	"Free saved packets when the memory system comes under pressure");
52 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
53 	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
54 	"Number of clusters currently referenced on TCP PCAP queues");
55 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
56 	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
57 	"Maximum number of clusters allowed to be referenced on TCP PCAP "
58 	"queues");
59 
60 static int tcp_pcap_alloc_reuse_ext = 0;
61 static int tcp_pcap_alloc_reuse_mbuf = 0;
62 static int tcp_pcap_alloc_new_mbuf = 0;
63 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
64 	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
65 	"Number of mbufs with external storage reused for the TCP PCAP "
66 	"functionality");
67 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
68 	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
69 	"Number of mbufs with internal storage reused for the TCP PCAP "
70 	"functionality");
71 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
72 	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
73 	"Number of new mbufs allocated for the TCP PCAP functionality");
74 
75 VNET_DEFINE(int, tcp_pcap_packets) = 0;
76 #define V_tcp_pcap_packets	VNET(tcp_pcap_packets)
77 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
78 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
79 	"Default number of packets saved per direction per TCPCB");
80 
81 /* Initialize the values. */
82 static void
83 tcp_pcap_max_set(void)
84 {
85 
86 	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
87 }
88 
89 void
90 tcp_pcap_init(void)
91 {
92 
93 	tcp_pcap_max_set();
94 	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
95 		NULL, EVENTHANDLER_PRI_ANY);
96 }
97 
98 /*
99  * If we are below the maximum allowed cluster references,
100  * increment the reference count and return TRUE. Otherwise,
101  * leave the reference count alone and return FALSE.
102  */
103 static __inline bool
104 tcp_pcap_take_cluster_reference(void)
105 {
106 	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
107 		tcp_pcap_clusters_referenced_max) {
108 		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
109 		return FALSE;
110 	}
111 	return TRUE;
112 }
113 
114 /*
115  * For all the external entries in m, apply the given adjustment.
116  * This can be used to adjust the counter when an mbuf chain is
117  * copied or freed.
118  */
119 static __inline void
120 tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
121 {
122 	while (m) {
123 		if (m->m_flags & M_EXT)
124 			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);
125 
126 		m = m->m_next;
127 	}
128 }
129 
130 /*
131  * Free all mbufs in a chain, decrementing the reference count as
132  * necessary.
133  *
134  * Functions in this file should use this instead of m_freem() when
135  * they are freeing mbuf chains that may contain clusters that were
136  * already included in tcp_pcap_clusters_referenced_cur.
137  */
138 static void
139 tcp_pcap_m_freem(struct mbuf *mb)
140 {
141 	while (mb != NULL) {
142 		if (mb->m_flags & M_EXT)
143 			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
144 			    1);
145 		mb = m_free(mb);
146 	}
147 }
148 
149 /*
150  * Copy data from m to n, where n cannot fit all the data we might
151  * want from m.
152  *
153  * Prioritize data like this:
154  * 1. TCP header
155  * 2. IP header
156  * 3. Data
157  */
158 static void
159 tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
160 {
161 	struct mbuf *m_cur = m;
162 	int bytes_to_copy=0, trailing_data, skip=0, tcp_off;
163 
164 	/* Below, we assume these will be non-NULL. */
165 	KASSERT(th, ("%s: called with th == NULL", __func__));
166 	KASSERT(m, ("%s: called with m == NULL", __func__));
167 	KASSERT(n, ("%s: called with n == NULL", __func__));
168 
169 	/* We assume this initialization occurred elsewhere. */
170 	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
171 		__func__, n->m_len));
172 	KASSERT(n->m_data == M_START(n),
173 		("%s: called with n->m_data != M_START(n)", __func__));
174 
175 	/*
176 	 * Calculate the size of the TCP header. We use this often
177 	 * enough that it is worth just calculating at the start.
178 	 */
179 	tcp_off = th->th_off << 2;
180 
181 	/* Trim off leading empty mbufs. */
182 	while (m && m->m_len == 0)
183 		m = m->m_next;
184 
185 	if (m) {
186 		m_cur = m;
187 	}
188 	else {
189 		/*
190 		 * No data? Highly unusual. We would expect to at
191 		 * least see a TCP header in the mbuf.
192 		 * As we have a pointer to the TCP header, I guess
193 		 * we should just copy that. (???)
194 		 */
195 fallback:
196 		bytes_to_copy = tcp_off;
197 		if (bytes_to_copy > M_SIZE(n))
198 			bytes_to_copy = M_SIZE(n);
199 		bcopy(th, n->m_data, bytes_to_copy);
200 		n->m_len = bytes_to_copy;
201 		return;
202 	}
203 
204 	/*
205 	 * Find TCP header. Record the total number of bytes up to,
206 	 * and including, the TCP header.
207 	 */
208 	while (m_cur) {
209 		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
210 			(caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
211 			break;
212 		bytes_to_copy += m_cur->m_len;
213 		m_cur = m_cur->m_next;
214 	}
215 	if (m_cur)
216 		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
217 	else
218 		goto fallback;
219 	bytes_to_copy += tcp_off;
220 
221 	/*
222 	 * If we already want to copy more bytes than we can hold
223 	 * in the destination mbuf, skip leading bytes and copy
224 	 * what we can.
225 	 *
226 	 * Otherwise, consider trailing data.
227 	 */
228 	if (bytes_to_copy > M_SIZE(n)) {
229 		skip  = bytes_to_copy - M_SIZE(n);
230 		bytes_to_copy = M_SIZE(n);
231 	}
232 	else {
233 		/*
234 		 * Determine how much trailing data is in the chain.
235 		 * We start with the length of this mbuf (the one
236 		 * containing th) and subtract the size of the TCP
237 		 * header (tcp_off) and the size of the data prior
238 		 * to th (th - m_cur->m_data).
239 		 *
240 		 * This *should not* be negative, as the TCP code
241 		 * should put the whole TCP header in a single
242 		 * mbuf. But, it isn't a problem if it is. We will
243 		 * simple work off our negative balance as we look
244 		 * at subsequent mbufs.
245 		 */
246 		trailing_data = m_cur->m_len - tcp_off;
247 		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
248 		m_cur = m_cur->m_next;
249 		while (m_cur) {
250 			trailing_data += m_cur->m_len;
251 			m_cur = m_cur->m_next;
252 		}
253 		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
254 			bytes_to_copy = M_SIZE(n);
255 		else
256 			bytes_to_copy += trailing_data;
257 	}
258 
259 	m_copydata(m, skip, bytes_to_copy, n->m_data);
260 	n->m_len = bytes_to_copy;
261 }
262 
263 void
264 tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
265 {
266 	struct mbuf *n = NULL, *mhead;
267 
268 	KASSERT(th, ("%s: called with th == NULL", __func__));
269 	KASSERT(m, ("%s: called with m == NULL", __func__));
270 	KASSERT(queue, ("%s: called with queue == NULL", __func__));
271 
272 	/* We only care about data packets. */
273 	while (m && m->m_type != MT_DATA)
274 		m = m->m_next;
275 
276 	/* We only need to do something if we still have an mbuf. */
277 	if (!m)
278 		return;
279 
280 	/* If we are not saving mbufs, return now. */
281 	if (queue->mq_maxlen == 0)
282 		return;
283 
284 	/*
285 	 * Check to see if we will need to recycle mbufs.
286 	 *
287 	 * If we need to get rid of mbufs to stay below
288 	 * our packet count, try to reuse the mbuf. Once
289 	 * we already have a new mbuf (n), then we can
290 	 * simply free subsequent mbufs.
291 	 *
292 	 * Note that most of the logic in here is to deal
293 	 * with the reuse. If we are fine with constant
294 	 * mbuf allocs/deallocs, we could ditch this logic.
295 	 * But, it only seems to make sense to reuse
296 	 * mbufs we already have.
297 	 */
298 	while (mbufq_full(queue)) {
299 		mhead = mbufq_dequeue(queue);
300 
301 		if (n) {
302 			tcp_pcap_m_freem(mhead);
303 		}
304 		else {
305 			/*
306 			 * If this held an external cluster, try to
307 			 * detach the cluster. But, if we held the
308 			 * last reference, go through the normal
309 			 * free-ing process.
310 			 */
311 			if (mhead->m_flags & M_EXTPG) {
312 				/* Don't mess around with these. */
313 				tcp_pcap_m_freem(mhead);
314 				continue;
315 			} else if (mhead->m_flags & M_EXT) {
316 				switch (mhead->m_ext.ext_type) {
317 				case EXT_SFBUF:
318 					/* Don't mess around with these. */
319 					tcp_pcap_m_freem(mhead);
320 					continue;
321 				default:
322 					if (atomic_fetchadd_int(
323 						mhead->m_ext.ext_cnt, -1) == 1)
324 					{
325 						/*
326 						 * We held the last reference
327 						 * on this cluster. Restore
328 						 * the reference count and put
329 						 * it back in the pool.
330 				 		 */
331 						*(mhead->m_ext.ext_cnt) = 1;
332 						tcp_pcap_m_freem(mhead);
333 						continue;
334 					}
335 					/*
336 					 * We were able to cleanly free the
337 					 * reference.
338 				 	 */
339 					atomic_subtract_int(
340 					    &tcp_pcap_clusters_referenced_cur,
341 					    1);
342 					tcp_pcap_alloc_reuse_ext++;
343 					break;
344 				}
345 			} else {
346 				tcp_pcap_alloc_reuse_mbuf++;
347 			}
348 
349 			n = mhead;
350 			tcp_pcap_m_freem(n->m_next);
351 			m_init(n, M_NOWAIT, MT_DATA, 0);
352 		}
353 	}
354 
355 	/* Check to see if we need to get a new mbuf. */
356 	if (!n) {
357 		if (!(n = m_get(M_NOWAIT, MT_DATA)))
358 			return;
359 		tcp_pcap_alloc_new_mbuf++;
360 	}
361 
362 	/*
363 	 * What are we dealing with? If a cluster, attach it. Otherwise,
364 	 * try to copy the data from the beginning of the mbuf to the
365 	 * end of data. (There may be data between the start of the data
366 	 * area and the current data pointer. We want to get this, because
367 	 * it may contain header information that is useful.)
368 	 * In cases where that isn't possible, settle for what we can
369 	 * get.
370 	 */
371 	if ((m->m_flags & (M_EXT | M_EXTPG)) &&
372 	    tcp_pcap_take_cluster_reference()) {
373 		n->m_data = m->m_data;
374 		n->m_len = m->m_len;
375 		mb_dupcl(n, m);
376 	}
377 	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
378 		/*
379 		 * At this point, n is guaranteed to be a normal mbuf
380 		 * with no cluster and no packet header. Because the
381 		 * logic in this code block requires this, the assert
382 		 * is here to catch any instances where someone
383 		 * changes the logic to invalidate that assumption.
384 		 */
385 		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
386 			("%s: Unexpected flags (%#x) for mbuf",
387 			__func__, n->m_flags));
388 		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
389 		n->m_len = m->m_len;
390 		if (m->m_flags & M_EXTPG)
391 			m_copydata(m, 0, m->m_len, n->m_data);
392 		else
393 			bcopy(M_START(m), n->m_dat,
394 			    m->m_len + M_LEADINGSPACE_NOWRITE(m));
395 	}
396 	else {
397 		/*
398 		 * This is the case where we need to "settle for what
399 		 * we can get". The most probable way to this code
400 		 * path is that we've already taken references to the
401 		 * maximum number of mbuf clusters we can, and the data
402 		 * is too long to fit in an mbuf's internal storage.
403 		 * Try for a "best fit".
404 		 */
405 		tcp_pcap_copy_bestfit(th, m, n);
406 
407 		/* Don't try to get additional data. */
408 		goto add_to_queue;
409 	}
410 
411 	if (m->m_next) {
412 		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
413 		tcp_pcap_adj_cluster_reference(n->m_next, 1);
414 	}
415 
416 add_to_queue:
417 	/* Add the new mbuf to the list. */
418 	if (mbufq_enqueue(queue, n)) {
419 		/* This shouldn't happen. If INVARIANTS is defined, panic. */
420 		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
421 		tcp_pcap_m_freem(n);
422 	}
423 }
424 
/*
 * Empty the queue, freeing every saved packet and dropping the cluster
 * references that were counted for them.
 */
void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *pkt;

	for (pkt = mbufq_dequeue(queue); pkt != NULL;
	    pkt = mbufq_dequeue(queue))
		tcp_pcap_m_freem(pkt);
}
432 
433 void
434 tcp_pcap_tcpcb_init(struct tcpcb *tp)
435 {
436 	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
437 	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
438 }
439 
440 void
441 tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
442 {
443 	queue->mq_maxlen = newval;
444 	while (queue->mq_len > queue->mq_maxlen)
445 		tcp_pcap_m_freem(mbufq_dequeue(queue));
446 }
447 
448 int
449 tcp_pcap_get_sock_max(struct mbufq *queue)
450 {
451 	return queue->mq_maxlen;
452 }
453