/*-
 * Copyright (c) 2015
 *	Jonathan Looney.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

#define M_LEADINGSPACE_NOWRITE(m) \
	((m)->m_data - M_START(m))

int tcp_pcap_aggressive_free = 1;
static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
    CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
    "Free saved packets when the memory system comes under pressure");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
    CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
    "Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
    CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
    "Maximum number of clusters allowed to be referenced on TCP PCAP "
    "queues");

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
    CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
    "Number of mbufs with external storage reused for the TCP PCAP "
    "functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
    CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
    "Number of mbufs with internal storage reused for the TCP PCAP "
    "functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
    CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
    "Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define V_tcp_pcap_packets	VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
    CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
    "Default number of packets saved per direction per TCPCB");
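
/*
 * Note on the knobs above: tcp_pcap_clusters_referenced_max caps how many
 * external clusters the capture queues may reference at once and defaults
 * to one quarter of nmbclusters (see tcp_pcap_max_set() below), while
 * tcp_pcap_packets sets the default per-direction queue depth used when a
 * tcpcb is initialized.
 */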

/* Initialize the values. */
static void
tcp_pcap_max_set(void)
{

	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
}

void
tcp_pcap_init(void)
{

	tcp_pcap_max_set();
	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
	    NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
		tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}

/*
 * For all the external entries in m, apply the given adjustment.
 * This can be used to adjust the counter when an mbuf chain is
 * copied or freed.
 */
static __inline void
tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
{
	while (m) {
		if (m->m_flags & M_EXT)
			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);

		m = m->m_next;
	}
}

/*
 * Free all mbufs in a chain, decrementing the reference count as
 * necessary.
 *
 * Functions in this file should use this instead of m_freem() when
 * they are freeing mbuf chains that may contain clusters that were
 * already included in tcp_pcap_clusters_referenced_cur.
 */
static void
tcp_pcap_m_freem(struct mbuf *mb)
{
	while (mb != NULL) {
		if (mb->m_flags & M_EXT)
			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
			    1);
		mb = m_free(mb);
	}
}

/*
 * Copy data from m to n, where n cannot fit all the data we might
 * want from m.
 *
 * Prioritize data like this:
 * 1. TCP header
 * 2. IP header
 * 3. Data
 */
static void
tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
{
	struct mbuf *m_cur = m;
	int bytes_to_copy=0, trailing_data, skip=0, tcp_off;

	/* Below, we assume these will be non-NULL. */
	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(n, ("%s: called with n == NULL", __func__));

	/* We assume this initialization occurred elsewhere. */
	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
	    __func__, n->m_len));
	KASSERT(n->m_data == M_START(n),
	    ("%s: called with n->m_data != M_START(n)", __func__));

	/*
	 * Calculate the size of the TCP header. We use this often
	 * enough that it is worth just calculating at the start.
	 */
	tcp_off = th->th_off << 2;

	/* Trim off leading empty mbufs. */
	while (m && m->m_len == 0)
		m = m->m_next;

	if (m) {
		m_cur = m;
	}
	else {
		/*
		 * No data? Highly unusual. We would expect to at
		 * least see a TCP header in the mbuf.
		 * As we have a pointer to the TCP header, I guess
		 * we should just copy that. (???)
		 */
fallback:
		bytes_to_copy = tcp_off;
		if (bytes_to_copy > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		bcopy(th, n->m_data, bytes_to_copy);
		n->m_len = bytes_to_copy;
		return;
	}
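
	/*
	 * Worked example (sizes are illustrative, not taken from this
	 * file): with a 20-byte IP header ahead of th and a 32-byte TCP
	 * header, the scan below leaves bytes_to_copy at 52.  If that
	 * already exceeds M_SIZE(n), we skip leading bytes so the copy
	 * ends with the TCP header; otherwise we append as much of the
	 * trailing payload as still fits.
	 */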

	/*
	 * Find TCP header. Record the total number of bytes up to,
	 * and including, the TCP header.
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
		    (caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	}
	else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}
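
/*
 * Save a copy of the segment whose TCP header is at th, taken from the
 * mbuf chain m, on the given queue.  When the queue is full, the oldest
 * entry is recycled; clusters are referenced rather than copied whenever
 * the reference budget allows.
 */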
void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we already have a new mbuf (n), then we can
	 * simply free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * freeing process.
			 */
			if (mhead->m_flags & M_EXTPG) {
				/* Don't mess around with these. */
				tcp_pcap_m_freem(mhead);
				continue;
			} else if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
						mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
					 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			} else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & (M_EXT|M_EXTPG)) &&
	    tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	}
	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
		    ("%s: Unexpected flags (%#x) for mbuf",
		    __func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		if (m->m_flags & M_EXTPG)
			m_copydata(m, 0, m->m_len, n->m_data);
		else
			bcopy(M_START(m), n->m_dat,
			    m->m_len + M_LEADINGSPACE_NOWRITE(m));
	}
	else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to reach this code
		 * path is that we've already taken references to the
		 * maximum number of mbuf clusters we can, and the data
		 * is too long to fit in an mbuf's internal storage.
		 * Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}

void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *m;
	while ((m = mbufq_dequeue(queue)))
		tcp_pcap_m_freem(m);
}

void
tcp_pcap_tcpcb_init(struct tcpcb *tp)
{
	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
}

void
tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
{
	queue->mq_maxlen = newval;
	while (queue->mq_len > queue->mq_maxlen)
		tcp_pcap_m_freem(mbufq_dequeue(queue));
}

int
tcp_pcap_get_sock_max(struct mbufq *queue)
{
	return queue->mq_maxlen;
}
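
/*
 * Illustrative wiring (the callers live outside this file and are noted
 * here only as a usage sketch): tcp_pcap_init() is expected to run once
 * at TCP stack initialization, tcp_pcap_tcpcb_init() when a tcpcb is
 * created, tcp_pcap_add() for each segment sent or received on a
 * connection with capture enabled, and tcp_pcap_drain() when the tcpcb
 * is torn down.  Userland typically sizes the per-connection queues via
 * the TCP_PCAP_OUT/TCP_PCAP_IN socket options, roughly:
 *
 *	int depth = 32;
 *	setsockopt(s, IPPROTO_TCP, TCP_PCAP_OUT, &depth, sizeof(depth));
 *
 * which reaches tcp_pcap_set_sock_max()/tcp_pcap_get_sock_max() above.
 */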