/*-
 * Copyright (c) 2015
 *	Jonathan Looney. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

#define	M_LEADINGSPACE_NOWRITE(m)					\
	((m)->m_data - M_START(m))

int tcp_pcap_aggressive_free = 1;
static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
	CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
	"Free saved packets when the memory system comes under pressure");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
	"Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
	"Maximum number of clusters allowed to be referenced on TCP PCAP "
	"queues");

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
	"Number of mbufs with external storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
	"Number of mbufs with internal storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
	"Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define	V_tcp_pcap_packets	VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
"Default number of packets saved per direction per TCPCB"); 82 83 /* Initialize the values. */ 84 static void 85 tcp_pcap_max_set(void) 86 { 87 88 tcp_pcap_clusters_referenced_max = nmbclusters / 4; 89 } 90 91 void 92 tcp_pcap_init(void) 93 { 94 95 tcp_pcap_max_set(); 96 EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set, 97 NULL, EVENTHANDLER_PRI_ANY); 98 } 99 100 /* 101 * If we are below the maximum allowed cluster references, 102 * increment the reference count and return TRUE. Otherwise, 103 * leave the reference count alone and return FALSE. 104 */ 105 static __inline bool 106 tcp_pcap_take_cluster_reference(void) 107 { 108 if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >= 109 tcp_pcap_clusters_referenced_max) { 110 atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1); 111 return FALSE; 112 } 113 return TRUE; 114 } 115 116 /* 117 * For all the external entries in m, apply the given adjustment. 118 * This can be used to adjust the counter when an mbuf chain is 119 * copied or freed. 120 */ 121 static __inline void 122 tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj) 123 { 124 while (m) { 125 if (m->m_flags & M_EXT) 126 atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj); 127 128 m = m->m_next; 129 } 130 } 131 132 /* 133 * Free all mbufs in a chain, decrementing the reference count as 134 * necessary. 135 * 136 * Functions in this file should use this instead of m_freem() when 137 * they are freeing mbuf chains that may contain clusters that were 138 * already included in tcp_pcap_clusters_referenced_cur. 139 */ 140 static void 141 tcp_pcap_m_freem(struct mbuf *mb) 142 { 143 while (mb != NULL) { 144 if (mb->m_flags & M_EXT) 145 atomic_subtract_int(&tcp_pcap_clusters_referenced_cur, 146 1); 147 mb = m_free(mb); 148 } 149 } 150 151 /* 152 * Copy data from m to n, where n cannot fit all the data we might 153 * want from m. 154 * 155 * Prioritize data like this: 156 * 1. TCP header 157 * 2. IP header 158 * 3. Data 159 */ 160 static void 161 tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n) 162 { 163 struct mbuf *m_cur = m; 164 int bytes_to_copy=0, trailing_data, skip=0, tcp_off; 165 166 /* Below, we assume these will be non-NULL. */ 167 KASSERT(th, ("%s: called with th == NULL", __func__)); 168 KASSERT(m, ("%s: called with m == NULL", __func__)); 169 KASSERT(n, ("%s: called with n == NULL", __func__)); 170 171 /* We assume this initialization occurred elsewhere. */ 172 KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)", 173 __func__, n->m_len)); 174 KASSERT(n->m_data == M_START(n), 175 ("%s: called with n->m_data != M_START(n)", __func__)); 176 177 /* 178 * Calculate the size of the TCP header. We use this often 179 * enough that it is worth just calculating at the start. 180 */ 181 tcp_off = th->th_off << 2; 182 183 /* Trim off leading empty mbufs. */ 184 while (m && m->m_len == 0) 185 m = m->m_next; 186 187 if (m) { 188 m_cur = m; 189 } 190 else { 191 /* 192 * No data? Highly unusual. We would expect to at 193 * least see a TCP header in the mbuf. 194 * As we have a pointer to the TCP header, I guess 195 * we should just copy that. (???) 196 */ 197 fallback: 198 bytes_to_copy = tcp_off; 199 if (bytes_to_copy > M_SIZE(n)) 200 bytes_to_copy = M_SIZE(n); 201 bcopy(th, n->m_data, bytes_to_copy); 202 n->m_len = bytes_to_copy; 203 return; 204 } 205 206 /* 207 * Find TCP header. Record the total number of bytes up to, 208 * and including, the TCP header. 
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
		    (caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	}
	else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}

void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we already have a new mbuf (n), then we can
	 * simply free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * free-ing process.
			 */
			if (mhead->m_flags & M_EXTPG) {
				/* Don't mess around with these. */
				tcp_pcap_m_freem(mhead);
				continue;
			} else if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
					    mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
					 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			} else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & (M_EXT|M_EXTPG)) &&
	    tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	}
	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
		    ("%s: Unexpected flags (%#x) for mbuf",
		    __func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		if (m->m_flags & M_EXTPG)
			m_copydata(m, 0, m->m_len, n->m_data);
		else
			bcopy(M_START(m), n->m_dat,
			    m->m_len + M_LEADINGSPACE_NOWRITE(m));
	}
	else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to reach this
		 * code path is that we've already taken references to
		 * the maximum number of mbuf clusters we can, and the
		 * data is too long to fit in an mbuf's internal storage.
		 * Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}

void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *m;
	while ((m = mbufq_dequeue(queue)))
		tcp_pcap_m_freem(m);
}

void
tcp_pcap_tcpcb_init(struct tcpcb *tp)
{
	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
}

void
tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
{
	queue->mq_maxlen = newval;
	while (queue->mq_len > queue->mq_maxlen)
		tcp_pcap_m_freem(mbufq_dequeue(queue));
}

int
tcp_pcap_get_sock_max(struct mbufq *queue)
{
	return queue->mq_maxlen;
}
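
/*
 * Usage sketch: the per-connection queues initialized in
 * tcp_pcap_tcpcb_init() and resized via tcp_pcap_set_sock_max() are
 * normally driven from userland through the TCP_PCAP_OUT and TCP_PCAP_IN
 * socket options, which take the number of packets to retain per
 * direction. This assumes a kernel built with "options TCP_PCAP"; the
 * names below (s, count) are illustrative only.
 *
 *	int count = 32;
 *
 *	setsockopt(s, IPPROTO_TCP, TCP_PCAP_OUT, &count, sizeof(count));
 *	setsockopt(s, IPPROTO_TCP, TCP_PCAP_IN, &count, sizeof(count));
 *
 * The saved mbuf chains stay attached to the tcpcb (t_inpkts and
 * t_outpkts) and are intended for in-kernel inspection (e.g., from a
 * debugger) rather than retrieval through a userland API.
 */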