/*-
 * Copyright (c) 2015
 *	Jonathan Looney. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

#define	M_LEADINGSPACE_NOWRITE(m)	\
	((m)->m_data - M_START(m))

static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
    CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
    "Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
    CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
    "Maximum number of clusters allowed to be referenced on TCP PCAP "
    "queues");

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
    CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
    "Number of mbufs with external storage reused for the TCP PCAP "
    "functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
    CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
    "Number of mbufs with internal storage reused for the TCP PCAP "
    "functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
    CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
    "Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define	V_tcp_pcap_packets	VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
    CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
    "Default number of packets saved per direction per TCPCB");
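
/*
 * Illustrative only: the counters and limits above are ordinary sysctls,
 * so on a kernel built with this code they can be inspected, and the
 * read/write ones tuned, at run time; the values below are arbitrary
 * examples:
 *
 *	sysctl net.inet.tcp.tcp_pcap_packets=64
 *	sysctl net.inet.tcp.tcp_pcap_clusters_referenced_max=32768
 */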

/* Initialize the values. */
static void
tcp_pcap_max_set(void)
{

	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
}

void
tcp_pcap_init(void)
{

	tcp_pcap_max_set();
	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
	    NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
	    tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}

/*
 * For all the external entries in m, apply the given adjustment.
 * This can be used to adjust the counter when an mbuf chain is
 * copied or freed.
 */
static __inline void
tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
{
	while (m) {
		if (m->m_flags & M_EXT)
			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);

		m = m->m_next;
	}
}

/*
 * Free all mbufs in a chain, decrementing the reference count as
 * necessary.
 *
 * Functions in this file should use this instead of m_freem() when
 * they are freeing mbuf chains that may contain clusters that were
 * already included in tcp_pcap_clusters_referenced_cur.
 */
static void
tcp_pcap_m_freem(struct mbuf *mb)
{
	while (mb != NULL) {
		if (mb->m_flags & M_EXT)
			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
			    1);
		mb = m_free(mb);
	}
}

/*
 * Copy data from m to n, where n cannot fit all the data we might
 * want from m.
 *
 * Prioritize data like this:
 * 1. TCP header
 * 2. IP header
 * 3. Data
 */
static void
tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
{
	struct mbuf *m_cur = m;
	int bytes_to_copy=0, trailing_data, skip=0, tcp_off;

	/* Below, we assume these will be non-NULL. */
	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(n, ("%s: called with n == NULL", __func__));

	/* We assume this initialization occurred elsewhere. */
	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
	    __func__, n->m_len));
	KASSERT(n->m_data == M_START(n),
	    ("%s: called with n->m_data != M_START(n)", __func__));

	/*
	 * Calculate the size of the TCP header. We use this often
	 * enough that it is worth just calculating at the start.
	 */
	tcp_off = th->th_off << 2;

	/* Trim off leading empty mbufs. */
	while (m && m->m_len == 0)
		m = m->m_next;

	if (m) {
		m_cur = m;
	}
	else {
		/*
		 * No data? Highly unusual. We would expect to at
		 * least see a TCP header in the mbuf.
		 * As we have a pointer to the TCP header, I guess
		 * we should just copy that. (???)
		 */
fallback:
		bytes_to_copy = tcp_off;
		if (bytes_to_copy > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		bcopy(th, n->m_data, bytes_to_copy);
		n->m_len = bytes_to_copy;
		return;
	}

	/*
	 * Find TCP header. Record the total number of bytes up to,
	 * and including, the TCP header.
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
		    (caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	}
	else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}
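
/*
 * Illustrative example of the best-fit copy above (sizes are arbitrary):
 * for a segment with a 20-byte IP header, a 20-byte TCP header, and 1448
 * bytes of payload, bytes_to_copy is first computed as 40 (everything up
 * to and including the TCP header). That fits in M_SIZE(n), but adding
 * the 1448 bytes of trailing data does not, so the copy is capped at
 * M_SIZE(n): both headers are preserved and only the leading part of the
 * payload is kept. If the headers alone exceeded M_SIZE(n), leading bytes
 * (the front of the IP header) would be skipped instead, preserving the
 * TCP header first.
 */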

void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we already have a new mbuf (n), then we can
	 * simply free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * freeing process.
			 */
			if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
					    mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
322 */ 323 *(mhead->m_ext.ext_cnt) = 1; 324 tcp_pcap_m_freem(mhead); 325 continue; 326 } 327 /* 328 * We were able to cleanly free the 329 * reference. 330 */ 331 atomic_subtract_int( 332 &tcp_pcap_clusters_referenced_cur, 333 1); 334 tcp_pcap_alloc_reuse_ext++; 335 break; 336 } 337 } 338 else { 339 tcp_pcap_alloc_reuse_mbuf++; 340 } 341 342 n = mhead; 343 tcp_pcap_m_freem(n->m_next); 344 m_init(n, NULL, 0, M_NOWAIT, MT_DATA, 0); 345 } 346 } 347 348 /* Check to see if we need to get a new mbuf. */ 349 if (!n) { 350 if (!(n = m_get(M_NOWAIT, MT_DATA))) 351 return; 352 tcp_pcap_alloc_new_mbuf++; 353 } 354 355 /* 356 * What are we dealing with? If a cluster, attach it. Otherwise, 357 * try to copy the data from the beginning of the mbuf to the 358 * end of data. (There may be data between the start of the data 359 * area and the current data pointer. We want to get this, because 360 * it may contain header information that is useful.) 361 * In cases where that isn't possible, settle for what we can 362 * get. 363 */ 364 if ((m->m_flags & M_EXT) && tcp_pcap_take_cluster_reference()) { 365 n->m_data = m->m_data; 366 n->m_len = m->m_len; 367 mb_dupcl(n, m); 368 } 369 else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) { 370 /* 371 * At this point, n is guaranteed to be a normal mbuf 372 * with no cluster and no packet header. Because the 373 * logic in this code block requires this, the assert 374 * is here to catch any instances where someone 375 * changes the logic to invalidate that assumption. 376 */ 377 KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0, 378 ("%s: Unexpected flags (%#x) for mbuf", 379 __func__, n->m_flags)); 380 n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m); 381 n->m_len = m->m_len; 382 bcopy(M_START(m), n->m_dat, 383 m->m_len + M_LEADINGSPACE_NOWRITE(m)); 384 } 385 else { 386 /* 387 * This is the case where we need to "settle for what 388 * we can get". The most probable way to this code 389 * path is that we've already taken references to the 390 * maximum number of mbuf clusters we can, and the data 391 * is too long to fit in an mbuf's internal storage. 392 * Try for a "best fit". 393 */ 394 tcp_pcap_copy_bestfit(th, m, n); 395 396 /* Don't try to get additional data. */ 397 goto add_to_queue; 398 } 399 400 if (m->m_next) { 401 n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT); 402 tcp_pcap_adj_cluster_reference(n->m_next, 1); 403 } 404 405 add_to_queue: 406 /* Add the new mbuf to the list. */ 407 if (mbufq_enqueue(queue, n)) { 408 /* This shouldn't happen. If INVARIANTS is defined, panic. */ 409 KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__)); 410 tcp_pcap_m_freem(n); 411 } 412 } 413 414 void 415 tcp_pcap_drain(struct mbufq *queue) 416 { 417 struct mbuf *m; 418 while ((m = mbufq_dequeue(queue))) 419 tcp_pcap_m_freem(m); 420 } 421 422 void 423 tcp_pcap_tcpcb_init(struct tcpcb *tp) 424 { 425 mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets); 426 mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets); 427 } 428 429 void 430 tcp_pcap_set_sock_max(struct mbufq *queue, int newval) 431 { 432 queue->mq_maxlen = newval; 433 while (queue->mq_len > queue->mq_maxlen) 434 tcp_pcap_m_freem(mbufq_dequeue(queue)); 435 } 436 437 int 438 tcp_pcap_get_sock_max(struct mbufq *queue) 439 { 440 return queue->mq_maxlen; 441 } 442