1
2 /*-
3 * SPDX-License-Identifier: BSD-2-Clause
4 *
5 * Copyright (c) 2016-2018 Netflix, Inc.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 */
29
30 #include "opt_inet.h"
31 #include "opt_ddb.h"
32 #include <sys/param.h>
33 #include <sys/arb.h>
34 #include <sys/hash.h>
35 #include <sys/kernel.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/qmath.h>
40 #include <sys/queue.h>
41 #include <sys/refcount.h>
42 #include <sys/rwlock.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/sysctl.h>
46 #ifdef DDB
47 #include <sys/time.h>
48 #endif
49 #include <sys/tree.h>
50 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
51 #include <sys/counter.h>
52 #include <dev/tcp_log/tcp_log_dev.h>
53
54 #ifdef DDB
55 #include <ddb/ddb.h>
56 #endif
57
58 #include <net/if.h>
59 #include <net/if_var.h>
60 #include <net/vnet.h>
61
62 #include <netinet/in.h>
63 #ifdef DDB
64 #include <netinet/in_kdtrace.h>
65 #endif
66 #include <netinet/in_pcb.h>
67 #include <netinet/in_var.h>
68 #include <netinet/tcp_var.h>
69 #include <netinet/tcp_log_buf.h>
70 #include <netinet/tcp_seq.h>
71 #include <netinet/tcp_hpts.h>
72
73 /* Default expiry time */
74 #define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S)
75
76 /* Max interval at which to run the expiry timer */
77 #define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S)
78
79 bool tcp_log_verbose;
80 static uma_zone_t tcp_log_id_bucket_zone, tcp_log_id_node_zone, tcp_log_zone;
81 static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT;
82 static uint32_t tcp_log_version = TCP_LOG_BUF_VER;
83 RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket);
84 static struct tcp_log_id_tree tcp_log_id_head;
85 static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head =
86 STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head);
87 static struct mtx tcp_log_expireq_mtx;
88 static struct callout tcp_log_expireq_callout;
89 static u_long tcp_log_auto_ratio = 0;
90 static volatile u_long tcp_log_auto_ratio_cur = 0;
91 static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL;
92 static bool tcp_log_auto_all = false;
93 static uint32_t tcp_disable_all_bb_logs = 0;
94
95 RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
96
97 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
98 "TCP Black Box controls");
99
100 SYSCTL_NODE(_net_inet_tcp_bb, OID_AUTO, tp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
101 "TCP Black Box Trace Point controls");
102
103 SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose,
104 0, "Force verbose logging for TCP traces");
105
106 SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit,
107 CTLFLAG_RW, &tcp_log_session_limit, 0,
108 "Maximum number of events maintained for each TCP session");
109
110 uint32_t tcp_trace_point_config = 0;
111 SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, number, CTLFLAG_RW,
112 &tcp_trace_point_config, TCP_LOG_STATE_HEAD_AUTO,
113 "What is the trace point number to activate (0=none, 0xffffffff = all)?");
114
115 uint32_t tcp_trace_point_bb_mode = TCP_LOG_STATE_CONTINUAL;
116 SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, bbmode, CTLFLAG_RW,
117 &tcp_trace_point_bb_mode, TCP_LOG_STATE_HEAD_AUTO,
118 "What is BB logging mode that is activated?");
119
120 int32_t tcp_trace_point_count = 0;
121 SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, count, CTLFLAG_RW,
122 &tcp_trace_point_count, TCP_LOG_STATE_HEAD_AUTO,
123 "How many connections will have BB logging turned on that hit the tracepoint?");
124
125
126
127 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
128 &tcp_log_zone, "Maximum number of events maintained for all TCP sessions");
129
130 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD,
131 &tcp_log_zone, "Current number of events maintained for all TCP sessions");
132
133 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW,
134 &tcp_log_id_bucket_zone, "Maximum number of log IDs");
135
136 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD,
137 &tcp_log_id_bucket_zone, "Current number of log IDs");
138
139 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW,
140 &tcp_log_id_node_zone, "Maximum number of tcpcbs with log IDs");
141
142 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD,
143 &tcp_log_id_node_zone, "Current number of tcpcbs with log IDs");
144
145 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version,
146 0, "Version of log formats exported");
147
148 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, disable_all, CTLFLAG_RW,
149 &tcp_disable_all_bb_logs, 0,
150 "Disable all BB logging for all connections");
151
152 SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW,
153 &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions");
154
155 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW,
156 &tcp_log_auto_mode, 0,
157 "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_TAIL)");
158
159 SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW,
160 &tcp_log_auto_all, 0,
161 "Auto-select from all sessions (rather than just those with IDs)");
162
163 #ifdef TCPLOG_DEBUG_COUNTERS
164 counter_u64_t tcp_log_queued;
165 counter_u64_t tcp_log_que_fail1;
166 counter_u64_t tcp_log_que_fail2;
167 counter_u64_t tcp_log_que_fail3;
168 counter_u64_t tcp_log_que_fail4;
169 counter_u64_t tcp_log_que_fail5;
170 counter_u64_t tcp_log_que_copyout;
171 counter_u64_t tcp_log_que_read;
172 counter_u64_t tcp_log_que_freed;
173
174 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD,
175 &tcp_log_queued, "Number of entries queued");
176 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD,
177 &tcp_log_que_fail1, "Number of entries queued but fail 1");
178 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD,
179 &tcp_log_que_fail2, "Number of entries queued but fail 2");
180 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD,
181 &tcp_log_que_fail3, "Number of entries queued but fail 3");
182 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD,
183 &tcp_log_que_fail4, "Number of entries queued but fail 4");
184 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD,
185 &tcp_log_que_fail5, "Number of entries queued but fail 5");
186 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD,
187 &tcp_log_que_copyout, "Number of entries copied out");
188 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD,
189 &tcp_log_que_read, "Number of entries read from the queue");
190 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD,
191 &tcp_log_que_freed, "Number of entries freed after reading");
192 #endif
193
194 #ifdef INVARIANTS
195 #define TCPLOG_DEBUG_RINGBUF
196 #endif
197 /* Number of requests to consider a PCB ID "active". */
198 #define ACTIVE_REQUEST_COUNT 10
199
200 /* Statistic tracking for "active" PCB IDs. */
201 static counter_u64_t tcp_log_pcb_ids_cur;
202 static counter_u64_t tcp_log_pcb_ids_tot;
203
204 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_cur, CTLFLAG_RD,
205 &tcp_log_pcb_ids_cur, "Number of pcb IDs allocated in the system");
206 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_tot, CTLFLAG_RD,
207 &tcp_log_pcb_ids_tot, "Total number of pcb IDs that have been allocated");
208
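/*
 * One allocated log event: the queue linkage plus the tcp_log_buffer and
 * optional verbose data that are eventually copied out to userland.
 */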
209 struct tcp_log_mem
210 {
211 STAILQ_ENTRY(tcp_log_mem) tlm_queue;
212 struct tcp_log_buffer tlm_buf;
213 struct tcp_log_verbose tlm_v;
214 #ifdef TCPLOG_DEBUG_RINGBUF
215 volatile int tlm_refcnt;
216 #endif
217 };
218
219 /* 60 bytes for the header, + 16 bytes for padding */
220 static uint8_t zerobuf[76];
221
222 /*
223 * Lock order:
224 * 1. TCPID_TREE
225 * 2. TCPID_BUCKET
226 * 3. INP
227 *
228 * Rules:
229 * A. You need a lock on the Tree to add/remove buckets.
230 * B. You need a lock on the bucket to add/remove nodes from the bucket.
231 * C. To change information in a node, you need the INP lock if the tln_closed
232 * field is false. Otherwise, you need the bucket lock. (Note that the
233 * tln_closed field can change at any point, so you need to recheck the
234 * entry after acquiring the INP lock.)
235 * D. To remove a node from the bucket, you must have that entry locked,
236 * according to the criteria of Rule C. Also, the node must not be on
237 * the expiry queue.
238 * E. The exception to C is the expiry queue fields, which are locked by
239 * the TCPLOG_EXPIREQ lock.
240 *
241 * Buckets have a reference count. Each node is a reference. Further,
242 * other callers may add reference counts to keep a bucket from disappearing.
243 * You can add a reference as long as you own a lock sufficient to keep the
244 * bucket from disappearing. For example, a common use is:
245 * a. Have a locked INP, but need to lock the TCPID_BUCKET.
246 * b. Add a refcount on the bucket. (Safe because the INP lock prevents
247 * the TCPID_BUCKET from going away.)
248 * c. Drop the INP lock.
249 * d. Acquire a lock on the TCPID_BUCKET.
250 * e. Acquire a lock on the INP.
251 * f. Drop the refcount on the bucket.
252 * (At this point, the bucket may disappear.)
253 *
254 * Expire queue lock:
255 * You can acquire this with either the bucket or INP lock. Don't reverse it.
256 * When the expire code has committed to freeing a node, it resets the expiry
257 * time to SBT_MAX. That is the signal to everyone else that they should
258 * leave that node alone.
259 */
260 static struct rwlock tcp_id_tree_lock;
261 #define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock)
262 #define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock)
263 #define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock)
264 #define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock)
265 #define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock)
266 #define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED)
267 #define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED)
268 #define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED)
269
270 #define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF)
271 #define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx))
272 #define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx))
273 #define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx))
274 #define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED)
275 #define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED)
276
277 #define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt))
278 #define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt))
279
280 #define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx)
281 #define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx)
282
283 SLIST_HEAD(tcp_log_id_head, tcp_log_id_node);
284
285 struct tcp_log_id_bucket
286 {
287 /*
288 * tlb_id must be first. This lets us use strcmp on
289 * (struct tcp_log_id_bucket *) and (char *) interchangeably.
290 */
291 char tlb_id[TCP_LOG_ID_LEN];
292 char tlb_tag[TCP_LOG_TAG_LEN];
293 RB_ENTRY(tcp_log_id_bucket) tlb_rb;
294 struct tcp_log_id_head tlb_head;
295 struct mtx tlb_mtx;
296 volatile u_int tlb_refcnt;
297 volatile u_int tlb_reqcnt;
298 uint32_t tlb_loglimit;
299 int8_t tlb_logstate;
300 };
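/*
 * Illustrative sketch (not compiled into the kernel) of the bucket refcount
 * pattern described in steps (a)-(f) of the locking comment above.  The
 * function name and its caller are hypothetical; real code drops the
 * reference via tcp_log_unref_bucket() so that a bucket holding the last
 * reference is freed properly.
 */
#if 0
static void
example_bucket_lock_from_inp(struct inpcb *inp, struct tcp_log_id_bucket *tlb)
{

	INP_WLOCK_ASSERT(inp);		/* (a) INP locked; need the bucket lock. */
	TCPID_BUCKET_REF(tlb);		/* (b) Reference keeps the bucket alive. */
	INP_WUNLOCK(inp);		/* (c) Drop the INP to honor lock order. */
	TCPID_BUCKET_LOCK(tlb);		/* (d) Safe: the bucket cannot go away. */
	INP_WLOCK(inp);			/* (e) Relock the INP after the bucket. */
	TCPID_BUCKET_UNREF(tlb);	/* (f) Drop the ref; once the bucket lock
					 *     is released, the bucket may vanish. */
}
#endif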
301
302 struct tcp_log_id_node
303 {
304 SLIST_ENTRY(tcp_log_id_node) tln_list;
305 STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */
306 sbintime_t tln_expiretime; /* Locked by the expireq lock */
307
308 /*
309 * If INP is NULL, that means the connection has closed. We've
310 * saved the connection endpoint information and the log entries
311 * in the tln_ie and tln_entries members. We've also saved a pointer
312 * to the enclosing bucket here. If INP is not NULL, the information is
313 * in the PCB and not here.
314 */
315 struct inpcb *tln_inp;
316 struct tcpcb *tln_tp;
317 struct tcp_log_id_bucket *tln_bucket;
318 struct in_endpoints tln_ie;
319 struct tcp_log_stailq tln_entries;
320 int tln_count;
321 volatile int tln_closed;
322 uint8_t tln_af;
323 };
324
325 enum tree_lock_state {
326 TREE_UNLOCKED = 0,
327 TREE_RLOCKED,
328 TREE_WLOCKED,
329 };
330
331 /* Do we want to select this session for auto-logging? */
332 static __inline bool
333 tcp_log_selectauto(void)
334 {
335
336 /*
337 * If we are doing auto-capturing, figure out whether we will capture
338 * this session.
339 */
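/*
 * With log_auto_ratio set to N, the shared counter advances once per new
 * session and a session is selected whenever the pre-increment counter
 * value is a multiple of N, i.e. roughly one out of every N sessions.
 */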
340 if (tcp_log_auto_ratio &&
341 (tcp_disable_all_bb_logs == 0) &&
342 (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) %
343 tcp_log_auto_ratio) == 0)
344 return (true);
345 return (false);
346 }
347
348 static __inline int
349 tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b)
350 {
351 KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL"));
352 KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL"));
353 return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN);
354 }
355
356 RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
357
358 static __inline void
359 tcp_log_id_validate_tree_lock(int tree_locked)
360 {
361
362 #ifdef INVARIANTS
363 switch (tree_locked) {
364 case TREE_WLOCKED:
365 TCPID_TREE_WLOCK_ASSERT();
366 break;
367 case TREE_RLOCKED:
368 TCPID_TREE_RLOCK_ASSERT();
369 break;
370 case TREE_UNLOCKED:
371 TCPID_TREE_UNLOCK_ASSERT();
372 break;
373 default:
374 kassert_panic("%s:%d: unknown tree lock state", __func__,
375 __LINE__);
376 }
377 #endif
378 }
379
380 static __inline void
381 tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb)
382 {
383
384 TCPID_TREE_WLOCK_ASSERT();
385 KASSERT(SLIST_EMPTY(&tlb->tlb_head),
386 ("%s: Attempt to remove non-empty bucket", __func__));
387 if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) {
388 #ifdef INVARIANTS
389 kassert_panic("%s:%d: error removing element from tree",
390 __func__, __LINE__);
391 #endif
392 }
393 TCPID_BUCKET_LOCK_DESTROY(tlb);
394 counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1);
395 uma_zfree(tcp_log_id_bucket_zone, tlb);
396 }
397
398 /*
399 * Call with a referenced and locked bucket.
400 * Will return true if the bucket was freed; otherwise, false.
401 * tlb: The bucket to unreference.
402 * tree_locked: A pointer to the state of the tree lock. If the tree lock
403 * state changes, the function will update it.
404 * inp: If not NULL and the function needs to drop the inp lock to relock the
405 * tree, it will do so. (The caller must ensure inp will not become invalid,
406 * probably by holding a reference to it.)
407 */
408 static bool
409 tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked,
410 struct inpcb *inp)
411 {
412
413 KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__));
414 KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
415 __func__));
416
417 tcp_log_id_validate_tree_lock(*tree_locked);
418
419 /*
420 * Did we hold the last reference on the tlb? If so, we may need
421 * to free it. (Note that we can realistically only execute the
422 * loop twice: once without a write lock and once with a write
423 * lock.)
424 */
425 while (TCPID_BUCKET_UNREF(tlb)) {
426 /*
427 * We need a write lock on the tree to free this.
428 * If we can upgrade the tree lock, this is "easy". If we
429 * can't upgrade the tree lock, we need to do this the
430 * "hard" way: unwind all our locks and relock everything.
431 * In the meantime, anything could have changed. We even
432 * need to validate that we still need to free the bucket.
433 */
434 if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE())
435 *tree_locked = TREE_WLOCKED;
436 else if (*tree_locked != TREE_WLOCKED) {
437 TCPID_BUCKET_REF(tlb);
438 if (inp != NULL)
439 INP_WUNLOCK(inp);
440 TCPID_BUCKET_UNLOCK(tlb);
441 if (*tree_locked == TREE_RLOCKED)
442 TCPID_TREE_RUNLOCK();
443 TCPID_TREE_WLOCK();
444 *tree_locked = TREE_WLOCKED;
445 TCPID_BUCKET_LOCK(tlb);
446 if (inp != NULL)
447 INP_WLOCK(inp);
448 continue;
449 }
450
451 /*
452 * We have an empty bucket and a write lock on the tree.
453 * Remove the empty bucket.
454 */
455 tcp_log_remove_bucket(tlb);
456 return (true);
457 }
458 return (false);
459 }
460
461 /*
462 * Call with a locked bucket. This function will release the lock on the
463 * bucket before returning.
464 *
465 * The caller is responsible for freeing the tp->t_lin/tln node!
466 *
467 * Note: one of tp or both tlb and tln must be supplied.
468 *
469 * inp: A pointer to the inp. If the function needs to drop the inp lock to
470 * acquire the tree write lock, it will do so. (The caller must ensure inp
471 * will not become invalid, probably by holding a reference to it.)
472 * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored)
473 * tlb: A pointer to the bucket. (optional; ignored if tp is specified)
474 * tln: A pointer to the node. (optional; ignored if tp is specified)
475 * tree_locked: A pointer to the state of the tree lock. If the tree lock
476 * state changes, the function will update it.
477 *
478 * Will return true if the INP lock was reacquired; otherwise, false.
479 */
480 static bool
481 tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp,
482 struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln,
483 int *tree_locked)
484 {
485 int orig_tree_locked;
486
487 KASSERT(tp != NULL || (tlb != NULL && tln != NULL),
488 ("%s: called with tp=%p, tlb=%p, tln=%p", __func__,
489 tp, tlb, tln));
490 KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
491 __func__));
492
493 if (tp != NULL) {
494 tlb = tp->t_lib;
495 tln = tp->t_lin;
496 KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__));
497 KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__));
498 }
499
500 tcp_log_id_validate_tree_lock(*tree_locked);
501 TCPID_BUCKET_LOCK_ASSERT(tlb);
502
503 /*
504 * Remove the node, clear the log bucket and node from the TCPCB, and
505 * decrement the bucket refcount. In the process, if this is the
506 * last reference, the bucket will be freed.
507 */
508 SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list);
509 if (tp != NULL) {
510 tp->t_lib = NULL;
511 tp->t_lin = NULL;
512 }
513 orig_tree_locked = *tree_locked;
514 if (!tcp_log_unref_bucket(tlb, tree_locked, inp))
515 TCPID_BUCKET_UNLOCK(tlb);
516 return (*tree_locked != orig_tree_locked);
517 }
518
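/*
 * Revalidate the connection after a lock has been dropped and reacquired:
 * if the inpcb has been dropped in the meantime, run the caller-supplied
 * cleanup, fail with ECONNRESET, and jump to the caller's "done" label;
 * otherwise refresh the cached tcpcb pointer.
 */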
519 #define RECHECK_INP_CLEAN(cleanup) do { \
520 if (inp->inp_flags & INP_DROPPED) { \
521 rv = ECONNRESET; \
522 cleanup; \
523 goto done; \
524 } \
525 tp = intotcpcb(inp); \
526 } while (0)
527
528 #define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */)
529
530 static void
531 tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
532 {
533
534 INP_WLOCK_ASSERT(tptoinpcb(tp));
535
536 #ifdef STATS
537 if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
538 (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
539 #endif
540 }
541
542 static void
543 tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb)
544 {
545
546 atomic_fetchadd_int(&tlb->tlb_reqcnt, 1);
547 }
548
549 int
550 tcp_log_apply_ratio(struct tcpcb *tp, int ratio)
551 {
552 struct tcp_log_id_bucket *tlb;
553 struct inpcb *inp = tptoinpcb(tp);
554 uint32_t hash, ratio_hash_thresh;
555 int rv, tree_locked;
556
557 rv = 0;
558 tree_locked = TREE_UNLOCKED;
559 tlb = tp->t_lib;
560
561 INP_WLOCK_ASSERT(inp);
562 if (tlb == NULL) {
563 INP_WUNLOCK(inp);
564 return (EOPNOTSUPP);
565 }
566 if (ratio)
567 ratio_hash_thresh = max(1, UINT32_MAX / ratio);
568 else
569 ratio_hash_thresh = 0;
570 TCPID_BUCKET_REF(tlb);
571 INP_WUNLOCK(inp);
572 TCPID_BUCKET_LOCK(tlb);
573
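/*
 * With ratio = N, the threshold is roughly UINT32_MAX / N, so about one in
 * every N log IDs hashes at or below it and keeps logging; the remainder
 * are switched off below, unless logging was already enabled elsewhere.
 */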
574 hash = hash32_buf(tlb->tlb_id, strlen(tlb->tlb_id), 0);
575 if (hash > ratio_hash_thresh && tp->_t_logstate == TCP_LOG_STATE_OFF &&
576 tlb->tlb_logstate == TCP_LOG_STATE_OFF) {
577 /*
578 * Ratio decision not to log this log ID (and this connection by
579 * way of association). We only apply a ratio-based log disable
580 * decision if it would not interfere with a log enable decision
581 * made elsewhere, e.g. tcp_log_selectauto() or setsockopt().
582 */
583 tlb->tlb_logstate = TCP_LOG_STATE_RATIO_OFF;
584 INP_WLOCK(inp);
585 RECHECK_INP();
586 (void)tcp_log_state_change(tp, TCP_LOG_STATE_OFF);
587 done:
588 INP_WUNLOCK(inp);
589 }
590
591 INP_UNLOCK_ASSERT(inp);
592 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
593 TCPID_BUCKET_UNLOCK(tlb);
594
595 if (tree_locked == TREE_WLOCKED) {
596 TCPID_TREE_WLOCK_ASSERT();
597 TCPID_TREE_WUNLOCK();
598 } else if (tree_locked == TREE_RLOCKED) {
599 TCPID_TREE_RLOCK_ASSERT();
600 TCPID_TREE_RUNLOCK();
601 } else
602 TCPID_TREE_UNLOCK_ASSERT();
603
604 return (rv);
605 }
606
607 /*
608 * Associate the specified tag with a particular TCP log ID.
609 * Called with INPCB locked. Returns with it unlocked.
610 * Returns 0 on success or EOPNOTSUPP if the connection has no TCP log ID.
611 */
612 int
613 tcp_log_set_tag(struct tcpcb *tp, char *tag)
614 {
615 struct inpcb *inp = tptoinpcb(tp);
616 struct tcp_log_id_bucket *tlb;
617 int tree_locked;
618
619 INP_WLOCK_ASSERT(inp);
620
621 tree_locked = TREE_UNLOCKED;
622 tlb = tp->t_lib;
623 if (tlb == NULL) {
624 INP_WUNLOCK(inp);
625 return (EOPNOTSUPP);
626 }
627
628 TCPID_BUCKET_REF(tlb);
629 INP_WUNLOCK(inp);
630 TCPID_BUCKET_LOCK(tlb);
631 strlcpy(tlb->tlb_tag, tag, TCP_LOG_TAG_LEN);
632 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
633 TCPID_BUCKET_UNLOCK(tlb);
634
635 if (tree_locked == TREE_WLOCKED) {
636 TCPID_TREE_WLOCK_ASSERT();
637 TCPID_TREE_WUNLOCK();
638 } else if (tree_locked == TREE_RLOCKED) {
639 TCPID_TREE_RLOCK_ASSERT();
640 TCPID_TREE_RUNLOCK();
641 } else
642 TCPID_TREE_UNLOCK_ASSERT();
643
644 return (0);
645 }
646
647 /*
648 * Set the TCP log ID for a TCPCB.
649 * Called with INPCB locked. Returns with it unlocked.
650 */
651 int
652 tcp_log_set_id(struct tcpcb *tp, char *id)
653 {
654 struct tcp_log_id_bucket *tlb, *tmp_tlb;
655 struct tcp_log_id_node *tln;
656 struct inpcb *inp = tptoinpcb(tp);
657 int tree_locked, rv;
658 bool bucket_locked, same;
659
660 tlb = NULL;
661 tln = NULL;
662 tree_locked = TREE_UNLOCKED;
663 bucket_locked = false;
664
665 restart:
666 INP_WLOCK_ASSERT(inp);
667 /* See if the ID is unchanged. */
668 same = ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
669 (tp->t_lib == NULL && *id == 0));
670 if (tp->_t_logstate && STAILQ_FIRST(&tp->t_logs) && !same) {
671 /*
672 * There are residual logs left; we may be
673 * changing IDs, so dump what we can.
674 */
675 switch(tp->_t_logstate) {
676 case TCP_LOG_STATE_HEAD_AUTO:
677 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at id switch",
678 M_NOWAIT, false);
679 break;
680 case TCP_LOG_STATE_TAIL_AUTO:
681 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at id switch",
682 M_NOWAIT, false);
683 break;
684 case TCP_LOG_STATE_CONTINUAL:
685 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual at id switch",
686 M_NOWAIT, false);
687 break;
688 case TCP_LOG_VIA_BBPOINTS:
689 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints at id switch",
690 M_NOWAIT, false);
691 break;
692 }
693 }
694 if (same) {
695 if (tp->t_lib != NULL) {
696 tcp_log_increment_reqcnt(tp->t_lib);
697 if ((tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) &&
698 (tp->t_log_state_set == 0)) {
699 /* Clone in any logging */
700
701 tp->_t_logstate = tp->t_lib->tlb_logstate;
702 }
703 if ((tp->t_lib->tlb_loglimit) &&
704 (tp->t_log_state_set == 0)) {
705 /* We also have a limit set */
706
707 tp->t_loglimit = tp->t_lib->tlb_loglimit;
708 }
709 }
710 rv = 0;
711 goto done;
712 }
713
714 /*
715 * If the TCPCB had a previous ID, we need to extricate it from
716 * the previous list.
717 *
718 * Drop the TCPCB lock and lock the tree and the bucket.
719 * Because this is called in the socket context, we (theoretically)
720 * don't need to worry about the INPCB completely going away
721 * while we are gone.
722 */
723 if (tp->t_lib != NULL) {
724 tlb = tp->t_lib;
725 TCPID_BUCKET_REF(tlb);
726 INP_WUNLOCK(inp);
727
728 if (tree_locked == TREE_UNLOCKED) {
729 TCPID_TREE_RLOCK();
730 tree_locked = TREE_RLOCKED;
731 }
732 TCPID_BUCKET_LOCK(tlb);
733 bucket_locked = true;
734 INP_WLOCK(inp);
735
736 /*
737 * Unreference the bucket. If our bucket went away, it is no
738 * longer locked or valid.
739 */
740 if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) {
741 bucket_locked = false;
742 tlb = NULL;
743 }
744
745 /* Validate the INP. */
746 RECHECK_INP();
747
748 /*
749 * Evaluate whether the bucket changed while we were unlocked.
750 *
751 * Possible scenarios here:
752 * 1. Bucket is unchanged and the same one we started with.
753 * 2. The TCPCB no longer has a bucket and our bucket was
754 * freed.
755 * 3. The TCPCB has a new bucket, whether or not ours was freed.
756 * 4. The TCPCB no longer has a bucket and our bucket was
757 * not freed.
758 *
759 * In cases 2-4, we will start over. In case 1, we will
760 * proceed here to remove the bucket.
761 */
762 if (tlb == NULL || tp->t_lib != tlb) {
763 KASSERT(bucket_locked || tlb == NULL,
764 ("%s: bucket_locked (%d) and tlb (%p) are "
765 "inconsistent", __func__, bucket_locked, tlb));
766
767 if (bucket_locked) {
768 TCPID_BUCKET_UNLOCK(tlb);
769 bucket_locked = false;
770 tlb = NULL;
771 }
772 goto restart;
773 }
774
775 /*
776 * Store the (struct tcp_log_id_node) for reuse. Then, remove
777 * it from the bucket. In the process, we may end up relocking.
778 * If so, we need to validate that the INP is still valid, and
779 * the TCPCB entries match what we expect.
780 *
781 * We will clear tlb and change the bucket_locked state just
782 * before calling tcp_log_remove_id_node(), since that function
783 * will unlock the bucket.
784 */
785 if (tln != NULL)
786 uma_zfree(tcp_log_id_node_zone, tln);
787 tln = tp->t_lin;
788 tlb = NULL;
789 bucket_locked = false;
790 if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) {
791 RECHECK_INP();
792
793 /*
794 * If the TCPCB moved to a new bucket while we had
795 * dropped the lock, restart.
796 */
797 if (tp->t_lib != NULL || tp->t_lin != NULL)
798 goto restart;
799 }
800
801 /*
802 * Yay! We successfully removed the TCPCB from its old
803 * bucket. Phew!
804 *
805 * On to bigger and better things...
806 */
807 }
808
809 /* At this point, the TCPCB should not be in any bucket. */
810 KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__));
811
812 /*
813 * If the new ID is not empty, we need to now assign this TCPCB to a
814 * new bucket.
815 */
816 if (*id) {
817 /* Get a new tln, if we don't already have one to reuse. */
818 if (tln == NULL) {
819 tln = uma_zalloc(tcp_log_id_node_zone,
820 M_NOWAIT | M_ZERO);
821 if (tln == NULL) {
822 rv = ENOBUFS;
823 goto done;
824 }
825 tln->tln_inp = inp;
826 tln->tln_tp = tp;
827 }
828
829 /*
830 * Drop the INP lock for a bit. We don't need it, and dropping
831 * it prevents lock order reversals.
832 */
833 INP_WUNLOCK(inp);
834
835 /* Make sure we have at least a read lock on the tree. */
836 tcp_log_id_validate_tree_lock(tree_locked);
837 if (tree_locked == TREE_UNLOCKED) {
838 TCPID_TREE_RLOCK();
839 tree_locked = TREE_RLOCKED;
840 }
841
842 refind:
843 /*
844 * Remember that (struct tcp_log_id_bucket) stores its ID as the first
845 * member, so we can safely cast the id to it for the lookup below.
846 */
847 KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL",
848 __func__, __LINE__));
849 tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head,
850 (struct tcp_log_id_bucket *) id);
851
852 /*
853 * If we didn't find a matching bucket, we need to add a new
854 * one. This requires a write lock. But, of course, we will
855 * need to recheck some things when we re-acquire the lock.
856 */
857 if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) {
858 tree_locked = TREE_WLOCKED;
859 if (!TCPID_TREE_UPGRADE()) {
860 TCPID_TREE_RUNLOCK();
861 TCPID_TREE_WLOCK();
862
863 /*
864 * The tree may have changed while we were
865 * unlocked.
866 */
867 goto refind;
868 }
869 }
870
871 /* If we need to add a new bucket, do it now. */
872 if (tmp_tlb == NULL) {
873 /* Allocate new bucket. */
874 tlb = uma_zalloc(tcp_log_id_bucket_zone, M_NOWAIT);
875 if (tlb == NULL) {
876 rv = ENOBUFS;
877 goto done_noinp;
878 }
879 counter_u64_add(tcp_log_pcb_ids_cur, 1);
880 counter_u64_add(tcp_log_pcb_ids_tot, 1);
881
882 if ((tcp_log_auto_all == false) &&
883 tcp_log_auto_mode &&
884 tcp_log_selectauto()) {
885 /* Save off the log state */
886 tlb->tlb_logstate = tcp_log_auto_mode;
887 } else
888 tlb->tlb_logstate = TCP_LOG_STATE_OFF;
889 tlb->tlb_loglimit = 0;
890 tlb->tlb_tag[0] = '\0'; /* Default to an empty tag. */
891
892 /*
893 * Copy the ID to the bucket.
894 * NB: Don't use strlcpy() unless you are sure
895 * we've always validated NULL termination.
896 *
897 * TODO: When I'm done writing this, see if we have
898 * correctly validated NULL termination and
899 * can use strlcpy(). :-)
900 */
901 strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1);
902 tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0';
903
904 /*
905 * Take the refcount for the first node and go ahead
906 * and lock this. Note that we zero the tlb_mtx
907 * structure, since 0xdeadc0de flips the right bits
908 * for the code to think that this mutex has already
909 * been initialized. :-(
910 */
911 SLIST_INIT(&tlb->tlb_head);
912 refcount_init(&tlb->tlb_refcnt, 1);
913 tlb->tlb_reqcnt = 1;
914 memset(&tlb->tlb_mtx, 0, sizeof(struct mtx));
915 TCPID_BUCKET_LOCK_INIT(tlb);
916 TCPID_BUCKET_LOCK(tlb);
917 bucket_locked = true;
918
919 #define FREE_NEW_TLB() do { \
920 TCPID_BUCKET_LOCK_DESTROY(tlb); \
921 uma_zfree(tcp_log_id_bucket_zone, tlb); \
922 counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); \
923 counter_u64_add(tcp_log_pcb_ids_tot, (int64_t)-1); \
924 bucket_locked = false; \
925 tlb = NULL; \
926 } while (0)
927 /*
928 * Relock the INP and make sure we are still
929 * unassigned.
930 */
931 INP_WLOCK(inp);
932 RECHECK_INP_CLEAN(FREE_NEW_TLB());
933 if (tp->t_lib != NULL) {
934 FREE_NEW_TLB();
935 goto restart;
936 }
937
938 /* Add the new bucket to the tree. */
939 tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head,
940 tlb);
941 KASSERT(tmp_tlb == NULL,
942 ("%s: Unexpected conflicting bucket (%p) while "
943 "adding new bucket (%p)", __func__, tmp_tlb, tlb));
944
945 /*
946 * If we found a conflicting bucket, free the new
947 * one we made and fall through to use the existing
948 * bucket.
949 */
950 if (tmp_tlb != NULL) {
951 FREE_NEW_TLB();
952 INP_WUNLOCK(inp);
953 }
954 #undef FREE_NEW_TLB
955 }
956
957 /* If we found an existing bucket, use it. */
958 if (tmp_tlb != NULL) {
959 tlb = tmp_tlb;
960 TCPID_BUCKET_LOCK(tlb);
961 bucket_locked = true;
962
963 /*
964 * Relock the INP and make sure we are still
965 * unassigned.
966 */
967 INP_UNLOCK_ASSERT(inp);
968 INP_WLOCK(inp);
969 RECHECK_INP();
970 if (tp->t_lib != NULL) {
971 TCPID_BUCKET_UNLOCK(tlb);
972 bucket_locked = false;
973 tlb = NULL;
974 goto restart;
975 }
976
977 /* Take a reference on the bucket. */
978 TCPID_BUCKET_REF(tlb);
979
980 /* Record the request. */
981 tcp_log_increment_reqcnt(tlb);
982 }
983
984 tcp_log_grow_tlb(tlb->tlb_id, tp);
985
986 /* Add the new node to the list. */
987 SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
988 tp->t_lib = tlb;
989 tp->t_lin = tln;
990 if (tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) {
991 /* Clone in any logging */
992
993 tp->_t_logstate = tp->t_lib->tlb_logstate;
994 }
995 if (tp->t_lib->tlb_loglimit) {
996 /* The loglimit too */
997
998 tp->t_loglimit = tp->t_lib->tlb_loglimit;
999 }
1000 tln = NULL;
1001 }
1002
1003 rv = 0;
1004
1005 done:
1006 /* Unlock things, as needed, and return. */
1007 INP_WUNLOCK(inp);
1008 done_noinp:
1009 INP_UNLOCK_ASSERT(inp);
1010 if (bucket_locked) {
1011 TCPID_BUCKET_LOCK_ASSERT(tlb);
1012 TCPID_BUCKET_UNLOCK(tlb);
1013 } else if (tlb != NULL)
1014 TCPID_BUCKET_UNLOCK_ASSERT(tlb);
1015 if (tree_locked == TREE_WLOCKED) {
1016 TCPID_TREE_WLOCK_ASSERT();
1017 TCPID_TREE_WUNLOCK();
1018 } else if (tree_locked == TREE_RLOCKED) {
1019 TCPID_TREE_RLOCK_ASSERT();
1020 TCPID_TREE_RUNLOCK();
1021 } else
1022 TCPID_TREE_UNLOCK_ASSERT();
1023 if (tln != NULL)
1024 uma_zfree(tcp_log_id_node_zone, tln);
1025 return (rv);
1026 }
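/*
 * Hypothetical userspace sketch (not part of the kernel build) of how a log
 * ID typically reaches tcp_log_set_id() above: the application labels the
 * connection with the TCP_LOGID socket option and then enables logging with
 * TCP_LOG.  The option names and state constant are assumed to be the ones
 * exported by <netinet/tcp.h> and <netinet/tcp_log_buf.h>; consult those
 * headers for the authoritative interface.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/tcp_log_buf.h>
#include <string.h>

static int
example_enable_bb_logging(int s)
{
	const char *id = "my-service";	/* Hypothetical log ID (bucket name). */
	int state = TCP_LOG_STATE_CONTINUAL;

	/* Group this connection under a shared log ID. */
	if (setsockopt(s, IPPROTO_TCP, TCP_LOGID, id, strlen(id) + 1) == -1)
		return (-1);
	/* Turn on continual black box logging for this connection. */
	if (setsockopt(s, IPPROTO_TCP, TCP_LOG, &state, sizeof(state)) == -1)
		return (-1);
	return (0);
}
#endif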
1027
1028 /*
1029 * Get the TCP log ID for a TCPCB.
1030 * Called with INPCB locked.
1031 * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long.
1032 * Returns number of bytes copied.
1033 */
1034 size_t
1035 tcp_log_get_id(struct tcpcb *tp, char *buf)
1036 {
1037 size_t len;
1038
1039 INP_LOCK_ASSERT(tptoinpcb(tp));
1040 if (tp->t_lib != NULL) {
1041 len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
1042 KASSERT(len < TCP_LOG_ID_LEN,
1043 ("%s:%d: tp->t_lib->tlb_id too long (%zu)",
1044 __func__, __LINE__, len));
1045 } else {
1046 *buf = '\0';
1047 len = 0;
1048 }
1049 return (len);
1050 }
1051
1052 /*
1053 * Get the tag associated with the TCPCB's log ID.
1054 * Called with INPCB locked. Returns with it unlocked.
1055 * 'buf' must point to a buffer that is at least TCP_LOG_TAG_LEN bytes long.
1056 * Returns number of bytes copied.
1057 */
1058 size_t
1059 tcp_log_get_tag(struct tcpcb *tp, char *buf)
1060 {
1061 struct inpcb *inp = tptoinpcb(tp);
1062 struct tcp_log_id_bucket *tlb;
1063 size_t len;
1064 int tree_locked;
1065
1066 INP_WLOCK_ASSERT(inp);
1067
1068 tree_locked = TREE_UNLOCKED;
1069 tlb = tp->t_lib;
1070
1071 if (tlb != NULL) {
1072 TCPID_BUCKET_REF(tlb);
1073 INP_WUNLOCK(inp);
1074 TCPID_BUCKET_LOCK(tlb);
1075 len = strlcpy(buf, tlb->tlb_tag, TCP_LOG_TAG_LEN);
1076 KASSERT(len < TCP_LOG_TAG_LEN,
1077 ("%s:%d: tp->t_lib->tlb_tag too long (%zu)",
1078 __func__, __LINE__, len));
1079 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
1080 TCPID_BUCKET_UNLOCK(tlb);
1081
1082 if (tree_locked == TREE_WLOCKED) {
1083 TCPID_TREE_WLOCK_ASSERT();
1084 TCPID_TREE_WUNLOCK();
1085 } else if (tree_locked == TREE_RLOCKED) {
1086 TCPID_TREE_RLOCK_ASSERT();
1087 TCPID_TREE_RUNLOCK();
1088 } else
1089 TCPID_TREE_UNLOCK_ASSERT();
1090 } else {
1091 INP_WUNLOCK(inp);
1092 *buf = '\0';
1093 len = 0;
1094 }
1095
1096 return (len);
1097 }
1098
1099 /*
1100 * Get number of connections with the same log ID.
1101 * Log ID is taken from given TCPCB.
1102 * Called with INPCB locked.
1103 */
1104 u_int
1105 tcp_log_get_id_cnt(struct tcpcb *tp)
1106 {
1107
1108 INP_WLOCK_ASSERT(tptoinpcb(tp));
1109 return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt);
1110 }
1111
1112 #ifdef TCPLOG_DEBUG_RINGBUF
1113 /*
1114 * Functions/macros to increment/decrement reference count for a log
1115 * entry. This should catch when we do a double-free/double-remove or
1116 * a double-add.
1117 */
1118 static inline void
1119 _tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func,
1120 int line)
1121 {
1122 int refcnt;
1123
1124 refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1);
1125 if (refcnt != 0)
1126 panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)",
1127 func, line, log_entry, refcnt);
1128 }
1129 #define tcp_log_entry_refcnt_add(l) \
1130 _tcp_log_entry_refcnt_add((l), __func__, __LINE__)
1131
1132 static inline void
1133 _tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func,
1134 int line)
1135 {
1136 int refcnt;
1137
1138 refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1);
1139 if (refcnt != 1)
1140 panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)",
1141 func, line, log_entry, refcnt);
1142 }
1143 #define tcp_log_entry_refcnt_rem(l) \
1144 _tcp_log_entry_refcnt_rem((l), __func__, __LINE__)
1145
1146 #else /* !TCPLOG_DEBUG_RINGBUF */
1147
1148 #define tcp_log_entry_refcnt_add(l)
1149 #define tcp_log_entry_refcnt_rem(l)
1150
1151 #endif
1152
1153 /*
1154 * Cleanup after removing a log entry, but only decrement the count if we
1155 * are running INVARIANTS.
1156 */
1157 static inline void
1158 tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused)
1159 {
1160
1161 uma_zfree(tcp_log_zone, log_entry);
1162 #ifdef INVARIANTS
1163 (*count)--;
1164 KASSERT(*count >= 0,
1165 ("%s: count unexpectedly negative", __func__));
1166 #endif
1167 }
1168
1169 static void
1170 tcp_log_free_entries(struct tcp_log_stailq *head, int *count)
1171 {
1172 struct tcp_log_mem *log_entry;
1173
1174 /* Free the entries. */
1175 while ((log_entry = STAILQ_FIRST(head)) != NULL) {
1176 STAILQ_REMOVE_HEAD(head, tlm_queue);
1177 tcp_log_entry_refcnt_rem(log_entry);
1178 tcp_log_free_log_common(log_entry, count);
1179 }
1180 }
1181
1182 /* Cleanup after removing a log entry. */
1183 static inline void
1184 tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry)
1185 {
1186 uma_zfree(tcp_log_zone, log_entry);
1187 tp->t_lognum--;
1188 KASSERT(tp->t_lognum >= 0,
1189 ("%s: tp->t_lognum unexpectedly negative", __func__));
1190 }
1191
1192 /* Remove a log entry from the head of a list. */
1193 static inline void
1194 tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry)
1195 {
1196
1197 KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs),
1198 ("%s: attempt to remove non-HEAD log entry", __func__));
1199 STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
1200 tcp_log_entry_refcnt_rem(log_entry);
1201 tcp_log_remove_log_cleanup(tp, log_entry);
1202 }
1203
1204 #ifdef TCPLOG_DEBUG_RINGBUF
1205 /*
1206 * Initialize the log entry's reference count, which we want to
1207 * survive allocations.
1208 */
1209 static int
1210 tcp_log_zone_init(void *mem, int size, int flags __unused)
1211 {
1212 struct tcp_log_mem *tlm;
1213
1214 KASSERT(size >= sizeof(struct tcp_log_mem),
1215 ("%s: unexpectedly short (%d) allocation", __func__, size));
1216 tlm = (struct tcp_log_mem *)mem;
1217 tlm->tlm_refcnt = 0;
1218 return (0);
1219 }
1220
1221 /*
1222 * Double check that the refcnt is zero on allocation and return.
1223 */
1224 static int
1225 tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused)
1226 {
1227 struct tcp_log_mem *tlm;
1228
1229 KASSERT(size >= sizeof(struct tcp_log_mem),
1230 ("%s: unexpectedly short (%d) allocation", __func__, size));
1231 tlm = (struct tcp_log_mem *)mem;
1232 if (tlm->tlm_refcnt != 0)
1233 panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
1234 __func__, __LINE__, tlm, tlm->tlm_refcnt);
1235 return (0);
1236 }
1237
1238 static void
1239 tcp_log_zone_dtor(void *mem, int size, void *args __unused)
1240 {
1241 struct tcp_log_mem *tlm;
1242
1243 KASSERT(size >= sizeof(struct tcp_log_mem),
1244 ("%s: unexpectedly short (%d) allocation", __func__, size));
1245 tlm = (struct tcp_log_mem *)mem;
1246 if (tlm->tlm_refcnt != 0)
1247 panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
1248 __func__, __LINE__, tlm, tlm->tlm_refcnt);
1249 }
1250 #endif /* TCPLOG_DEBUG_RINGBUF */
1251
1252 /* Do global initialization. */
1253 void
1254 tcp_log_init(void)
1255 {
1256
1257 tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
1258 #ifdef TCPLOG_DEBUG_RINGBUF
1259 tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init,
1260 #else
1261 NULL, NULL, NULL,
1262 #endif
1263 NULL, UMA_ALIGN_PTR, 0);
1264 (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT);
1265 tcp_log_id_bucket_zone = uma_zcreate("tcp_log_id_bucket",
1266 sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL,
1267 UMA_ALIGN_PTR, 0);
1268 tcp_log_id_node_zone = uma_zcreate("tcp_log_id_node",
1269 sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL,
1270 UMA_ALIGN_PTR, 0);
1271 #ifdef TCPLOG_DEBUG_COUNTERS
1272 tcp_log_queued = counter_u64_alloc(M_WAITOK);
1273 tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK);
1274 tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK);
1275 tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK);
1276 tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK);
1277 tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK);
1278 tcp_log_que_copyout = counter_u64_alloc(M_WAITOK);
1279 tcp_log_que_read = counter_u64_alloc(M_WAITOK);
1280 tcp_log_que_freed = counter_u64_alloc(M_WAITOK);
1281 #endif
1282 tcp_log_pcb_ids_cur = counter_u64_alloc(M_WAITOK);
1283 tcp_log_pcb_ids_tot = counter_u64_alloc(M_WAITOK);
1284
1285 rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW);
1286 mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF);
1287 callout_init(&tcp_log_expireq_callout, 1);
1288 }
1289
1290 /* Do per-TCPCB initialization. */
1291 void
1292 tcp_log_tcpcbinit(struct tcpcb *tp)
1293 {
1294
1295 /* A new TCPCB should start out zero-initialized. */
1296 STAILQ_INIT(&tp->t_logs);
1297
1298 /*
1299 * If we are doing auto-capturing, figure out whether we will capture
1300 * this session.
1301 */
1302 tp->t_loglimit = tcp_log_session_limit;
1303 if ((tcp_log_auto_all == true) &&
1304 tcp_log_auto_mode &&
1305 tcp_log_selectauto()) {
1306 tp->_t_logstate = tcp_log_auto_mode;
1307 tp->t_flags2 |= TF2_LOG_AUTO;
1308 }
1309 }
1310
1311 /* Remove entries */
1312 static void
1313 tcp_log_expire(void *unused __unused)
1314 {
1315 struct tcp_log_id_bucket *tlb;
1316 struct tcp_log_id_node *tln;
1317 sbintime_t expiry_limit;
1318 int tree_locked;
1319
1320 TCPLOG_EXPIREQ_LOCK();
1321 if (callout_pending(&tcp_log_expireq_callout)) {
1322 /* Callout was reset. */
1323 TCPLOG_EXPIREQ_UNLOCK();
1324 return;
1325 }
1326
1327 /*
1328 * Process entries until we reach one that expires too far in the
1329 * future. Look one second in the future.
1330 */
1331 expiry_limit = getsbinuptime() + SBT_1S;
1332 tree_locked = TREE_UNLOCKED;
1333
1334 while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL &&
1335 tln->tln_expiretime <= expiry_limit) {
1336 if (!callout_active(&tcp_log_expireq_callout)) {
1337 /*
1338 * Callout was stopped. I guess we should
1339 * just quit at this point.
1340 */
1341 TCPLOG_EXPIREQ_UNLOCK();
1342 return;
1343 }
1344
1345 /*
1346 * Remove the node from the head of the list and unlock
1347 * the list. Change the expiry time to SBT_MAX as a signal
1348 * to other threads that we now own this.
1349 */
1350 STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq);
1351 tln->tln_expiretime = SBT_MAX;
1352 TCPLOG_EXPIREQ_UNLOCK();
1353
1354 /*
1355 * Remove the node from the bucket.
1356 */
1357 tlb = tln->tln_bucket;
1358 TCPID_BUCKET_LOCK(tlb);
1359 if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) {
1360 tcp_log_id_validate_tree_lock(tree_locked);
1361 if (tree_locked == TREE_WLOCKED)
1362 TCPID_TREE_WUNLOCK();
1363 else
1364 TCPID_TREE_RUNLOCK();
1365 tree_locked = TREE_UNLOCKED;
1366 }
1367
1368 /* Drop the INP reference. */
1369 INP_WLOCK(tln->tln_inp);
1370 if (!in_pcbrele_wlocked(tln->tln_inp))
1371 INP_WUNLOCK(tln->tln_inp);
1372
1373 /* Free the log records. */
1374 tcp_log_free_entries(&tln->tln_entries, &tln->tln_count);
1375
1376 /* Free the node. */
1377 uma_zfree(tcp_log_id_node_zone, tln);
1378
1379 /* Relock the expiry queue. */
1380 TCPLOG_EXPIREQ_LOCK();
1381 }
1382
1383 /*
1384 * We've expired all the entries we can. Do we need to reschedule
1385 * ourselves?
1386 */
1387 callout_deactivate(&tcp_log_expireq_callout);
1388 if (tln != NULL) {
1389 /*
1390 * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and
1391 * set the next callout to that. (This helps ensure we generally
1392 * run the callout no more often than desired.)
1393 */
1394 expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL;
1395 if (expiry_limit < tln->tln_expiretime)
1396 expiry_limit = tln->tln_expiretime;
1397 callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit,
1398 SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE);
1399 }
1400
1401 /* We're done. */
1402 TCPLOG_EXPIREQ_UNLOCK();
1403 return;
1404 }
1405
1406 /*
1407 * Move log data from the TCPCB to a new node. This will reset the TCPCB log
1408 * entries and log count; however, it will not touch other things from the
1409 * TCPCB (e.g. t_lin, t_lib).
1410 *
1411 * NOTE: Must hold a lock on the INP.
1412 */
1413 static void
1414 tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln)
1415 {
1416 struct inpcb *inp = tptoinpcb(tp);
1417
1418 INP_WLOCK_ASSERT(inp);
1419
1420 tln->tln_ie = inp->inp_inc.inc_ie;
1421 if (inp->inp_inc.inc_flags & INC_ISIPV6)
1422 tln->tln_af = AF_INET6;
1423 else
1424 tln->tln_af = AF_INET;
1425 tln->tln_entries = tp->t_logs;
1426 tln->tln_count = tp->t_lognum;
1427 tln->tln_bucket = tp->t_lib;
1428
1429 /* Clear information from the PCB. */
1430 STAILQ_INIT(&tp->t_logs);
1431 tp->t_lognum = 0;
1432 }
1433
1434 /* Do per-TCPCB cleanup */
1435 void
1436 tcp_log_tcpcbfini(struct tcpcb *tp)
1437 {
1438 struct tcp_log_id_node *tln, *tln_first;
1439 struct tcp_log_mem *log_entry;
1440 sbintime_t callouttime;
1441
1442
1443 INP_WLOCK_ASSERT(tptoinpcb(tp));
1444 if (tp->_t_logstate) {
1445 union tcp_log_stackspecific log;
1446 struct timeval tv;
1447 #ifdef TCP_ACCOUNTING
1448 struct tcp_log_buffer *lgb;
1449 int i;
1450
1451 memset(&log, 0, sizeof(log));
1452 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
1453 for (i = 0; i < TCP_NUM_CNT_COUNTERS; i++) {
1454 log.u_raw.u64_flex[i] = tp->tcp_cnt_counters[i];
1455 }
1456 lgb = tcp_log_event(tp, NULL,
1457 NULL,
1458 NULL,
1459 TCP_LOG_ACCOUNTING, 0,
1460 0, &log, false, NULL, NULL, 0, &tv);
1461 if (lgb != NULL) {
1462 lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
1463 lgb->tlb_flex2 = 1;
1464 } else
1465 goto skip_out;
1466 for (i = 0; i < TCP_NUM_CNT_COUNTERS; i++) {
1467 log.u_raw.u64_flex[i] = tp->tcp_proc_time[i];
1468 }
1469 lgb = tcp_log_event(tp, NULL,
1470 NULL,
1471 NULL,
1472 TCP_LOG_ACCOUNTING, 0,
1473 0, &log, false, NULL, NULL, 0, &tv);
1474 if (lgb != NULL) {
1475 lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
1476 lgb->tlb_flex2 = 2;
1477 }
1478 }
1479 skip_out:
1480 #endif
1481 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1482 log.u_bbr.cur_del_rate = tp->t_end_info;
1483 (void)tcp_log_event(tp, NULL,
1484 NULL,
1485 NULL,
1486 TCP_LOG_CONNEND, 0,
1487 0, &log, false, NULL, NULL, 0, &tv);
1488 }
1489 /*
1490 * If we were gathering packets to be automatically dumped, try to do
1491 * it now. If this succeeds, the log information in the TCPCB will be
1492 * cleared. Otherwise, we'll handle the log information as we do
1493 * for other states.
1494 */
1495 switch(tp->_t_logstate) {
1496 case TCP_LOG_STATE_HEAD_AUTO:
1497 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
1498 M_NOWAIT, false);
1499 break;
1500 case TCP_LOG_STATE_TAIL_AUTO:
1501 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
1502 M_NOWAIT, false);
1503 break;
1504 case TCP_LOG_VIA_BBPOINTS:
1505 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
1506 M_NOWAIT, false);
1507 break;
1508 case TCP_LOG_STATE_CONTINUAL:
1509 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
1510 M_NOWAIT, false);
1511 break;
1512 }
1513
1514 /*
1515 * There are two ways we could keep logs: per-socket or per-ID. If
1516 * we are tracking logs with an ID, then the logs survive the
1517 * destruction of the TCPCB.
1518 *
1519 * If the TCPCB is associated with an ID node, move the logs from the
1520 * TCPCB to the ID node. In theory, this is safe, for reasons which I
1521 * will now explain for my own benefit when I next need to figure out
1522 * this code. :-)
1523 *
1524 * We own the INP lock. Therefore, no one else can change the contents
1525 * of this node (Rule C). Further, no one can remove this node from
1526 * the bucket while we hold the lock (Rule D). Basically, no one can
1527 * mess with this node. That leaves two states in which we could be:
1528 *
1529 * 1. Another thread is currently waiting to acquire the INP lock, with
1530 * plans to do something with this node. When we drop the INP lock,
1531 * they will have a chance to do that. They will recheck the
1532 * tln_closed field (see note to Rule C) and then acquire the
1533 * bucket lock before proceeding further.
1534 *
1535 * 2. Another thread will try to acquire a lock at some point in the
1536 * future. If they try to acquire a lock before we set the
1537 * tln_closed field, they will follow state #1. If they try to
1538 * acquire a lock after we set the tln_closed field, they will be
1539 * able to make changes to the node, at will, following Rule C.
1540 *
1541 * Therefore, we currently own this node and can make any changes
1542 * we want. But, as soon as we set the tln_closed field to true, we
1543 * have effectively dropped our lock on the node. (For this reason, we
1544 * also need to make sure our writes are ordered correctly. An atomic
1545 * operation with "release" semantics should be sufficient.)
1546 */
1547
1548 if (tp->t_lin != NULL) {
1549 struct inpcb *inp = tptoinpcb(tp);
1550
1551 /* Copy the relevant information to the log entry. */
1552 tln = tp->t_lin;
1553 KASSERT(tln->tln_inp == inp,
1554 ("%s: Mismatched inp (tln->tln_inp=%p, tp inpcb=%p)",
1555 __func__, tln->tln_inp, inp));
1556 tcp_log_move_tp_to_node(tp, tln);
1557
1558 /* Clear information from the PCB. */
1559 tp->t_lin = NULL;
1560 tp->t_lib = NULL;
1561
1562 /*
1563 * Take a reference on the INP. This ensures that the INP
1564 * remains valid while the node is on the expiry queue. This
1565 * ensures the INP is valid for other threads that may be
1566 * racing to lock this node when we move it to the expire
1567 * queue.
1568 */
1569 in_pcbref(inp);
1570
1571 /*
1572 * Store the entry on the expiry list. The exact behavior
1573 * depends on whether we have entries to keep. If so, we
1574 * put the entry at the tail of the list and expire in
1575 * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put
1576 * the entry at the head of the list. (Handling the cleanup
1577 * via the expiry timer lets us avoid locking messy-ness here.)
1578 */
1579 tln->tln_expiretime = getsbinuptime();
1580 TCPLOG_EXPIREQ_LOCK();
1581 if (tln->tln_count) {
1582 tln->tln_expiretime += TCP_LOG_EXPIRE_TIME;
1583 if (STAILQ_EMPTY(&tcp_log_expireq_head) &&
1584 !callout_active(&tcp_log_expireq_callout)) {
1585 /*
1586 * We are adding the first entry and a callout
1587 * is not currently scheduled; therefore, we
1588 * need to schedule one.
1589 */
1590 callout_reset_sbt(&tcp_log_expireq_callout,
1591 tln->tln_expiretime, SBT_1S, tcp_log_expire,
1592 NULL, C_ABSOLUTE);
1593 }
1594 STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln,
1595 tln_expireq);
1596 } else {
1597 callouttime = tln->tln_expiretime +
1598 TCP_LOG_EXPIRE_INTVL;
1599 tln_first = STAILQ_FIRST(&tcp_log_expireq_head);
1600
1601 if ((tln_first == NULL ||
1602 callouttime < tln_first->tln_expiretime) &&
1603 (callout_pending(&tcp_log_expireq_callout) ||
1604 !callout_active(&tcp_log_expireq_callout))) {
1605 /*
1606 * The list is empty, or we want to run the
1607 * expire code before the first entry's timer
1608 * fires. Also, we are in a case where a callout
1609 * is not actively running. We want to reset
1610 * the callout to occur sooner.
1611 */
1612 callout_reset_sbt(&tcp_log_expireq_callout,
1613 callouttime, SBT_1S, tcp_log_expire, NULL,
1614 C_ABSOLUTE);
1615 }
1616
1617 /*
1618 * Insert to the head, or just after the head, as
1619 * appropriate. (This might result in small
1620 * mis-orderings as a bunch of "expire now" entries
1621 * gather at the start of the list, but that should
1622 * not produce big problems, since the expire timer
1623 * will walk through all of them.)
1624 */
1625 if (tln_first == NULL ||
1626 tln->tln_expiretime < tln_first->tln_expiretime)
1627 STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln,
1628 tln_expireq);
1629 else
1630 STAILQ_INSERT_AFTER(&tcp_log_expireq_head,
1631 tln_first, tln, tln_expireq);
1632 }
1633 TCPLOG_EXPIREQ_UNLOCK();
1634
1635 /*
1636 * We are done messing with the tln. After this point, we
1637 * can't touch it. (Note that the "release" semantics should
1638 * be included with the TCPLOG_EXPIREQ_UNLOCK() call above.
1639 * Therefore, they should be unnecessary here. However, it
1640 * seems like a good idea to include them anyway, since we
1641 * really are releasing a lock here.)
1642 */
1643 atomic_store_rel_int(&tln->tln_closed, 1);
1644 } else {
1645 /* Remove log entries. */
1646 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1647 tcp_log_remove_log_head(tp, log_entry);
1648 KASSERT(tp->t_lognum == 0,
1649 ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
1650 __func__, tp->t_lognum));
1651 }
1652
1653 /*
1654 * Change the log state to off (just in case anything tries to sneak
1655 * in a last-minute log).
1656 */
1657 tp->_t_logstate = TCP_LOG_STATE_OFF;
1658 }
1659
1660 static void
1661 tcp_log_purge_tp_logbuf(struct tcpcb *tp)
1662 {
1663 struct tcp_log_mem *log_entry;
1664
1665 INP_WLOCK_ASSERT(tptoinpcb(tp));
1666 if (tp->t_lognum == 0)
1667 return;
1668
1669 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1670 tcp_log_remove_log_head(tp, log_entry);
1671 KASSERT(tp->t_lognum == 0,
1672 ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
1673 __func__, tp->t_lognum));
1674 tp->_t_logstate = TCP_LOG_STATE_OFF;
1675 }
1676
1677 /*
1678 * This logs an event for a TCP socket. Normally, this is called via
1679 * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for
1680 * TCP_LOG_EVENT().
1681 */
1682
1683 struct tcp_log_buffer *
1684 tcp_log_event(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
1685 struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
1686 union tcp_log_stackspecific *stackinfo, int th_hostorder,
1687 const char *output_caller, const char *func, int line, const struct timeval *itv)
1688 {
1689 struct tcp_log_mem *log_entry;
1690 struct tcp_log_buffer *log_buf;
1691 int attempt_count = 0;
1692 struct tcp_log_verbose *log_verbose;
1693 uint32_t logsn;
1694
1695 KASSERT((func == NULL && line == 0) || (func != NULL && line > 0),
1696 ("%s called with inconsistent func (%p) and line (%d) arguments",
1697 __func__, func, line));
1698
1699 INP_WLOCK_ASSERT(tptoinpcb(tp));
1700 if (tcp_disable_all_bb_logs) {
1701 /*
1702 * The global switch disabling all BB
1703 * logging has been thrown. Call the
1704 * purge function, which frees this
1705 * connection's log entries and turns
1706 * off logging for it.
1707 */
1708 tcp_log_purge_tp_logbuf(tp);
1709 return (NULL);
1710 }
1711 KASSERT(tp->_t_logstate == TCP_LOG_STATE_HEAD ||
1712 tp->_t_logstate == TCP_LOG_STATE_TAIL ||
1713 tp->_t_logstate == TCP_LOG_STATE_CONTINUAL ||
1714 tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
1715 tp->_t_logstate == TCP_LOG_VIA_BBPOINTS ||
1716 tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO,
1717 ("%s called with unexpected tp->_t_logstate (%d)", __func__,
1718 tp->_t_logstate));
1719
1720 /*
1721 * Get the serial number. We do this early so it will
1722 * increment even if we end up skipping the log entry for some
1723 * reason.
1724 */
1725 logsn = tp->t_logsn++;
1726
1727 /*
1728 * Can we get a new log entry? If so, increment the lognum counter
1729 * here.
1730 */
1731 retry:
1732 if (tp->t_lognum < tp->t_loglimit) {
1733 if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL)
1734 tp->t_lognum++;
1735 } else
1736 log_entry = NULL;
1737
1738 /* Do we need to try to reuse? */
1739 if (log_entry == NULL) {
1740 /*
1741 * Sacrifice auto-logged sessions without a log ID if
1742 * tcp_log_auto_all is false. (If they don't have a log
1743 * ID by now, it is probable that either they won't get one
1744 * or we are resource-constrained.)
1745 */
1746 if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
1747 !tcp_log_auto_all) {
1748 if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) {
1749 #ifdef INVARIANTS
1750 panic("%s:%d: tcp_log_state_change() failed "
1751 "to set tp %p to TCP_LOG_STATE_CLEAR",
1752 __func__, __LINE__, tp);
1753 #endif
1754 tp->_t_logstate = TCP_LOG_STATE_OFF;
1755 }
1756 return (NULL);
1757 }
1758 /*
1759 * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump
1760 * the buffers. If successful, deactivate tracing. Otherwise,
1761 * leave it active so we will retry.
1762 */
1763 if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
1764 !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
1765 M_NOWAIT, false)) {
1766 tp->_t_logstate = TCP_LOG_STATE_OFF;
1767 return (NULL);
1768 } else if ((tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) &&
1769 !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
1770 M_NOWAIT, false)) {
1771 if (attempt_count == 0) {
1772 attempt_count++;
1773 goto retry;
1774 }
1775 #ifdef TCPLOG_DEBUG_COUNTERS
1776 counter_u64_add(tcp_log_que_fail4, 1);
1777 #endif
1778 return (NULL);
1779
1780 } else if ((tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) &&
1781 !tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
1782 M_NOWAIT, false)) {
1783 if (attempt_count == 0) {
1784 attempt_count++;
1785 goto retry;
1786 }
1787 #ifdef TCPLOG_DEBUG_COUNTERS
1788 counter_u64_add(tcp_log_que_fail4, 1);
1789 #endif
1790 return (NULL);
1791 } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO)
1792 return (NULL);
1793
1794 /* If in HEAD state, just deactivate the tracing and return. */
1795 if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
1796 tp->_t_logstate = TCP_LOG_STATE_OFF;
1797 return (NULL);
1798 }
1799 /*
1800 * Get a buffer to reuse. If that fails, just give up.
1801 * (We can't log anything without a buffer in which to
1802 * put it.)
1803 *
1804 * Note that we don't change the t_lognum counter
1805 * here. Because we are re-using the buffer, the total
1806 * number won't change.
1807 */
1808 if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL)
1809 return (NULL);
1810 STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
1811 tcp_log_entry_refcnt_rem(log_entry);
1812 }
1813
1814 KASSERT(log_entry != NULL,
1815 ("%s: log_entry unexpectedly NULL", __func__));
1816
1817 /* Extract the log buffer and verbose buffer pointers. */
1818 log_buf = &log_entry->tlm_buf;
1819 log_verbose = &log_entry->tlm_v;
1820
1821 /* Basic entries. */
1822 if (itv == NULL)
1823 microuptime(&log_buf->tlb_tv);
1824 else
1825 memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
1826 log_buf->tlb_ticks = ticks;
1827 log_buf->tlb_sn = logsn;
1828 log_buf->tlb_stackid = tp->t_fb->tfb_id;
1829 log_buf->tlb_eventid = eventid;
1830 log_buf->tlb_eventflags = 0;
1831 log_buf->tlb_errno = errornum;
1832
1833 /* Socket buffers */
1834 if (rxbuf != NULL) {
1835 log_buf->tlb_eventflags |= TLB_FLAG_RXBUF;
1836 log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
1837 log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
1838 log_buf->tlb_rxbuf.tls_sb_spare = 0;
1839 } else {
1840 log_buf->tlb_rxbuf.tls_sb_acc = 0;
1841 log_buf->tlb_rxbuf.tls_sb_ccc = 0;
1842 }
1843 if (txbuf != NULL) {
1844 log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
1845 log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
1846 log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
1847 log_buf->tlb_txbuf.tls_sb_spare = 0;
1848 } else {
1849 log_buf->tlb_txbuf.tls_sb_acc = 0;
1850 log_buf->tlb_txbuf.tls_sb_ccc = 0;
1851 }
1852 /* Copy values from tp to the log entry. */
1853 log_buf->tlb_state = tp->t_state;
1854 log_buf->tlb_starttime = tp->t_starttime;
1855 log_buf->tlb_iss = tp->iss;
1856 log_buf->tlb_flags = tp->t_flags;
1857 log_buf->tlb_snd_una = tp->snd_una;
1858 log_buf->tlb_snd_max = tp->snd_max;
1859 log_buf->tlb_snd_cwnd = tp->snd_cwnd;
1860 log_buf->tlb_snd_nxt = tp->snd_nxt;
1861 log_buf->tlb_snd_recover = tp->snd_recover;
1862 log_buf->tlb_snd_wnd = tp->snd_wnd;
1863 log_buf->tlb_snd_ssthresh = tp->snd_ssthresh;
1864 log_buf->tlb_srtt = tp->t_srtt;
1865 log_buf->tlb_rttvar = tp->t_rttvar;
1866 log_buf->tlb_rcv_up = tp->rcv_up;
1867 log_buf->tlb_rcv_adv = tp->rcv_adv;
1868 log_buf->tlb_flags2 = tp->t_flags2;
1869 log_buf->tlb_rcv_nxt = tp->rcv_nxt;
1870 log_buf->tlb_rcv_wnd = tp->rcv_wnd;
1871 log_buf->tlb_dupacks = tp->t_dupacks;
1872 log_buf->tlb_segqlen = tp->t_segqlen;
1873 log_buf->tlb_snd_numholes = tp->snd_numholes;
1874 log_buf->tlb_flex1 = 0;
1875 log_buf->tlb_flex2 = 0;
1876 log_buf->tlb_fbyte_in = tp->t_fbyte_in;
1877 log_buf->tlb_fbyte_out = tp->t_fbyte_out;
1878 log_buf->tlb_snd_scale = tp->snd_scale;
1879 log_buf->tlb_rcv_scale = tp->rcv_scale;
1880 log_buf->_pad[0] = 0;
1881 log_buf->_pad[1] = 0;
1882 log_buf->_pad[2] = 0;
1883 /* Copy stack-specific info. */
1884 if (stackinfo != NULL) {
1885 memcpy(&log_buf->tlb_stackinfo, stackinfo,
1886 sizeof(log_buf->tlb_stackinfo));
1887 log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO;
1888 }
1889
1890 /* The packet */
1891 log_buf->tlb_len = len;
1892 if (th) {
1893 int optlen;
1894
1895 log_buf->tlb_eventflags |= TLB_FLAG_HDR;
1896 log_buf->tlb_th = *th;
1897 if (th_hostorder)
1898 tcp_fields_to_net(&log_buf->tlb_th);
1899 optlen = (th->th_off << 2) - sizeof (struct tcphdr);
1900 if (optlen > 0)
1901 memcpy(log_buf->tlb_opts, th + 1, optlen);
1902 } else {
1903 memset(&log_buf->tlb_th, 0, sizeof(*th));
1904 }
1905
1906 /* Verbose information */
1907 if (func != NULL) {
1908 log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE;
1909 if (output_caller != NULL)
1910 strlcpy(log_verbose->tlv_snd_frm, output_caller,
1911 TCP_FUNC_LEN);
1912 else
1913 *log_verbose->tlv_snd_frm = 0;
1914 strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN);
1915 log_verbose->tlv_trace_line = line;
1916 }
1917
1918 /* Insert the new log at the tail. */
1919 STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue);
1920 tcp_log_entry_refcnt_add(log_entry);
1921 return (log_buf);
1922 }
1923
1924 /*
1925 * Change the logging state for a TCPCB. Returns 0 on success or an
1926 * error code on failure.
1927 */
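/*
 * Specifically (see below): EINVAL for an unknown state, ECANCELED when
 * the associated log ID bucket has been probabilistically opted out of
 * logging, and EBUSY when tcp_disable_all_bb_logs prohibits all BB
 * logging.
 */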
1928 int
1929 tcp_log_state_change(struct tcpcb *tp, int state)
1930 {
1931 struct tcp_log_mem *log_entry;
1932 int rv;
1933
1934 INP_WLOCK_ASSERT(tptoinpcb(tp));
1935 rv = 0;
1936 switch(state) {
1937 case TCP_LOG_STATE_CLEAR:
1938 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1939 tcp_log_remove_log_head(tp, log_entry);
1940 /* FALLTHROUGH */
1941
1942 case TCP_LOG_STATE_OFF:
1943 tp->_t_logstate = TCP_LOG_STATE_OFF;
1944 break;
1945
1946 case TCP_LOG_STATE_TAIL:
1947 case TCP_LOG_STATE_HEAD:
1948 case TCP_LOG_STATE_CONTINUAL:
1949 case TCP_LOG_VIA_BBPOINTS:
1950 case TCP_LOG_STATE_HEAD_AUTO:
1951 case TCP_LOG_STATE_TAIL_AUTO:
1952 /*
1953 * When the RATIO_OFF state is set for the bucket, the log ID
1954 * this tp is associated with has been probabilistically opted
1955 * out of logging per tcp_log_apply_ratio().
1956 */
1957 if (tp->t_lib == NULL ||
1958 tp->t_lib->tlb_logstate != TCP_LOG_STATE_RATIO_OFF) {
1959 tp->_t_logstate = state;
1960 } else {
1961 rv = ECANCELED;
1962 tp->_t_logstate = TCP_LOG_STATE_OFF;
1963 }
1964 break;
1965
1966 default:
1967 return (EINVAL);
1968 }
1969 if (tcp_disable_all_bb_logs) {
1970 /* We are prohibited from doing any logs */
1971 tp->_t_logstate = TCP_LOG_STATE_OFF;
1972 rv = EBUSY;
1973 }
1974 tp->t_flags2 &= ~(TF2_LOG_AUTO);
1975
1976 return (rv);
1977 }
1978
1979 /* If tcp_drain() is called, flush half the log entries. */
1980 void
1981 tcp_log_drain(struct tcpcb *tp)
1982 {
1983 struct tcp_log_mem *log_entry, *next;
1984 int target, skip;
1985
1986 INP_WLOCK_ASSERT(tptoinpcb(tp));
1987 if ((target = tp->t_lognum / 2) == 0)
1988 return;
1989
1990 /*
1991 * XXXRRS: I don't think it is wise that we do this
1992 * at all. All that a drain call means is that we
1993 * are hitting one of the system mbuf limits. BB
1994 * logs, and freeing them, will not create any
1995 * more mbufs and really have nothing to do with
1996 * the system running out of mbufs. For now I
1997 * am changing this to free any "AUTO" sessions by
1998 * dumping them out. But this should either be
1999 * changed so that it gets called when we hit the
2000 * BB limit, or it should just not get called at
2001 * all (one of the two), since I don't think the
2002 * mbuf <-> BB log cleanup is the right thing to
2003 * do here.
2004 /*
2005 * If we are logging the "head" packets, we want to discard
2006 * from the tail of the queue. Otherwise, we want to discard
2007 * from the head.
2008 */
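/*
 * For example, with t_lognum == 8 the target is 4: in the HEAD case we
 * skip over the first four entries and free the four that follow, so the
 * oldest half of the log survives.
 */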
2009 if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
2010 skip = tp->t_lognum - target;
2011 STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue)
2012 if (!--skip)
2013 break;
2014 KASSERT(log_entry != NULL,
2015 ("%s: skipped through all entries!", __func__));
2016 if (log_entry == NULL)
2017 return;
2018 while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) {
2019 STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue);
2020 tcp_log_entry_refcnt_rem(next);
2021 tcp_log_remove_log_cleanup(tp, next);
2022 #ifdef INVARIANTS
2023 target--;
2024 #endif
2025 }
2026 KASSERT(target == 0,
2027 ("%s: After removing from tail, target was %d", __func__,
2028 target));
2029 } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
2030 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at drain",
2031 M_NOWAIT, false);
2032 } else if (tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO) {
2033 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at drain",
2034 M_NOWAIT, false);
2035 } else if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) {
2036 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
2037 M_NOWAIT, false);
2038 } else if (tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) {
2039 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
2040 M_NOWAIT, false);
2041 } else {
2042 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL &&
2043 target--)
2044 tcp_log_remove_log_head(tp, log_entry);
2045 KASSERT(target <= 0,
2046 ("%s: After removing from head, target was %d", __func__,
2047 target));
2048 KASSERT(tp->t_lognum > 0,
2049 ("%s: After removing from head, tp->t_lognum was %d",
2050 __func__, tp->t_lognum));
2051 KASSERT(log_entry != NULL,
2052 ("%s: After removing from head, the tailq was empty",
2053 __func__));
2054 }
2055 }
2056
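/*
 * Copy to the destination with copyout() when there is a thread context
 * (a real sockopt from userspace) or with bcopy() when sopt_td is NULL,
 * as when tcp_log_expandlogbuf() builds a record in a kernel buffer using
 * a synthetic sockopt.
 */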
2057 static inline int
2058 tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len)
2059 {
2060
2061 if (sopt->sopt_td != NULL)
2062 return (copyout(src, dst, len));
2063 bcopy(src, dst, len);
2064 return (0);
2065 }
2066
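/*
 * Copy a list of log entries to the buffer described by the sockopt. Each
 * record is a struct tcp_log_buffer; if the entry has TLB_FLAG_VERBOSE
 * set, a struct tcp_log_verbose immediately follows it, so consumers must
 * inspect the event flags to walk the output.
 */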
2067 static int
2068 tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp,
2069 struct tcp_log_buffer **end, int count)
2070 {
2071 struct tcp_log_buffer *out_entry;
2072 struct tcp_log_mem *log_entry;
2073 size_t entrysize;
2074 int error;
2075 #ifdef INVARIANTS
2076 int orig_count = count;
2077 #endif
2078
2079 /* Copy the data out. */
2080 error = 0;
2081 out_entry = (struct tcp_log_buffer *) sopt->sopt_val;
2082 STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) {
2083 count--;
2084 KASSERT(count >= 0,
2085 ("%s:%d: Exceeded expected count (%d) processing list %p",
2086 __func__, __LINE__, orig_count, log_tailqp));
2087
2088 #ifdef TCPLOG_DEBUG_COUNTERS
2089 counter_u64_add(tcp_log_que_copyout, 1);
2090 #endif
2091
2092 /*
2093 * Skip copying out the header if it isn't present.
2094 * Instead, copy out zeros (to ensure we don't leak info).
2095 * TODO: Make sure we truly do zero everything we don't
2096 * explicitly set.
2097 */
2098 if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)
2099 entrysize = sizeof(struct tcp_log_buffer);
2100 else
2101 entrysize = offsetof(struct tcp_log_buffer, tlb_th);
2102 error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry,
2103 entrysize);
2104 if (error)
2105 break;
2106 if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) {
2107 error = tcp_log_copyout(sopt, zerobuf,
2108 ((uint8_t *)out_entry) + entrysize,
2109 sizeof(struct tcp_log_buffer) - entrysize);
2110 }
2111
2112 /*
2113 * Copy out the verbose bit, if needed. Either way,
2114 * increment the output pointer the correct amount.
2115 */
2116 if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) {
2117 error = tcp_log_copyout(sopt, &log_entry->tlm_v,
2118 out_entry->tlb_verbose,
2119 sizeof(struct tcp_log_verbose));
2120 if (error)
2121 break;
2122 out_entry = (struct tcp_log_buffer *)
2123 (((uint8_t *) (out_entry + 1)) +
2124 sizeof(struct tcp_log_verbose));
2125 } else
2126 out_entry++;
2127 }
2128 *end = out_entry;
2129 KASSERT(error || count == 0,
2130 ("%s:%d: Less than expected count (%d) processing list %p"
2131 " (%d remain)", __func__, __LINE__, orig_count,
2132 log_tailqp, count));
2133
2134 return (error);
2135 }
2136
2137 /*
2138 * Copy out the buffer. Note that we do incremental copying, so
2139 * sooptcopyout() won't work. However, the goal is to produce the same
2140 * end result as if we copied in the entire user buffer, updated it,
2141 * and then used sooptcopyout() to copy it out.
2142 *
2143 * NOTE: This should be called with a write lock on the PCB; however,
2144 * the function will drop it after it extracts the data from the TCPCB.
2145 */
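/*
 * A caller can invoke this first with sopt->sopt_val == NULL to learn
 * approximately how much buffer space it will need (returned in
 * sopt_valsize), allocate that much, and then call again to retrieve and
 * consume the entries; see the handling of a NULL sopt_val below.
 */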
2146 int
2147 tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp)
2148 {
2149 struct tcp_log_stailq log_tailq;
2150 struct tcp_log_mem *log_entry, *log_next;
2151 struct tcp_log_buffer *out_entry;
2152 struct inpcb *inp = tptoinpcb(tp);
2153 size_t outsize, entrysize;
2154 int error, outnum;
2155
2156 INP_WLOCK_ASSERT(inp);
2157
2158 /*
2159 * Determine which log entries will fit in the buffer. As an
2160 * optimization, skip this if all the entries will clearly fit
2161 * in the buffer. (However, get an exact size if we are using
2162 * INVARIANTS.)
2163 */
2164 #ifndef INVARIANTS
2165 if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) +
2166 sizeof(struct tcp_log_verbose)) >= tp->t_lognum) {
2167 log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue);
2168 log_next = NULL;
2169 outsize = 0;
2170 outnum = tp->t_lognum;
2171 } else {
2172 #endif
2173 outsize = outnum = 0;
2174 log_entry = NULL;
2175 STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) {
2176 entrysize = sizeof(struct tcp_log_buffer);
2177 if (log_next->tlm_buf.tlb_eventflags &
2178 TLB_FLAG_VERBOSE)
2179 entrysize += sizeof(struct tcp_log_verbose);
2180 if ((sopt->sopt_valsize - outsize) < entrysize)
2181 break;
2182 outsize += entrysize;
2183 outnum++;
2184 log_entry = log_next;
2185 }
2186 KASSERT(outsize <= sopt->sopt_valsize,
2187 ("%s: calculated output size (%zu) greater than available"
2188 "space (%zu)", __func__, outsize, sopt->sopt_valsize));
2189 #ifndef INVARIANTS
2190 }
2191 #endif
2192
2193 /*
2194 * Copy traditional sooptcopyout() behavior: if sopt->sopt_val
2195 * is NULL, silently skip the copy. However, in this case, we
2196 * will leave the list alone and return. Functionally, this
2197 * gives userspace a way to poll for an approximate buffer
2198 * size they will need to get the log entries.
2199 */
2200 if (sopt->sopt_val == NULL) {
2201 INP_WUNLOCK(inp);
2202 if (outsize == 0) {
2203 outsize = outnum * (sizeof(struct tcp_log_buffer) +
2204 sizeof(struct tcp_log_verbose));
2205 }
2206 if (sopt->sopt_valsize > outsize)
2207 sopt->sopt_valsize = outsize;
2208 return (0);
2209 }
2210
2211 /*
2212 * Break apart the list. We'll save the ones we want to copy
2213 * out locally and remove them from the TCPCB list. We can
2214 * then drop the INPCB lock while we do the copyout.
2215 *
2216 * There are roughly three cases:
2217 * 1. There was nothing to copy out. That's easy: drop the
2218 * lock and return.
2219 * 2. We are copying out the entire list. Again, that's easy:
2220 * move the whole list.
2221 * 3. We are copying out a partial list. That's harder. We
2222 * need to update the list book-keeping entries.
2223 */
2224 if (log_entry != NULL && log_next == NULL) {
2225 /* Move entire list. */
2226 KASSERT(outnum == tp->t_lognum,
2227 ("%s:%d: outnum (%d) should match tp->t_lognum (%d)",
2228 __func__, __LINE__, outnum, tp->t_lognum));
2229 log_tailq = tp->t_logs;
2230 tp->t_lognum = 0;
2231 STAILQ_INIT(&tp->t_logs);
2232 } else if (log_entry != NULL) {
2233 /* Move partial list. */
2234 KASSERT(outnum < tp->t_lognum,
2235 ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)",
2236 __func__, __LINE__, outnum, tp->t_lognum));
2237 STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs);
2238 STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue);
2239 KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL,
2240 ("%s:%d: tp->t_logs is unexpectedly shorter than expected"
2241 "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)",
2242 __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum));
2243 STAILQ_NEXT(log_entry, tlm_queue) = NULL;
2244 log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue);
2245 tp->t_lognum -= outnum;
2246 } else
2247 STAILQ_INIT(&log_tailq);
2248
2249 /* Drop the PCB lock. */
2250 INP_WUNLOCK(inp);
2251
2252 /* Copy the data out. */
2253 error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum);
2254
2255 if (error) {
2256 /* Restore list */
2257 INP_WLOCK(inp);
2258 if ((inp->inp_flags & INP_DROPPED) == 0) {
2259 tp = intotcpcb(inp);
2260
2261 /* Merge the two lists. */
2262 STAILQ_CONCAT(&log_tailq, &tp->t_logs);
2263 tp->t_logs = log_tailq;
2264 tp->t_lognum += outnum;
2265 }
2266 INP_WUNLOCK(inp);
2267 } else {
2268 /* Sanity check entries */
2269 KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) ==
2270 outsize, ("%s: Actual output size (%zu) != "
2271 "calculated output size (%zu)", __func__,
2272 (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val),
2273 outsize));
2274
2275 /* Free the entries we just copied out. */
2276 STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) {
2277 tcp_log_entry_refcnt_rem(log_entry);
2278 uma_zfree(tcp_log_zone, log_entry);
2279 }
2280 }
2281
2282 sopt->sopt_valsize = (size_t)((caddr_t)out_entry -
2283 (caddr_t)sopt->sopt_val);
2284 return (error);
2285 }
2286
2287 static void
2288 tcp_log_free_queue(struct tcp_log_dev_queue *param)
2289 {
2290 struct tcp_log_dev_log_queue *entry;
2291
2292 KASSERT(param != NULL, ("%s: called with NULL param", __func__));
2293 if (param == NULL)
2294 return;
2295
2296 entry = (struct tcp_log_dev_log_queue *)param;
2297
2298 /* Free the entries. */
2299 tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
2300
2301 /* Free the buffer, if it is allocated. */
2302 if (entry->tldl_common.tldq_buf != NULL)
2303 free(entry->tldl_common.tldq_buf, M_TCPLOGDEV);
2304
2305 /* Free the queue entry. */
2306 free(entry, M_TCPLOGDEV);
2307 }
2308
2309 static struct tcp_log_common_header *
2310 tcp_log_expandlogbuf(struct tcp_log_dev_queue *param)
2311 {
2312 struct tcp_log_dev_log_queue *entry;
2313 struct tcp_log_header *hdr;
2314 uint8_t *end;
2315 struct sockopt sopt;
2316 int error;
2317
2318 entry = (struct tcp_log_dev_log_queue *)param;
2319
2320 /* Take a worst-case guess at space needs. */
2321 sopt.sopt_valsize = sizeof(struct tcp_log_header) +
2322 entry->tldl_count * (sizeof(struct tcp_log_buffer) +
2323 sizeof(struct tcp_log_verbose));
2324 hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT);
2325 if (hdr == NULL) {
2326 #ifdef TCPLOG_DEBUG_COUNTERS
2327 counter_u64_add(tcp_log_que_fail5, entry->tldl_count);
2328 #endif
2329 return (NULL);
2330 }
2331 sopt.sopt_val = hdr + 1;
2332 sopt.sopt_valsize -= sizeof(struct tcp_log_header);
2333 sopt.sopt_td = NULL;
2334
2335 error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries,
2336 (struct tcp_log_buffer **)&end, entry->tldl_count);
2337 if (error) {
2338 free(hdr, M_TCPLOGDEV);
2339 return (NULL);
2340 }
2341
2342 /* Free the entries. */
2343 tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
2344 entry->tldl_count = 0;
2345
2346 memset(hdr, 0, sizeof(struct tcp_log_header));
2347 hdr->tlh_version = TCP_LOG_BUF_VER;
2348 hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR;
2349 hdr->tlh_length = end - (uint8_t *)hdr;
2350 hdr->tlh_ie = entry->tldl_ie;
2351 hdr->tlh_af = entry->tldl_af;
2352 getboottime(&hdr->tlh_offset);
2353 strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN);
2354 strlcpy(hdr->tlh_tag, entry->tldl_tag, TCP_LOG_TAG_LEN);
2355 strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN);
2356 return ((struct tcp_log_common_header *)hdr);
2357 }
2358
2359 /*
2360 * Queue the tcpcb's log buffer for transmission via the log buffer facility.
2361 *
2362 * NOTE: This should be called with a write lock on the PCB.
2363 *
2364 * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
2365 * and reacquire the INP lock if it needs to do so.
2366 *
2367 * If force is false, this will only dump auto-logged sessions if
2368 * tcp_log_auto_all is true or if there is a log ID defined for the session.
2369 */
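/*
 * Returns 0 on success (including when there is nothing to dump), ENOBUFS
 * if memory could not be allocated under M_NOWAIT, or ECONNRESET if the
 * connection went away while the lock was dropped for an M_WAITOK
 * allocation.
 */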
2370 int
2371 tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force)
2372 {
2373 struct tcp_log_dev_log_queue *entry;
2374 struct inpcb *inp = tptoinpcb(tp);
2375 #ifdef TCPLOG_DEBUG_COUNTERS
2376 int num_entries;
2377 #endif
2378
2379 INP_WLOCK_ASSERT(inp);
2380
2381 /* If there are no log entries, there is nothing to do. */
2382 if (tp->t_lognum == 0)
2383 return (0);
2384
2385 /* Check for a log ID. */
2386 if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
2387 !tcp_log_auto_all && !force) {
2388 struct tcp_log_mem *log_entry;
2389
2390 /*
2391 * We needed a log ID and none was found. Free the log entries
2392 * and return success. Also, cancel further logging. If the
2393 * session doesn't have a log ID by now, we'll assume it isn't
2394 * going to get one.
2395 */
2396 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
2397 tcp_log_remove_log_head(tp, log_entry);
2398 KASSERT(tp->t_lognum == 0,
2399 ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
2400 __func__, tp->t_lognum));
2401 tp->_t_logstate = TCP_LOG_STATE_OFF;
2402 return (0);
2403 }
2404
2405 /*
2406 * Allocate memory. If we must wait, we'll need to drop the locks
2407 * and reacquire them (and do all the related business that goes
2408 * along with that).
2409 */
2410 entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
2411 M_NOWAIT);
2412 if (entry == NULL && (how & M_NOWAIT)) {
2413 #ifdef TCPLOG_DEBUG_COUNTERS
2414 counter_u64_add(tcp_log_que_fail3, 1);
2415 #endif
2416 return (ENOBUFS);
2417 }
2418 if (entry == NULL) {
2419 INP_WUNLOCK(inp);
2420 entry = malloc(sizeof(struct tcp_log_dev_log_queue),
2421 M_TCPLOGDEV, M_WAITOK);
2422 INP_WLOCK(inp);
2423 /*
2424 * Note that this check is slightly overly-restrictive in
2425 * that the TCB can survive this event.
2426 * However, there is currently not a good way to ensure
2427 * that is the case. So, if we hit this M_WAIT path, we
2428 * may end up dropping some entries. That seems like a
2429 * small price to pay for safety.
2430 */
2431 if (inp->inp_flags & INP_DROPPED) {
2432 free(entry, M_TCPLOGDEV);
2433 #ifdef TCPLOG_DEBUG_COUNTERS
2434 counter_u64_add(tcp_log_que_fail2, 1);
2435 #endif
2436 return (ECONNRESET);
2437 }
2438 tp = intotcpcb(inp);
2439 if (tp->t_lognum == 0) {
2440 free(entry, M_TCPLOGDEV);
2441 return (0);
2442 }
2443 }
2444
2445 /* Fill in the unique parts of the queue entry. */
2446 if (tp->t_lib != NULL) {
2447 strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
2448 strlcpy(entry->tldl_tag, tp->t_lib->tlb_tag, TCP_LOG_TAG_LEN);
2449 } else {
2450 strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN);
2451 strlcpy(entry->tldl_tag, "UNKNOWN", TCP_LOG_TAG_LEN);
2452 }
2453 if (reason != NULL)
2454 strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
2455 else
2456 strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
2457 entry->tldl_ie = inp->inp_inc.inc_ie;
2458 if (inp->inp_inc.inc_flags & INC_ISIPV6)
2459 entry->tldl_af = AF_INET6;
2460 else
2461 entry->tldl_af = AF_INET;
2462 entry->tldl_entries = tp->t_logs;
2463 entry->tldl_count = tp->t_lognum;
2464
2465 /* Fill in the common parts of the queue entry. */
2466 entry->tldl_common.tldq_buf = NULL;
2467 entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
2468 entry->tldl_common.tldq_dtor = tcp_log_free_queue;
2469
2470 /* Clear the log data from the TCPCB. */
2471 #ifdef TCPLOG_DEBUG_COUNTERS
2472 num_entries = tp->t_lognum;
2473 #endif
2474 tp->t_lognum = 0;
2475 STAILQ_INIT(&tp->t_logs);
2476
2477 /* Add the entry. If no one is listening, free the entry. */
2478 if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) {
2479 tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
2480 #ifdef TCPLOG_DEBUG_COUNTERS
2481 counter_u64_add(tcp_log_que_fail1, num_entries);
2482 } else {
2483 counter_u64_add(tcp_log_queued, num_entries);
2484 #endif
2485 }
2486 return (0);
2487 }
2488
2489 /*
2490 * Queue the log_id_node's log buffers for transmission via the log buffer
2491 * facility.
2492 *
2493 * NOTE: This should be called with the bucket locked and referenced.
2494 *
2495 * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
2496 * and reacquire the bucket lock if it needs to do so. (The caller must
2497 * ensure that the tln is no longer on any lists so no one else will mess
2498 * with this while the lock is dropped!)
2499 */
2500 static int
2501 tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how)
2502 {
2503 struct tcp_log_dev_log_queue *entry;
2504 struct tcp_log_id_bucket *tlb;
2505
2506 tlb = tln->tln_bucket;
2507 TCPID_BUCKET_LOCK_ASSERT(tlb);
2508 KASSERT(tlb->tlb_refcnt > 0,
2509 ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)",
2510 __func__, __LINE__, tln, tlb));
2511 KASSERT(tln->tln_closed,
2512 ("%s:%d: Called for node with tln_closed==false (tln=%p)",
2513 __func__, __LINE__, tln));
2514
2515 /* If there are no log entries, there is nothing to do. */
2516 if (tln->tln_count == 0)
2517 return (0);
2518
2519 /*
2520 * Allocate memory. If we must wait, we'll need to drop the locks
2521 * and reacquire them (and do all the related business that goes
2522 * along with that).
2523 */
2524 entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
2525 M_NOWAIT);
2526 if (entry == NULL && (how & M_NOWAIT))
2527 return (ENOBUFS);
2528 if (entry == NULL) {
2529 TCPID_BUCKET_UNLOCK(tlb);
2530 entry = malloc(sizeof(struct tcp_log_dev_log_queue),
2531 M_TCPLOGDEV, M_WAITOK);
2532 TCPID_BUCKET_LOCK(tlb);
2533 }
2534
2535 /* Fill in the common parts of the queue entry. */
2536 entry->tldl_common.tldq_buf = NULL;
2537 entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
2538 entry->tldl_common.tldq_dtor = tcp_log_free_queue;
2539
2540 /* Fill in the unique parts of the queue entry. */
2541 strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN);
2542 strlcpy(entry->tldl_tag, tlb->tlb_tag, TCP_LOG_TAG_LEN);
2543 if (reason != NULL)
2544 strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
2545 else
2546 strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
2547 entry->tldl_ie = tln->tln_ie;
2548 entry->tldl_entries = tln->tln_entries;
2549 entry->tldl_count = tln->tln_count;
2550 entry->tldl_af = tln->tln_af;
2551
2552 /* Add the entry. If no one is listening, free the entry. */
2553 if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry))
2554 tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
2555
2556 return (0);
2557 }
2558
2559 /*
2560 * Queue the log buffers for all sessions in a bucket for transmission via
2561 * the log buffer facility.
2562 *
2563 * NOTE: This should be called with a locked bucket; however, the function
2564 * will drop the lock.
2565 */
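/*
 * LOCAL_SAVE bounds how many log ID nodes we are willing to stash on the
 * stack if we have to drop locks while dumping a bucket; anything beyond
 * that is sacrificed (see the comments in tcp_log_dumpbucketlogs()).
 */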
2566 #define LOCAL_SAVE 10
2567 static void
2568 tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason)
2569 {
2570 struct tcp_log_id_node local_entries[LOCAL_SAVE];
2571 struct inpcb *inp;
2572 struct tcpcb *tp;
2573 struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln;
2574 int i, num_local_entries, tree_locked;
2575 bool expireq_locked;
2576
2577 TCPID_BUCKET_LOCK_ASSERT(tlb);
2578
2579 /*
2580 * Take a reference on the bucket to keep it from disappearing until
2581 * we are done.
2582 */
2583 TCPID_BUCKET_REF(tlb);
2584
2585 /*
2586 * We'll try to create these without dropping locks. However, we
2587 * might very well need to drop locks to get memory. If that's the
2588 * case, we'll save up to 10 entries on the stack, and sacrifice the
2589 * rest. (Otherwise, we would need to worry about finding our place
2590 * again in a potentially changed list. It just doesn't seem worth
2591 * the trouble to do that.)
2592 */
2593 expireq_locked = false;
2594 num_local_entries = 0;
2595 prev_tln = NULL;
2596 tree_locked = TREE_UNLOCKED;
2597 SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) {
2598 /*
2599 * If this isn't associated with a TCPCB, we can pull it off
2600 * the list now. We need to be careful that the expire timer
2601 * hasn't already taken ownership (tln_expiretime == SBT_MAX).
2602 * If so, we let the expire timer code free the data.
2603 */
2604 if (cur_tln->tln_closed) {
2605 no_inp:
2606 /*
2607 * Get the expireq lock so we can get a consistent
2608 * read of tln_expiretime and so we can remove this
2609 * from the expireq.
2610 */
2611 if (!expireq_locked) {
2612 TCPLOG_EXPIREQ_LOCK();
2613 expireq_locked = true;
2614 }
2615
2616 /*
2617 * We ignore entries with tln_expiretime == SBT_MAX.
2618 * The expire timer code already owns those.
2619 */
2620 KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0,
2621 ("%s:%d: node on the expire queue without positive "
2622 "expire time", __func__, __LINE__));
2623 if (cur_tln->tln_expiretime == SBT_MAX) {
2624 prev_tln = cur_tln;
2625 continue;
2626 }
2627
2628 /* Remove the entry from the expireq. */
2629 STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln,
2630 tcp_log_id_node, tln_expireq);
2631
2632 /* Remove the entry from the bucket. */
2633 if (prev_tln != NULL)
2634 SLIST_REMOVE_AFTER(prev_tln, tln_list);
2635 else
2636 SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list);
2637
2638 /*
2639 * Drop the INP and bucket reference counts. Due to
2640 * lock-ordering rules, we need to drop the expire
2641 * queue lock.
2642 */
2643 TCPLOG_EXPIREQ_UNLOCK();
2644 expireq_locked = false;
2645
2646 /* Drop the INP reference. */
2647 INP_WLOCK(cur_tln->tln_inp);
2648 if (!in_pcbrele_wlocked(cur_tln->tln_inp))
2649 INP_WUNLOCK(cur_tln->tln_inp);
2650
2651 if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
2652 #ifdef INVARIANTS
2653 panic("%s: Bucket refcount unexpectedly 0.",
2654 __func__);
2655 #endif
2656 /*
2657 * Recover as best we can: free the entry we
2658 * own.
2659 */
2660 tcp_log_free_entries(&cur_tln->tln_entries,
2661 &cur_tln->tln_count);
2662 uma_zfree(tcp_log_id_node_zone, cur_tln);
2663 goto done;
2664 }
2665
2666 if (tcp_log_dump_node_logbuf(cur_tln, reason,
2667 M_NOWAIT)) {
2668 /*
2669 * If we have space, save the entries locally.
2670 * Otherwise, free them.
2671 */
2672 if (num_local_entries < LOCAL_SAVE) {
2673 local_entries[num_local_entries] =
2674 *cur_tln;
2675 num_local_entries++;
2676 } else {
2677 tcp_log_free_entries(
2678 &cur_tln->tln_entries,
2679 &cur_tln->tln_count);
2680 }
2681 }
2682
2683 /* No matter what, we are done with the node now. */
2684 uma_zfree(tcp_log_id_node_zone, cur_tln);
2685
2686 /*
2687 * Because we removed this entry from the list, prev_tln
2688 * (which tracks the previous entry still on the tlb
2689 * list) remains unchanged.
2690 */
2691 continue;
2692 }
2693
2694 /*
2695 * If we get to this point, the session data is still held in
2696 * the TCPCB. So, we need to pull the data out of that.
2697 *
2698 * We will need to drop the expireq lock so we can lock the INP.
2699 * We can then try to extract the data the "easy" way. If that
2700 * fails, we'll save the log entries for later.
2701 */
2702 if (expireq_locked) {
2703 TCPLOG_EXPIREQ_UNLOCK();
2704 expireq_locked = false;
2705 }
2706
2707 /* Lock the INP and then re-check the state. */
2708 inp = cur_tln->tln_inp;
2709 INP_WLOCK(inp);
2710 /*
2711 * If we caught this while it was transitioning, the data
2712 * might have moved from the TCPCB to the tln (signified by
2713 * setting tln_closed to true). If so, treat this like an
2714 * inactive connection.
2715 */
2716 if (cur_tln->tln_closed) {
2717 /*
2718 * It looks like we may have caught this connection
2719 * while it was transitioning from active to inactive.
2720 * Treat this like an inactive connection.
2721 */
2722 INP_WUNLOCK(inp);
2723 goto no_inp;
2724 }
2725
2726 /*
2727 * Try to dump the data from the tp without dropping the lock.
2728 * If this fails, try to save off the data locally.
2729 */
2730 tp = cur_tln->tln_tp;
2731 if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) &&
2732 num_local_entries < LOCAL_SAVE) {
2733 tcp_log_move_tp_to_node(tp,
2734 &local_entries[num_local_entries]);
2735 local_entries[num_local_entries].tln_closed = 1;
2736 KASSERT(local_entries[num_local_entries].tln_bucket ==
2737 tlb, ("%s: %d: bucket mismatch for node %p",
2738 __func__, __LINE__, cur_tln));
2739 num_local_entries++;
2740 }
2741
2742 INP_WUNLOCK(inp);
2743
2744 /*
2745 * We are going to leave the current tln on the list. It will
2746 * become the previous tln.
2747 */
2748 prev_tln = cur_tln;
2749 }
2750
2751 /* Drop our locks, if any. */
2752 KASSERT(tree_locked == TREE_UNLOCKED,
2753 ("%s: %d: tree unexpectedly locked", __func__, __LINE__));
2754 switch (tree_locked) {
2755 case TREE_WLOCKED:
2756 TCPID_TREE_WUNLOCK();
2757 tree_locked = TREE_UNLOCKED;
2758 break;
2759 case TREE_RLOCKED:
2760 TCPID_TREE_RUNLOCK();
2761 tree_locked = TREE_UNLOCKED;
2762 break;
2763 }
2764 if (expireq_locked) {
2765 TCPLOG_EXPIREQ_UNLOCK();
2766 expireq_locked = false;
2767 }
2768
2769 /*
2770 * Try again for any saved entries. tcp_log_dump_node_logbuf() is
2771 * guaranteed to free the log entries within the node. And, since
2772 * the node itself is on our stack, we don't need to free it.
2773 */
2774 for (i = 0; i < num_local_entries; i++)
2775 tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK);
2776
2777 /* Drop our reference. */
2778 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
2779 TCPID_BUCKET_UNLOCK(tlb);
2780
2781 done:
2782 /* Drop our locks, if any. */
2783 switch (tree_locked) {
2784 case TREE_WLOCKED:
2785 TCPID_TREE_WUNLOCK();
2786 break;
2787 case TREE_RLOCKED:
2788 TCPID_TREE_RUNLOCK();
2789 break;
2790 }
2791 if (expireq_locked)
2792 TCPLOG_EXPIREQ_UNLOCK();
2793 }
2794 #undef LOCAL_SAVE
2795
2796 /*
2797 * Queue the log buffers for all sessions in a bucket for transmission via
2798 * the log buffer facility.
2799 *
2800 * NOTE: This should be called with a locked INP; however, the function
2801 * will drop the lock.
2802 */
2803 void
2804 tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason)
2805 {
2806 struct inpcb *inp = tptoinpcb(tp);
2807 struct tcp_log_id_bucket *tlb;
2808 int tree_locked;
2809
2810 /* Figure out our bucket and lock it. */
2811 INP_WLOCK_ASSERT(inp);
2812 tlb = tp->t_lib;
2813 if (tlb == NULL) {
2814 /*
2815 * No bucket; treat this like a request to dump a single
2816 * session's traces.
2817 */
2818 (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true);
2819 INP_WUNLOCK(inp);
2820 return;
2821 }
2822 TCPID_BUCKET_REF(tlb);
2823 INP_WUNLOCK(inp);
2824 TCPID_BUCKET_LOCK(tlb);
2825
2826 /* If we are the last reference, we have nothing more to do here. */
2827 tree_locked = TREE_UNLOCKED;
2828 if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
2829 switch (tree_locked) {
2830 case TREE_WLOCKED:
2831 TCPID_TREE_WUNLOCK();
2832 break;
2833 case TREE_RLOCKED:
2834 TCPID_TREE_RUNLOCK();
2835 break;
2836 }
2837 return;
2838 }
2839
2840 /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */
2841 tcp_log_dumpbucketlogs(tlb, reason);
2842 }
2843
2844 /*
2845 * Mark the end of a flow with the current stack. A stack can add
2846 * stack-specific info to this trace event by overriding this
2847 * function (see bbr_log_flowend() for example).
2848 */
2849 void
2850 tcp_log_flowend(struct tcpcb *tp)
2851 {
2852 if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
2853 struct socket *so = tptosocket(tp);
2854 TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd,
2855 TCP_LOG_FLOWEND, 0, 0, NULL, false);
2856 }
2857 }
2858
2859 void
2860 tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
2861 {
2862 struct inpcb *inp;
2863 struct tcpcb *tp;
2864 #ifdef TCP_REQUEST_TRK
2865 struct tcp_sendfile_track *ent;
2866 int i, fnd;
2867 #endif
2868
2869 inp = sotoinpcb(so);
2870 KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
2871
2872 /* quick check to see if logging is enabled for this connection */
2873 tp = intotcpcb(inp);
2874 if ((inp->inp_flags & INP_DROPPED) ||
2875 (tp->_t_logstate == TCP_LOG_STATE_OFF)) {
2876 return;
2877 }
2878
2879 INP_WLOCK(inp);
2880 /* double check log state now that we have the lock */
2881 if (inp->inp_flags & INP_DROPPED)
2882 goto done;
2883 if (tcp_bblogging_on(tp)) {
2884 struct timeval tv;
2885 tcp_log_eventspecific_t log;
2886
2887 memset(&log, 0, sizeof(log));
2888 microuptime(&tv);
2889 log.u_sf.offset = offset;
2890 log.u_sf.length = nbytes;
2891 log.u_sf.flags = flags;
2892
2893 TCP_LOG_EVENTP(tp, NULL,
2894 &tptosocket(tp)->so_rcv,
2895 &tptosocket(tp)->so_snd,
2896 TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
2897 }
2898 #ifdef TCP_REQUEST_TRK
2899 if (tp->t_tcpreq_req == 0) {
2900 /* No http requests to track */
2901 goto done;
2902 }
2903 fnd = 0;
2904 if (tp->t_tcpreq_closed == 0) {
2905 /* No closed end req to track */
2906 goto skip_closed_req;
2907 }
2908 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
2909 /* Let's see if this one can be found */
2910 ent = &tp->t_tcpreq_info[i];
2911 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) {
2912 /* Not used */
2913 continue;
2914 }
2915 if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) {
2916 /* This pass does not consider open requests */
2917 continue;
2918 }
2919 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) {
2920 /* Don't look at what we have completed */
2921 continue;
2922 }
2923 /* If we reach here it is an allocated closed-end request */
2924 if ((ent->start == offset) ||
2925 ((offset > ent->start) && (offset < ent->end))) {
2926 /* It is within this request */
2927 fnd = 1;
2928 }
2929 if (fnd) {
2930 /*
2931 * Mark the sequence numbers valid and check for completion.
2932 */
2933 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2934 /*
2935 * When an entry completes, we can take (snd_una + sb_ccc) and know
2936 * where the end of the range really is. This works because requests
2937 * must be sequential and sendfile is now complete for *this* request.
2938 * We must use sb_ccc since the data may still be in flight in TLS.
2939 *
2940 * We always move end_seq cautiously, only when our calculations show
2941 * it advanced (just in case sendfile calls into here at the wrong
2942 * place). Once we go COMP we will stop coming here and hopefully be
2943 * left with the correct end_seq.
2944 */
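/*
 * For example, with snd_una == 1000 and sb_ccc == 500, the data written
 * so far ends at sequence number 1500, so (per the reasoning above)
 * end_seq can safely be advanced to 1500 if it is currently lower.
 */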
2945 if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq))
2946 ent->end_seq = tp->snd_una + so->so_snd.sb_ccc;
2947 if ((offset + nbytes) >= ent->end) {
2948 ent->flags |= TCP_TRK_TRACK_FLG_COMP;
2949 tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_COMPLETE, offset, nbytes);
2950 } else {
2951 tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_MOREYET, offset, nbytes);
2952 }
2953 /* We assume that sendfile never sends overlapping requests */
2954 goto done;
2955 }
2956 }
2957 skip_closed_req:
2958 if (!fnd) {
2959 /* OK, now let's look for open requests */
2960 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
2961 ent = &tp->t_tcpreq_info[i];
2962 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) {
2963 /* Not used */
2964 continue;
2965 }
2966 if ((ent->flags & TCP_TRK_TRACK_FLG_OPEN) == 0)
2967 continue;
2968 /* If we reach here it is an allocated open request */
2969 if (ent->start == offset) {
2970 /* It begins this request */
2971 ent->start_seq = tp->snd_una +
2972 tptosocket(tp)->so_snd.sb_ccc;
2973 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2974 break;
2975 } else if (offset > ent->start) {
2976 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2977 break;
2978 }
2979 }
2980 }
2981 #endif
2982 done:
2983 INP_WUNLOCK(inp);
2984 }
2985
2986 #ifdef DDB
2987 static void
2988 db_print_indent(int indent)
2989 {
2990 int i;
2991
2992 for (i = 0; i < indent; i++)
2993 db_printf(" ");
2994 }
2995
2996 static void
2997 db_print_tcphdr(struct tcp_log_buffer *tlm_buf)
2998 {
2999 struct sackblk sack;
3000 struct tcphdr *th;
3001 int cnt, i, j, opt, optlen, num_sacks;
3002 uint32_t val, ecr;
3003 uint16_t mss;
3004 uint16_t flags;
3005
3006 if ((tlm_buf->tlb_eventflags & TLB_FLAG_HDR) == 0) {
3007 return;
3008 }
3009 th = &tlm_buf->tlb_th;
3010 flags = tcp_get_flags(th);
3011 if (flags & TH_FIN) {
3012 db_printf("F");
3013 }
3014 if (flags & TH_SYN) {
3015 db_printf("S");
3016 }
3017 if (flags & TH_RST) {
3018 db_printf("R");
3019 }
3020 if (flags & TH_PUSH) {
3021 db_printf("P");
3022 }
3023 if (flags & TH_ACK) {
3024 db_printf(".");
3025 }
3026 if (flags & TH_URG) {
3027 db_printf("U");
3028 }
3029 if (flags & TH_ECE) {
3030 db_printf("E");
3031 }
3032 if (flags & TH_CWR) {
3033 db_printf("W");
3034 }
3035 if (flags & TH_AE) {
3036 db_printf("A");
3037 }
3038 db_printf(" %u:%u(%u)", ntohl(th->th_seq),
3039 ntohl(th->th_seq) + tlm_buf->tlb_len, tlm_buf->tlb_len);
3040 if (flags & TH_ACK) {
3041 db_printf(" ack %u", ntohl(th->th_ack));
3042 }
3043 db_printf(" win %u", ntohs(th->th_win));
3044 if (flags & TH_URG) {
3045 db_printf(" urg %u", ntohs(th->th_urp));
3046 }
3047 cnt = (th->th_off << 2) - sizeof(struct tcphdr);
3048 if (cnt > 0) {
3049 db_printf(" <");
3050 for (i = 0; i < cnt; i += optlen) {
3051 opt = tlm_buf->tlb_opts[i];
3052 if (opt == TCPOPT_EOL || opt == TCPOPT_NOP) {
3053 optlen = 1;
3054 } else {
3055 if (cnt - i < 2) {
3056 break;
3057 }
3058 optlen = tlm_buf->tlb_opts[i + 1];
3059 if (optlen < 2 || optlen > cnt - i) {
3060 break;
3061 }
3062 }
3063 if (i > 0) {
3064 db_printf(",");
3065 }
3066 switch (opt) {
3067 case TCPOPT_EOL:
3068 db_printf("eol");
3069 break;
3070 case TCPOPT_NOP:
3071 db_printf("nop");
3072 break;
3073 case TCPOPT_MAXSEG:
3074 if (optlen != TCPOLEN_MAXSEG) {
3075 break;
3076 }
3077 bcopy(tlm_buf->tlb_opts + i + 2, &mss,
3078 sizeof(uint16_t));
3079 db_printf("mss %u", ntohs(mss));
3080 break;
3081 case TCPOPT_WINDOW:
3082 if (optlen != TCPOLEN_WINDOW) {
3083 break;
3084 }
3085 db_printf("wscale %u",
3086 tlm_buf->tlb_opts[i + 2]);
3087 break;
3088 case TCPOPT_SACK_PERMITTED:
3089 if (optlen != TCPOLEN_SACK_PERMITTED) {
3090 break;
3091 }
3092 db_printf("sackOK");
3093 break;
3094 case TCPOPT_SACK:
3095 if (optlen == TCPOLEN_SACKHDR ||
3096 (optlen - 2) % TCPOLEN_SACK != 0) {
3097 break;
3098 }
3099 num_sacks = (optlen - 2) / TCPOLEN_SACK;
3100 db_printf("sack");
3101 for (j = 0; j < num_sacks; j++) {
3102 bcopy(tlm_buf->tlb_opts + i + 2 +
3103 j * TCPOLEN_SACK, &sack,
3104 TCPOLEN_SACK);
3105 db_printf(" %u:%u", ntohl(sack.start),
3106 ntohl(sack.end));
3107 }
3108 break;
3109 case TCPOPT_TIMESTAMP:
3110 if (optlen != TCPOLEN_TIMESTAMP) {
3111 break;
3112 }
3113 bcopy(tlm_buf->tlb_opts + i + 2, &val,
3114 sizeof(uint32_t));
3115 bcopy(tlm_buf->tlb_opts + i + 6, &ecr,
3116 sizeof(uint32_t));
3117 db_printf("TS val %u ecr %u", ntohl(val),
3118 ntohl(ecr));
3119 break;
3120 case TCPOPT_SIGNATURE:
3121 db_printf("md5");
3122 if (optlen > 2) {
3123 db_printf(" ");
3124 }
3125 for (j = 0; j < optlen - 2; j++) {
3126 db_printf("%02x",
3127 tlm_buf->tlb_opts[i + 2 + j]);
3128 }
3129 break;
3130 case TCPOPT_FAST_OPEN:
3131 db_printf("FO");
3132 if (optlen > 2) {
3133 db_printf(" ");
3134 }
3135 for (j = 0; j < optlen - 2; j++) {
3136 db_printf("%02x",
3137 tlm_buf->tlb_opts[i + 2 + j]);
3138 }
3139 break;
3140 default:
3141 db_printf("opt=%u len=%u", opt, optlen);
3142 break;
3143 }
3144 }
3145 db_printf(">");
3146 }
3147 }

3148 static void
3149 db_print_pru(struct tcp_log_buffer *tlm_buf)
3150 {
3151 switch (tlm_buf->tlb_flex1) {
3152 case PRU_ATTACH:
3153 db_printf("ATTACH");
3154 break;
3155 case PRU_DETACH:
3156 db_printf("DETACH");
3157 break;
3158 case PRU_BIND:
3159 db_printf("BIND");
3160 break;
3161 case PRU_LISTEN:
3162 db_printf("LISTEN");
3163 break;
3164 case PRU_CONNECT:
3165 db_printf("CONNECT");
3166 break;
3167 case PRU_ACCEPT:
3168 db_printf("ACCEPT");
3169 break;
3170 case PRU_DISCONNECT:
3171 db_printf("DISCONNECT");
3172 break;
3173 case PRU_SHUTDOWN:
3174 db_printf("SHUTDOWN");
3175 break;
3176 case PRU_RCVD:
3177 db_printf("RCVD");
3178 break;
3179 case PRU_SEND:
3180 db_printf("SEND");
3181 break;
3182 case PRU_ABORT:
3183 db_printf("ABORT");
3184 break;
3185 case PRU_CONTROL:
3186 db_printf("CONTROL");
3187 break;
3188 case PRU_SENSE:
3189 db_printf("SENSE");
3190 break;
3191 case PRU_RCVOOB:
3192 db_printf("RCVOOB");
3193 break;
3194 case PRU_SENDOOB:
3195 db_printf("SENDOOB");
3196 break;
3197 case PRU_SOCKADDR:
3198 db_printf("SOCKADDR");
3199 break;
3200 case PRU_PEERADDR:
3201 db_printf("PEERADDR");
3202 break;
3203 case PRU_CONNECT2:
3204 db_printf("CONNECT2");
3205 break;
3206 case PRU_FASTTIMO:
3207 db_printf("FASTTIMO");
3208 break;
3209 case PRU_SLOWTIMO:
3210 db_printf("SLOWTIMO");
3211 break;
3212 case PRU_PROTORCV:
3213 db_printf("PROTORCV");
3214 break;
3215 case PRU_PROTOSEND:
3216 db_printf("PROTOSEND");
3217 break;
3218 case PRU_SEND_EOF:
3219 db_printf("SEND_EOF");
3220 break;
3221 case PRU_SOSETLABEL:
3222 db_printf("SOSETLABEL");
3223 break;
3224 case PRU_CLOSE:
3225 db_printf("CLOSE");
3226 break;
3227 case PRU_FLUSH:
3228 db_printf("FLUSH");
3229 break;
3230 default:
3231 db_printf("Unknown PRU (%u)", tlm_buf->tlb_flex1);
3232 break;
3233 }
3234 if (tlm_buf->tlb_errno >= 0) {
3235 db_printf(", error: %d", tlm_buf->tlb_errno);
3236 }
3237 }
3238
3239 static void
3240 db_print_rto(struct tcp_log_buffer *tlm_buf)
3241 {
3242 tt_what what;
3243 tt_which which;
3244
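/*
 * For timer events, tlb_flex1 packs the operation (tt_what) in the upper
 * bits and the timer type (tt_which) in the low 8 bits; for TT_STARTING
 * events, tlb_flex2 carries the timeout in milliseconds.
 */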
3245 what = (tlm_buf->tlb_flex1 & 0xffffff00) >> 8;
3246 which = tlm_buf->tlb_flex1 & 0x000000ff;
3247 switch (what) {
3248 case TT_PROCESSING:
3249 db_printf("Processing ");
3250 break;
3251 case TT_PROCESSED:
3252 db_printf("Processed ");
3253 break;
3254 case TT_STARTING:
3255 db_printf("Starting ");
3256 break;
3257 case TT_STOPPING:
3258 db_printf("Stopping ");
3259 break;
3260 default:
3261 db_printf("Unknown operation (%u) for ", what);
3262 break;
3263 }
3264 switch (which) {
3265 case TT_REXMT:
3266 db_printf("Retransmission ");
3267 break;
3268 case TT_PERSIST:
3269 db_printf("Persist ");
3270 break;
3271 case TT_KEEP:
3272 db_printf("Keepalive ");
3273 break;
3274 case TT_2MSL:
3275 db_printf("2 MSL ");
3276 break;
3277 case TT_DELACK:
3278 db_printf("Delayed ACK ");
3279 break;
3280 default:
3281 db_printf("Unknown (%u) ", which);
3282 break;
3283 }
3284 db_printf("timer");
3285 if (what == TT_STARTING) {
3286 db_printf(": %u ms", tlm_buf->tlb_flex2);
3287 }
3288 }
3289
3290 static void
3291 db_print_usersend(struct tcp_log_buffer *tlm_buf)
3292 {
3293 if ((tlm_buf->tlb_eventflags & TLB_FLAG_RXBUF) == 0) {
3294 return;
3295 }
3296 if ((tlm_buf->tlb_eventflags & TLB_FLAG_TXBUF) == 0) {
3297 return;
3298 }
3299 db_printf("usersend: rcv.acc: %u rcv.ccc: %u snd.acc: %u snd.ccc: %u",
3300 tlm_buf->tlb_rxbuf.tls_sb_acc, tlm_buf->tlb_rxbuf.tls_sb_ccc,
3301 tlm_buf->tlb_txbuf.tls_sb_acc, tlm_buf->tlb_txbuf.tls_sb_ccc);
3302 }
3303
3304 void
3305 db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent)
3306 {
3307 struct tcp_log_mem *log_entry;
3308 struct tcp_log_buffer *tlm_buf, *prev_tlm_buf;
3309 int64_t delta_t;
3310
3311 indent += 2;
3312 prev_tlm_buf = NULL;
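/*
 * Each entry is printed prefixed with the time since the previous entry
 * in seconds.milliseconds ("0.000" for the first entry).
 */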
3313 STAILQ_FOREACH(log_entry, log_entries, tlm_queue) {
3314 db_print_indent(indent);
3315 tlm_buf = &log_entry->tlm_buf;
3316 if (prev_tlm_buf == NULL) {
3317 db_printf(" 0.000 ");
3318 } else {
3319 delta_t = sbttoms(tvtosbt(tlm_buf->tlb_tv) -
3320 tvtosbt(prev_tlm_buf->tlb_tv));
3321 db_printf("+%u.%03u ", (uint32_t)(delta_t / 1000),
3322 (uint32_t)(delta_t % 1000));
3323 }
3324 switch (tlm_buf->tlb_eventid) {
3325 case TCP_LOG_IN:
3326 db_printf("< ");
3327 db_print_tcphdr(tlm_buf);
3328 break;
3329 case TCP_LOG_OUT:
3330 db_printf("> ");
3331 db_print_tcphdr(tlm_buf);
3332 break;
3333 case TCP_LOG_RTO:
3334 db_print_rto(tlm_buf);
3335 break;
3336 case TCP_LOG_PRU:
3337 db_print_pru(tlm_buf);
3338 break;
3339 case TCP_LOG_USERSEND:
3340 db_print_usersend(tlm_buf);
3341 break;
3342 default:
3343 break;
3344 }
3345 db_printf("\n");
3346 prev_tlm_buf = tlm_buf;
3347 if (db_pager_quit)
3348 break;
3349 }
3350 }
3351 #endif
3352