1
2 /*-
3 * SPDX-License-Identifier: BSD-2-Clause
4 *
5 * Copyright (c) 2016-2018 Netflix, Inc.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 */
29
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 #include "opt_ddb.h"
33 #include <sys/param.h>
34 #include <sys/arb.h>
35 #include <sys/hash.h>
36 #include <sys/kernel.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/qmath.h>
41 #include <sys/queue.h>
42 #include <sys/refcount.h>
43 #include <sys/rwlock.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #ifdef DDB
48 #include <sys/time.h>
49 #endif
50 #include <sys/tree.h>
51 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
52 #include <sys/counter.h>
53 #include <dev/tcp_log/tcp_log_dev.h>
54
55 #ifdef DDB
56 #include <ddb/ddb.h>
57 #endif
58
59 #include <net/if.h>
60 #include <net/if_var.h>
61 #include <net/vnet.h>
62
63 #include <netinet/in.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/in_var.h>
66 #include <netinet/tcp_var.h>
67 #include <netinet/tcp_log_buf.h>
68 #include <netinet/tcp_seq.h>
69 #include <netinet/tcp_hpts.h>
70
71 /* Default expiry time */
72 #define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S)
73
74 /* Max interval at which to run the expiry timer */
75 #define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S)
76
77 bool tcp_log_verbose;
78 static uma_zone_t tcp_log_id_bucket_zone, tcp_log_id_node_zone, tcp_log_zone;
79 static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT;
80 static uint32_t tcp_log_version = TCP_LOG_BUF_VER;
81 RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket);
82 static struct tcp_log_id_tree tcp_log_id_head;
83 static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head =
84 STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head);
85 static struct mtx tcp_log_expireq_mtx;
86 static struct callout tcp_log_expireq_callout;
87 static u_long tcp_log_auto_ratio = 0;
88 static volatile u_long tcp_log_auto_ratio_cur = 0;
89 static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL;
90 static bool tcp_log_auto_all = false;
91 static uint32_t tcp_disable_all_bb_logs = 0;
92
93 RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
94
95 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
96 "TCP Black Box controls");
97
98 SYSCTL_NODE(_net_inet_tcp_bb, OID_AUTO, tp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
99 "TCP Black Box Trace Point controls");
100
101 SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose,
102 0, "Force verbose logging for TCP traces");
103
104 SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit,
105 CTLFLAG_RW, &tcp_log_session_limit, 0,
106 "Maximum number of events maintained for each TCP session");
107
108 uint32_t tcp_trace_point_config = 0;
109 SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, number, CTLFLAG_RW,
110 &tcp_trace_point_config, TCP_LOG_STATE_HEAD_AUTO,
111 "What is the trace point number to activate (0=none, 0xffffffff = all)?");
112
113 uint32_t tcp_trace_point_bb_mode = TCP_LOG_STATE_CONTINUAL;
114 SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, bbmode, CTLFLAG_RW,
115 &tcp_trace_point_bb_mode, TCP_LOG_STATE_HEAD_AUTO,
116 "What is BB logging mode that is activated?");
117
118 int32_t tcp_trace_point_count = 0;
119 SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, count, CTLFLAG_RW,
120 &tcp_trace_point_count, TCP_LOG_STATE_HEAD_AUTO,
121 "How many connections will have BB logging turned on that hit the tracepoint?");
122
123
124
125 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
126 &tcp_log_zone, "Maximum number of events maintained for all TCP sessions");
127
128 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD,
129 &tcp_log_zone, "Current number of events maintained for all TCP sessions");
130
131 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW,
132 &tcp_log_id_bucket_zone, "Maximum number of log IDs");
133
134 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD,
135 &tcp_log_id_bucket_zone, "Current number of log IDs");
136
137 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW,
138 &tcp_log_id_node_zone, "Maximum number of tcpcbs with log IDs");
139
140 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD,
141 &tcp_log_id_node_zone, "Current number of tcpcbs with log IDs");
142
143 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version,
144 0, "Version of log formats exported");
145
146 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, disable_all, CTLFLAG_RW,
147 &tcp_disable_all_bb_logs, 0,
148 "Disable all BB logging for all connections");
149
150 SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW,
151 &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions");
152
153 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW,
154 &tcp_log_auto_mode, 0,
155 "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_TAIL)");
156
157 SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW,
158 &tcp_log_auto_all, 0,
159 "Auto-select from all sessions (rather than just those with IDs)");
160
161 #ifdef TCPLOG_DEBUG_COUNTERS
162 counter_u64_t tcp_log_queued;
163 counter_u64_t tcp_log_que_fail1;
164 counter_u64_t tcp_log_que_fail2;
165 counter_u64_t tcp_log_que_fail3;
166 counter_u64_t tcp_log_que_fail4;
167 counter_u64_t tcp_log_que_fail5;
168 counter_u64_t tcp_log_que_copyout;
169 counter_u64_t tcp_log_que_read;
170 counter_u64_t tcp_log_que_freed;
171
172 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD,
173 &tcp_log_queued, "Number of entries queued");
174 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD,
175 &tcp_log_que_fail1, "Number of entries queued but fail 1");
176 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD,
177 &tcp_log_que_fail2, "Number of entries queued but fail 2");
178 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD,
179 &tcp_log_que_fail3, "Number of entries queued but fail 3");
180 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD,
181 &tcp_log_que_fail4, "Number of entries queued but fail 4");
182 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD,
183 &tcp_log_que_fail5, "Number of entries queued but fail 4");
184 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD,
185 &tcp_log_que_copyout, "Number of entries copied out");
186 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD,
187 &tcp_log_que_read, "Number of entries read from the queue");
188 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD,
189 &tcp_log_que_freed, "Number of entries freed after reading");
190 #endif
191
192 #ifdef INVARIANTS
193 #define TCPLOG_DEBUG_RINGBUF
194 #endif
195 /* Number of requests to consider a PCB ID "active". */
196 #define ACTIVE_REQUEST_COUNT 10
197
198 /* Statistic tracking for "active" PCB IDs. */
199 static counter_u64_t tcp_log_pcb_ids_cur;
200 static counter_u64_t tcp_log_pcb_ids_tot;
201
202 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_cur, CTLFLAG_RD,
203 &tcp_log_pcb_ids_cur, "Number of pcb IDs allocated in the system");
204 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_tot, CTLFLAG_RD,
205 &tcp_log_pcb_ids_tot, "Total number of pcb IDs that have been allocated");
206
207 struct tcp_log_mem
208 {
209 STAILQ_ENTRY(tcp_log_mem) tlm_queue;
210 struct tcp_log_buffer tlm_buf;
211 struct tcp_log_verbose tlm_v;
212 #ifdef TCPLOG_DEBUG_RINGBUF
213 volatile int tlm_refcnt;
214 #endif
215 };
216
217 /* 60 bytes for the header, + 16 bytes for padding */
218 static uint8_t zerobuf[76];
219
220 /*
221 * Lock order:
222 * 1. TCPID_TREE
223 * 2. TCPID_BUCKET
224 * 3. INP
225 *
226 * Rules:
227 * A. You need a lock on the Tree to add/remove buckets.
228 * B. You need a lock on the bucket to add/remove nodes from the bucket.
229 * C. To change information in a node, you need the INP lock if the tln_closed
230 * field is false. Otherwise, you need the bucket lock. (Note that the
231 * tln_closed field can change at any point, so you need to recheck the
232 * entry after acquiring the INP lock.)
233 * D. To remove a node from the bucket, you must have that entry locked,
234 * according to the criteria of Rule C. Also, the node must not be on
235 * the expiry queue.
236 * E. The exception to C is the expiry queue fields, which are locked by
237 * the TCPLOG_EXPIREQ lock.
238 *
239 * Buckets have a reference count. Each node is a reference. Further,
240 * other callers may add reference counts to keep a bucket from disappearing.
241 * You can add a reference as long as you own a lock sufficient to keep the
242 * bucket from disappearing. For example, a common use is:
243 * a. Have a locked INP, but need to lock the TCPID_BUCKET.
244 * b. Add a refcount on the bucket. (Safe because the INP lock prevents
245 * the TCPID_BUCKET from going away.)
246 * c. Drop the INP lock.
247 * d. Acquire a lock on the TCPID_BUCKET.
248 * e. Acquire a lock on the INP.
249 * f. Drop the refcount on the bucket.
250 * (At this point, the bucket may disappear.)
251 *
252 * Expire queue lock:
253 * You can acquire this with either the bucket or INP lock. Don't reverse it.
254 * When the expire code has committed to freeing a node, it resets the expiry
255 * time to SBT_MAX. That is the signal to everyone else that they should
256 * leave that node alone.
257 */
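/*
 * A minimal sketch of steps (a)-(f) above, using the macros defined below;
 * the rechecks and error handling that real callers (e.g. tcp_log_set_id())
 * also perform are omitted here:
 *
 *	INP_WLOCK(inp);				(a) INP is locked
 *	tlb = tp->t_lib;
 *	TCPID_BUCKET_REF(tlb);			(b) INP lock pins the bucket
 *	INP_WUNLOCK(inp);			(c)
 *	TCPID_BUCKET_LOCK(tlb);			(d)
 *	INP_WLOCK(inp);				(e)
 *	TCPID_BUCKET_UNREF(tlb);		(f) bucket may now vanish
 *
 * In practice, step (f) is done through tcp_log_unref_bucket() so that the
 * bucket is also freed when the last reference is dropped.
 */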
258 static struct rwlock tcp_id_tree_lock;
259 #define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock)
260 #define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock)
261 #define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock)
262 #define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock)
263 #define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock)
264 #define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED)
265 #define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED)
266 #define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED)
267
268 #define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF)
269 #define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx))
270 #define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx))
271 #define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx))
272 #define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED)
273 #define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED)
274
275 #define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt))
276 #define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt))
277
278 #define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx)
279 #define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx)
280
281 SLIST_HEAD(tcp_log_id_head, tcp_log_id_node);
282
283 struct tcp_log_id_bucket
284 {
285 /*
286 * tlb_id must be first. This lets us use strcmp on
287 * (struct tcp_log_id_bucket *) and (char *) interchangeably.
288 */
289 char tlb_id[TCP_LOG_ID_LEN];
290 char tlb_tag[TCP_LOG_TAG_LEN];
291 RB_ENTRY(tcp_log_id_bucket) tlb_rb;
292 struct tcp_log_id_head tlb_head;
293 struct mtx tlb_mtx;
294 volatile u_int tlb_refcnt;
295 volatile u_int tlb_reqcnt;
296 uint32_t tlb_loglimit;
297 int8_t tlb_logstate;
298 };
299
300 struct tcp_log_id_node
301 {
302 SLIST_ENTRY(tcp_log_id_node) tln_list;
303 STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */
304 sbintime_t tln_expiretime; /* Locked by the expireq lock */
305
306 /*
307 * If INP is NULL, that means the connection has closed. We've
308 * saved the connection endpoint information and the log entries
309 * in the tln_ie and tln_entries members. We've also saved a pointer
310 * to the enclosing bucket here. If INP is not NULL, the information is
311 * in the PCB and not here.
312 */
313 struct inpcb *tln_inp;
314 struct tcpcb *tln_tp;
315 struct tcp_log_id_bucket *tln_bucket;
316 struct in_endpoints tln_ie;
317 struct tcp_log_stailq tln_entries;
318 int tln_count;
319 volatile int tln_closed;
320 uint8_t tln_af;
321 };
322
323 enum tree_lock_state {
324 TREE_UNLOCKED = 0,
325 TREE_RLOCKED,
326 TREE_WLOCKED,
327 };
328
329 /* Do we want to select this session for auto-logging? */
330 static __inline bool
331 tcp_log_selectauto(void)
332 {
333
334 /*
335 * If we are doing auto-capturing, figure out whether we will capture
336 * this session.
337 */
338 if (tcp_log_auto_ratio &&
339 (tcp_disable_all_bb_logs == 0) &&
340 (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) %
341 tcp_log_auto_ratio) == 0)
342 return (true);
343 return (false);
344 }
345
346 static __inline int
347 tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b)
348 {
349 KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL"));
350 KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL"));
351 return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN);
352 }
353
354 RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
355
356 static __inline void
357 tcp_log_id_validate_tree_lock(int tree_locked)
358 {
359
360 #ifdef INVARIANTS
361 switch (tree_locked) {
362 case TREE_WLOCKED:
363 TCPID_TREE_WLOCK_ASSERT();
364 break;
365 case TREE_RLOCKED:
366 TCPID_TREE_RLOCK_ASSERT();
367 break;
368 case TREE_UNLOCKED:
369 TCPID_TREE_UNLOCK_ASSERT();
370 break;
371 default:
372 kassert_panic("%s:%d: unknown tree lock state", __func__,
373 __LINE__);
374 }
375 #endif
376 }
377
378 static __inline void
379 tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb)
380 {
381
382 TCPID_TREE_WLOCK_ASSERT();
383 KASSERT(SLIST_EMPTY(&tlb->tlb_head),
384 ("%s: Attempt to remove non-empty bucket", __func__));
385 if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) {
386 #ifdef INVARIANTS
387 kassert_panic("%s:%d: error removing element from tree",
388 __func__, __LINE__);
389 #endif
390 }
391 TCPID_BUCKET_LOCK_DESTROY(tlb);
392 counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1);
393 uma_zfree(tcp_log_id_bucket_zone, tlb);
394 }
395
396 /*
397 * Call with a referenced and locked bucket.
398 * Will return true if the bucket was freed; otherwise, false.
399 * tlb: The bucket to unreference.
400 * tree_locked: A pointer to the state of the tree lock. If the tree lock
401 * state changes, the function will update it.
402 * inp: If not NULL and the function needs to drop the inp lock to relock the
403 * tree, it will do so. (The caller must ensure inp will not become invalid,
404 * probably by holding a reference to it.)
405 */
406 static bool
407 tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked,
408 struct inpcb *inp)
409 {
410
411 KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__));
412 KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
413 __func__));
414
415 tcp_log_id_validate_tree_lock(*tree_locked);
416
417 /*
418 * Did we hold the last reference on the tlb? If so, we may need
419 * to free it. (Note that we can realistically only execute the
420 * loop twice: once without a write lock and once with a write
421 * lock.)
422 */
423 while (TCPID_BUCKET_UNREF(tlb)) {
424 /*
425 * We need a write lock on the tree to free this.
426 * If we can upgrade the tree lock, this is "easy". If we
427 * can't upgrade the tree lock, we need to do this the
428 * "hard" way: unwind all our locks and relock everything.
429 * In the meantime, anything could have changed. We even
430 * need to validate that we still need to free the bucket.
431 */
432 if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE())
433 *tree_locked = TREE_WLOCKED;
434 else if (*tree_locked != TREE_WLOCKED) {
435 TCPID_BUCKET_REF(tlb);
436 if (inp != NULL)
437 INP_WUNLOCK(inp);
438 TCPID_BUCKET_UNLOCK(tlb);
439 if (*tree_locked == TREE_RLOCKED)
440 TCPID_TREE_RUNLOCK();
441 TCPID_TREE_WLOCK();
442 *tree_locked = TREE_WLOCKED;
443 TCPID_BUCKET_LOCK(tlb);
444 if (inp != NULL)
445 INP_WLOCK(inp);
446 continue;
447 }
448
449 /*
450 * We have an empty bucket and a write lock on the tree.
451 * Remove the empty bucket.
452 */
453 tcp_log_remove_bucket(tlb);
454 return (true);
455 }
456 return (false);
457 }
458
459 /*
460 * Call with a locked bucket. This function will release the lock on the
461 * bucket before returning.
462 *
463 * The caller is responsible for freeing the tp->t_lin/tln node!
464 *
465 * Note: one of tp or both tlb and tln must be supplied.
466 *
467 * inp: A pointer to the inp. If the function needs to drop the inp lock to
468 * acquire the tree write lock, it will do so. (The caller must ensure inp
469 * will not become invalid, probably by holding a reference to it.)
470 * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored)
471 * tlb: A pointer to the bucket. (optional; ignored if tp is specified)
472 * tln: A pointer to the node. (optional; ignored if tp is specified)
473 * tree_locked: A pointer to the state of the tree lock. If the tree lock
474 * state changes, the function will update it.
475 *
476 * Will return true if the INP lock was reacquired; otherwise, false.
477 */
478 static bool
479 tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp,
480 struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln,
481 int *tree_locked)
482 {
483 int orig_tree_locked;
484
485 KASSERT(tp != NULL || (tlb != NULL && tln != NULL),
486 ("%s: called with tp=%p, tlb=%p, tln=%p", __func__,
487 tp, tlb, tln));
488 KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
489 __func__));
490
491 if (tp != NULL) {
492 tlb = tp->t_lib;
493 tln = tp->t_lin;
494 KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__));
495 KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__));
496 }
497
498 tcp_log_id_validate_tree_lock(*tree_locked);
499 TCPID_BUCKET_LOCK_ASSERT(tlb);
500
501 /*
502 * Remove the node, clear the log bucket and node from the TCPCB, and
503 * decrement the bucket refcount. In the process, if this is the
504 * last reference, the bucket will be freed.
505 */
506 SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list);
507 if (tp != NULL) {
508 tp->t_lib = NULL;
509 tp->t_lin = NULL;
510 }
511 orig_tree_locked = *tree_locked;
512 if (!tcp_log_unref_bucket(tlb, tree_locked, inp))
513 TCPID_BUCKET_UNLOCK(tlb);
514 return (*tree_locked != orig_tree_locked);
515 }
516
517 #define RECHECK_INP_CLEAN(cleanup) do { \
518 if (inp->inp_flags & INP_DROPPED) { \
519 rv = ECONNRESET; \
520 cleanup; \
521 goto done; \
522 } \
523 tp = intotcpcb(inp); \
524 } while (0)
525
526 #define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */)
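/*
 * A minimal usage sketch (assumed caller shape) of the pattern these macros
 * support: after dropping and re-acquiring the INP lock, the caller must
 * confirm the connection was not dropped before touching tp again. The
 * macros assume a local "rv" and a "done" label in the enclosing function:
 *
 *	INP_WUNLOCK(inp);
 *	TCPID_BUCKET_LOCK(tlb);
 *	INP_WLOCK(inp);
 *	RECHECK_INP();		(sets rv = ECONNRESET and jumps to "done"
 *				 if the INP was dropped; otherwise refreshes
 *				 tp via intotcpcb(inp))
 */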
527
528 static void
529 tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
530 {
531
532 INP_WLOCK_ASSERT(tptoinpcb(tp));
533
534 #ifdef STATS
535 if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
536 (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
537 #endif
538 }
539
540 static void
541 tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb)
542 {
543
544 atomic_fetchadd_int(&tlb->tlb_reqcnt, 1);
545 }
546
547 int
548 tcp_log_apply_ratio(struct tcpcb *tp, int ratio)
549 {
550 struct tcp_log_id_bucket *tlb;
551 struct inpcb *inp = tptoinpcb(tp);
552 uint32_t hash, ratio_hash_thresh;
553 int rv, tree_locked;
554
555 rv = 0;
556 tree_locked = TREE_UNLOCKED;
557 tlb = tp->t_lib;
558
559 INP_WLOCK_ASSERT(inp);
560 if (tlb == NULL) {
561 INP_WUNLOCK(inp);
562 return (EOPNOTSUPP);
563 }
564 if (ratio)
565 ratio_hash_thresh = max(1, UINT32_MAX / ratio);
566 else
567 ratio_hash_thresh = 0;
568 TCPID_BUCKET_REF(tlb);
569 INP_WUNLOCK(inp);
570 TCPID_BUCKET_LOCK(tlb);
571
572 hash = hash32_buf(tlb->tlb_id, strlen(tlb->tlb_id), 0);
573 if (hash > ratio_hash_thresh && tp->_t_logstate == TCP_LOG_STATE_OFF &&
574 tlb->tlb_logstate == TCP_LOG_STATE_OFF) {
575 /*
576 * Ratio decision not to log this log ID (and this connection by
577              * way of association). We only apply a ratio-based log disable
578              * decision if it would not interfere with a log enable decision
579              * made elsewhere, e.g. by tcp_log_selectauto() or setsockopt().
580 */
581 tlb->tlb_logstate = TCP_LOG_STATE_RATIO_OFF;
582 INP_WLOCK(inp);
583 RECHECK_INP();
584 (void)tcp_log_state_change(tp, TCP_LOG_STATE_OFF);
585 done:
586 INP_WUNLOCK(inp);
587 }
588
589 INP_UNLOCK_ASSERT(inp);
590 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
591 TCPID_BUCKET_UNLOCK(tlb);
592
593 if (tree_locked == TREE_WLOCKED) {
594 TCPID_TREE_WLOCK_ASSERT();
595 TCPID_TREE_WUNLOCK();
596 } else if (tree_locked == TREE_RLOCKED) {
597 TCPID_TREE_RLOCK_ASSERT();
598 TCPID_TREE_RUNLOCK();
599 } else
600 TCPID_TREE_UNLOCK_ASSERT();
601
602 return (rv);
603 }
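/*
 * Worked example (illustrative numbers): with ratio = 10 the threshold is
 * UINT32_MAX / 10, so logging stays enabled only for log IDs whose 32-bit
 * hash lands in roughly the lowest tenth of the hash space. Because the
 * decision is keyed on the hash of the ID string, all connections sharing
 * a log ID reach the same keep-or-disable outcome.
 */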
604
605 /*
606 * Associate the specified tag with a particular TCP log ID.
607 * Called with INPCB locked. Returns with it unlocked.
608 * Returns 0 on success or EOPNOTSUPP if the connection has no TCP log ID.
609 */
610 int
611 tcp_log_set_tag(struct tcpcb *tp, char *tag)
612 {
613 struct inpcb *inp = tptoinpcb(tp);
614 struct tcp_log_id_bucket *tlb;
615 int tree_locked;
616
617 INP_WLOCK_ASSERT(inp);
618
619 tree_locked = TREE_UNLOCKED;
620 tlb = tp->t_lib;
621 if (tlb == NULL) {
622 INP_WUNLOCK(inp);
623 return (EOPNOTSUPP);
624 }
625
626 TCPID_BUCKET_REF(tlb);
627 INP_WUNLOCK(inp);
628 TCPID_BUCKET_LOCK(tlb);
629 strlcpy(tlb->tlb_tag, tag, TCP_LOG_TAG_LEN);
630 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
631 TCPID_BUCKET_UNLOCK(tlb);
632
633 if (tree_locked == TREE_WLOCKED) {
634 TCPID_TREE_WLOCK_ASSERT();
635 TCPID_TREE_WUNLOCK();
636 } else if (tree_locked == TREE_RLOCKED) {
637 TCPID_TREE_RLOCK_ASSERT();
638 TCPID_TREE_RUNLOCK();
639 } else
640 TCPID_TREE_UNLOCK_ASSERT();
641
642 return (0);
643 }
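/*
 * A hedged usage sketch (locals such as "tagbuf" are hypothetical): callers
 * enter with the INP write-locked, and the function always returns with it
 * unlocked, so the INP must be re-locked before it is touched again:
 *
 *	INP_WLOCK(inp);
 *	error = tcp_log_set_tag(intotcpcb(inp), tagbuf);
 *	(inp is now unlocked, whether or not error == 0)
 */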
644
645 /*
646 * Set the TCP log ID for a TCPCB.
647 * Called with INPCB locked. Returns with it unlocked.
648 */
649 int
650 tcp_log_set_id(struct tcpcb *tp, char *id)
651 {
652 struct tcp_log_id_bucket *tlb, *tmp_tlb;
653 struct tcp_log_id_node *tln;
654 struct inpcb *inp = tptoinpcb(tp);
655 int tree_locked, rv;
656 bool bucket_locked, same;
657
658 tlb = NULL;
659 tln = NULL;
660 tree_locked = TREE_UNLOCKED;
661 bucket_locked = false;
662
663 restart:
664 INP_WLOCK_ASSERT(inp);
665 /* See if the ID is unchanged. */
666 same = ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
667 (tp->t_lib == NULL && *id == 0));
668 if (tp->_t_logstate && STAILQ_FIRST(&tp->t_logs) && !same) {
669 /*
670              * There are residual logs left and we may
671              * be changing IDs, so dump what we can.
672 */
673 switch(tp->_t_logstate) {
674 case TCP_LOG_STATE_HEAD_AUTO:
675 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at id switch",
676 M_NOWAIT, false);
677 break;
678 case TCP_LOG_STATE_TAIL_AUTO:
679 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at id switch",
680 M_NOWAIT, false);
681 break;
682 case TCP_LOG_STATE_CONTINUAL:
683 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual at id switch",
684 M_NOWAIT, false);
685 break;
686 case TCP_LOG_VIA_BBPOINTS:
687 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints at id switch",
688 M_NOWAIT, false);
689 break;
690 }
691 }
692 if (same) {
693 if (tp->t_lib != NULL) {
694 tcp_log_increment_reqcnt(tp->t_lib);
695 if ((tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) &&
696 (tp->t_log_state_set == 0)) {
697 /* Clone in any logging */
698
699 tp->_t_logstate = tp->t_lib->tlb_logstate;
700 }
701 if ((tp->t_lib->tlb_loglimit) &&
702 (tp->t_log_state_set == 0)) {
703 /* We also have a limit set */
704
705 tp->t_loglimit = tp->t_lib->tlb_loglimit;
706 }
707 }
708 rv = 0;
709 goto done;
710 }
711
712 /*
713 * If the TCPCB had a previous ID, we need to extricate it from
714 * the previous list.
715 *
716 * Drop the TCPCB lock and lock the tree and the bucket.
717 * Because this is called in the socket context, we (theoretically)
718 * don't need to worry about the INPCB completely going away
719 * while we are gone.
720 */
721 if (tp->t_lib != NULL) {
722 tlb = tp->t_lib;
723 TCPID_BUCKET_REF(tlb);
724 INP_WUNLOCK(inp);
725
726 if (tree_locked == TREE_UNLOCKED) {
727 TCPID_TREE_RLOCK();
728 tree_locked = TREE_RLOCKED;
729 }
730 TCPID_BUCKET_LOCK(tlb);
731 bucket_locked = true;
732 INP_WLOCK(inp);
733
734 /*
735 * Unreference the bucket. If our bucket went away, it is no
736 * longer locked or valid.
737 */
738 if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) {
739 bucket_locked = false;
740 tlb = NULL;
741 }
742
743 /* Validate the INP. */
744 RECHECK_INP();
745
746 /*
747 * Evaluate whether the bucket changed while we were unlocked.
748 *
749 * Possible scenarios here:
750 * 1. Bucket is unchanged and the same one we started with.
751 * 2. The TCPCB no longer has a bucket and our bucket was
752 * freed.
753              * 3. The TCPCB has a new bucket, whether or not ours was freed.
754 * 4. The TCPCB no longer has a bucket and our bucket was
755 * not freed.
756 *
757 * In cases 2-4, we will start over. In case 1, we will
758              * proceed here to remove the node from the bucket.
759 */
760 if (tlb == NULL || tp->t_lib != tlb) {
761 KASSERT(bucket_locked || tlb == NULL,
762 ("%s: bucket_locked (%d) and tlb (%p) are "
763 "inconsistent", __func__, bucket_locked, tlb));
764
765 if (bucket_locked) {
766 TCPID_BUCKET_UNLOCK(tlb);
767 bucket_locked = false;
768 tlb = NULL;
769 }
770 goto restart;
771 }
772
773 /*
774 * Store the (struct tcp_log_id_node) for reuse. Then, remove
775 * it from the bucket. In the process, we may end up relocking.
776 * If so, we need to validate that the INP is still valid, and
777              * the TCPCB entries match what we expect.
778 *
779 * We will clear tlb and change the bucket_locked state just
780 * before calling tcp_log_remove_id_node(), since that function
781 * will unlock the bucket.
782 */
783 if (tln != NULL)
784 uma_zfree(tcp_log_id_node_zone, tln);
785 tln = tp->t_lin;
786 tlb = NULL;
787 bucket_locked = false;
788 if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) {
789 RECHECK_INP();
790
791 /*
792 * If the TCPCB moved to a new bucket while we had
793 * dropped the lock, restart.
794 */
795 if (tp->t_lib != NULL || tp->t_lin != NULL)
796 goto restart;
797 }
798
799 /*
800 * Yay! We successfully removed the TCPCB from its old
801 * bucket. Phew!
802 *
803 * On to bigger and better things...
804 */
805 }
806
807 /* At this point, the TCPCB should not be in any bucket. */
808 KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__));
809
810 /*
811 * If the new ID is not empty, we need to now assign this TCPCB to a
812 * new bucket.
813 */
814 if (*id) {
815 /* Get a new tln, if we don't already have one to reuse. */
816 if (tln == NULL) {
817 tln = uma_zalloc(tcp_log_id_node_zone,
818 M_NOWAIT | M_ZERO);
819 if (tln == NULL) {
820 rv = ENOBUFS;
821 goto done;
822 }
823 tln->tln_inp = inp;
824 tln->tln_tp = tp;
825 }
826
827 /*
828 * Drop the INP lock for a bit. We don't need it, and dropping
829 * it prevents lock order reversals.
830 */
831 INP_WUNLOCK(inp);
832
833 /* Make sure we have at least a read lock on the tree. */
834 tcp_log_id_validate_tree_lock(tree_locked);
835 if (tree_locked == TREE_UNLOCKED) {
836 TCPID_TREE_RLOCK();
837 tree_locked = TREE_RLOCKED;
838 }
839
840 refind:
841 /*
842              * Remember that (struct tcp_log_id_bucket) starts with its ID, so
843              * we can safely cast the id to it for the purposes of finding.
844 */
845 KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL",
846 __func__, __LINE__));
847 tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head,
848 (struct tcp_log_id_bucket *) id);
849
850 /*
851 * If we didn't find a matching bucket, we need to add a new
852 * one. This requires a write lock. But, of course, we will
853 * need to recheck some things when we re-acquire the lock.
854 */
855 if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) {
856 tree_locked = TREE_WLOCKED;
857 if (!TCPID_TREE_UPGRADE()) {
858 TCPID_TREE_RUNLOCK();
859 TCPID_TREE_WLOCK();
860
861 /*
862 * The tree may have changed while we were
863 * unlocked.
864 */
865 goto refind;
866 }
867 }
868
869 /* If we need to add a new bucket, do it now. */
870 if (tmp_tlb == NULL) {
871 /* Allocate new bucket. */
872 tlb = uma_zalloc(tcp_log_id_bucket_zone, M_NOWAIT);
873 if (tlb == NULL) {
874 rv = ENOBUFS;
875 goto done_noinp;
876 }
877 counter_u64_add(tcp_log_pcb_ids_cur, 1);
878 counter_u64_add(tcp_log_pcb_ids_tot, 1);
879
880 if ((tcp_log_auto_all == false) &&
881 tcp_log_auto_mode &&
882 tcp_log_selectauto()) {
883 /* Save off the log state */
884 tlb->tlb_logstate = tcp_log_auto_mode;
885 } else
886 tlb->tlb_logstate = TCP_LOG_STATE_OFF;
887 tlb->tlb_loglimit = 0;
888 tlb->tlb_tag[0] = '\0'; /* Default to an empty tag. */
889
890 /*
891 * Copy the ID to the bucket.
892 * NB: Don't use strlcpy() unless you are sure
893 * we've always validated NULL termination.
894 *
895 * TODO: When I'm done writing this, see if we
896                          * have correctly validated NULL termination and
897 * can use strlcpy(). :-)
898 */
899 strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1);
900 tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0';
901
902 /*
903 * Take the refcount for the first node and go ahead
904 * and lock this. Note that we zero the tlb_mtx
905 * structure, since 0xdeadc0de flips the right bits
906 * for the code to think that this mutex has already
907 * been initialized. :-(
908 */
909 SLIST_INIT(&tlb->tlb_head);
910 refcount_init(&tlb->tlb_refcnt, 1);
911 tlb->tlb_reqcnt = 1;
912 memset(&tlb->tlb_mtx, 0, sizeof(struct mtx));
913 TCPID_BUCKET_LOCK_INIT(tlb);
914 TCPID_BUCKET_LOCK(tlb);
915 bucket_locked = true;
916
917 #define FREE_NEW_TLB() do { \
918 TCPID_BUCKET_LOCK_DESTROY(tlb); \
919 uma_zfree(tcp_log_id_bucket_zone, tlb); \
920 counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); \
921 counter_u64_add(tcp_log_pcb_ids_tot, (int64_t)-1); \
922 bucket_locked = false; \
923 tlb = NULL; \
924 } while (0)
925 /*
926 * Relock the INP and make sure we are still
927 * unassigned.
928 */
929 INP_WLOCK(inp);
930 RECHECK_INP_CLEAN(FREE_NEW_TLB());
931 if (tp->t_lib != NULL) {
932 FREE_NEW_TLB();
933 goto restart;
934 }
935
936 /* Add the new bucket to the tree. */
937 tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head,
938 tlb);
939 KASSERT(tmp_tlb == NULL,
940 ("%s: Unexpected conflicting bucket (%p) while "
941 "adding new bucket (%p)", __func__, tmp_tlb, tlb));
942
943 /*
944 * If we found a conflicting bucket, free the new
945 * one we made and fall through to use the existing
946 * bucket.
947 */
948 if (tmp_tlb != NULL) {
949 FREE_NEW_TLB();
950 INP_WUNLOCK(inp);
951 }
952 #undef FREE_NEW_TLB
953 }
954
955 /* If we found an existing bucket, use it. */
956 if (tmp_tlb != NULL) {
957 tlb = tmp_tlb;
958 TCPID_BUCKET_LOCK(tlb);
959 bucket_locked = true;
960
961 /*
962 * Relock the INP and make sure we are still
963 * unassigned.
964 */
965 INP_UNLOCK_ASSERT(inp);
966 INP_WLOCK(inp);
967 RECHECK_INP();
968 if (tp->t_lib != NULL) {
969 TCPID_BUCKET_UNLOCK(tlb);
970 bucket_locked = false;
971 tlb = NULL;
972 goto restart;
973 }
974
975 /* Take a reference on the bucket. */
976 TCPID_BUCKET_REF(tlb);
977
978 /* Record the request. */
979 tcp_log_increment_reqcnt(tlb);
980 }
981
982 tcp_log_grow_tlb(tlb->tlb_id, tp);
983
984 /* Add the new node to the list. */
985 SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
986 tp->t_lib = tlb;
987 tp->t_lin = tln;
988 if (tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) {
989 /* Clone in any logging */
990
991 tp->_t_logstate = tp->t_lib->tlb_logstate;
992 }
993 if (tp->t_lib->tlb_loglimit) {
994 /* The loglimit too */
995
996 tp->t_loglimit = tp->t_lib->tlb_loglimit;
997 }
998 tln = NULL;
999 }
1000
1001 rv = 0;
1002
1003 done:
1004 /* Unlock things, as needed, and return. */
1005 INP_WUNLOCK(inp);
1006 done_noinp:
1007 INP_UNLOCK_ASSERT(inp);
1008 if (bucket_locked) {
1009 TCPID_BUCKET_LOCK_ASSERT(tlb);
1010 TCPID_BUCKET_UNLOCK(tlb);
1011 } else if (tlb != NULL)
1012 TCPID_BUCKET_UNLOCK_ASSERT(tlb);
1013 if (tree_locked == TREE_WLOCKED) {
1014 TCPID_TREE_WLOCK_ASSERT();
1015 TCPID_TREE_WUNLOCK();
1016 } else if (tree_locked == TREE_RLOCKED) {
1017 TCPID_TREE_RLOCK_ASSERT();
1018 TCPID_TREE_RUNLOCK();
1019 } else
1020 TCPID_TREE_UNLOCK_ASSERT();
1021 if (tln != NULL)
1022 uma_zfree(tcp_log_id_node_zone, tln);
1023 return (rv);
1024 }
1025
1026 /*
1027 * Get the TCP log ID for a TCPCB.
1028 * Called with INPCB locked.
1029 * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long.
1030 * Returns number of bytes copied.
1031 */
1032 size_t
1033 tcp_log_get_id(struct tcpcb *tp, char *buf)
1034 {
1035 size_t len;
1036
1037 INP_LOCK_ASSERT(tptoinpcb(tp));
1038 if (tp->t_lib != NULL) {
1039 len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
1040 KASSERT(len < TCP_LOG_ID_LEN,
1041 ("%s:%d: tp->t_lib->tlb_id too long (%zu)",
1042 __func__, __LINE__, len));
1043 } else {
1044 *buf = '\0';
1045 len = 0;
1046 }
1047 return (len);
1048 }
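/*
 * A hedged usage sketch (locals are hypothetical): the destination buffer
 * must hold at least TCP_LOG_ID_LEN bytes, and unlike tcp_log_get_tag()
 * the INP stays locked across the call:
 *
 *	char idbuf[TCP_LOG_ID_LEN];
 *	size_t n;
 *
 *	INP_WLOCK(inp);
 *	n = tcp_log_get_id(intotcpcb(inp), idbuf);
 *	INP_WUNLOCK(inp);
 */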
1049
1050 /*
1051 * Get the tag associated with the TCPCB's log ID.
1052 * Called with INPCB locked. Returns with it unlocked.
1053 * 'buf' must point to a buffer that is at least TCP_LOG_TAG_LEN bytes long.
1054 * Returns number of bytes copied.
1055 */
1056 size_t
1057 tcp_log_get_tag(struct tcpcb *tp, char *buf)
1058 {
1059 struct inpcb *inp = tptoinpcb(tp);
1060 struct tcp_log_id_bucket *tlb;
1061 size_t len;
1062 int tree_locked;
1063
1064 INP_WLOCK_ASSERT(inp);
1065
1066 tree_locked = TREE_UNLOCKED;
1067 tlb = tp->t_lib;
1068
1069 if (tlb != NULL) {
1070 TCPID_BUCKET_REF(tlb);
1071 INP_WUNLOCK(inp);
1072 TCPID_BUCKET_LOCK(tlb);
1073 len = strlcpy(buf, tlb->tlb_tag, TCP_LOG_TAG_LEN);
1074 KASSERT(len < TCP_LOG_TAG_LEN,
1075 ("%s:%d: tp->t_lib->tlb_tag too long (%zu)",
1076 __func__, __LINE__, len));
1077 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
1078 TCPID_BUCKET_UNLOCK(tlb);
1079
1080 if (tree_locked == TREE_WLOCKED) {
1081 TCPID_TREE_WLOCK_ASSERT();
1082 TCPID_TREE_WUNLOCK();
1083 } else if (tree_locked == TREE_RLOCKED) {
1084 TCPID_TREE_RLOCK_ASSERT();
1085 TCPID_TREE_RUNLOCK();
1086 } else
1087 TCPID_TREE_UNLOCK_ASSERT();
1088 } else {
1089 INP_WUNLOCK(inp);
1090 *buf = '\0';
1091 len = 0;
1092 }
1093
1094 return (len);
1095 }
1096
1097 /*
1098 * Get number of connections with the same log ID.
1099 * Log ID is taken from given TCPCB.
1100 * Called with INPCB locked.
1101 */
1102 u_int
1103 tcp_log_get_id_cnt(struct tcpcb *tp)
1104 {
1105
1106 INP_WLOCK_ASSERT(tptoinpcb(tp));
1107 return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt);
1108 }
1109
1110 #ifdef TCPLOG_DEBUG_RINGBUF
1111 /*
1112 * Functions/macros to increment/decrement reference count for a log
1113 * entry. This should catch when we do a double-free/double-remove or
1114 * a double-add.
1115 */
1116 static inline void
1117 _tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func,
1118 int line)
1119 {
1120 int refcnt;
1121
1122 refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1);
1123 if (refcnt != 0)
1124 panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)",
1125 func, line, log_entry, refcnt);
1126 }
1127 #define tcp_log_entry_refcnt_add(l) \
1128 _tcp_log_entry_refcnt_add((l), __func__, __LINE__)
1129
1130 static inline void
1131 _tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func,
1132 int line)
1133 {
1134 int refcnt;
1135
1136 refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1);
1137 if (refcnt != 1)
1138 panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)",
1139 func, line, log_entry, refcnt);
1140 }
1141 #define tcp_log_entry_refcnt_rem(l) \
1142 _tcp_log_entry_refcnt_rem((l), __func__, __LINE__)
1143
1144 #else /* !TCPLOG_DEBUG_RINGBUF */
1145
1146 #define tcp_log_entry_refcnt_add(l)
1147 #define tcp_log_entry_refcnt_rem(l)
1148
1149 #endif
1150
1151 /*
1152 * Cleanup after removing a log entry, but only decrement the count if we
1153 * are running INVARIANTS.
1154 */
1155 static inline void
1156 tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused)
1157 {
1158
1159 uma_zfree(tcp_log_zone, log_entry);
1160 #ifdef INVARIANTS
1161 (*count)--;
1162 KASSERT(*count >= 0,
1163 ("%s: count unexpectedly negative", __func__));
1164 #endif
1165 }
1166
1167 static void
1168 tcp_log_free_entries(struct tcp_log_stailq *head, int *count)
1169 {
1170 struct tcp_log_mem *log_entry;
1171
1172 /* Free the entries. */
1173 while ((log_entry = STAILQ_FIRST(head)) != NULL) {
1174 STAILQ_REMOVE_HEAD(head, tlm_queue);
1175 tcp_log_entry_refcnt_rem(log_entry);
1176 tcp_log_free_log_common(log_entry, count);
1177 }
1178 }
1179
1180 /* Cleanup after removing a log entry. */
1181 static inline void
1182 tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry)
1183 {
1184 uma_zfree(tcp_log_zone, log_entry);
1185 tp->t_lognum--;
1186 KASSERT(tp->t_lognum >= 0,
1187 ("%s: tp->t_lognum unexpectedly negative", __func__));
1188 }
1189
1190 /* Remove a log entry from the head of a list. */
1191 static inline void
1192 tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry)
1193 {
1194
1195 KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs),
1196 ("%s: attempt to remove non-HEAD log entry", __func__));
1197 STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
1198 tcp_log_entry_refcnt_rem(log_entry);
1199 tcp_log_remove_log_cleanup(tp, log_entry);
1200 }
1201
1202 #ifdef TCPLOG_DEBUG_RINGBUF
1203 /*
1204 * Initialize the log entry's reference count, which we want to
1205 * survive allocations.
1206 */
1207 static int
1208 tcp_log_zone_init(void *mem, int size, int flags __unused)
1209 {
1210 struct tcp_log_mem *tlm;
1211
1212 KASSERT(size >= sizeof(struct tcp_log_mem),
1213 ("%s: unexpectedly short (%d) allocation", __func__, size));
1214 tlm = (struct tcp_log_mem *)mem;
1215 tlm->tlm_refcnt = 0;
1216 return (0);
1217 }
1218
1219 /*
1220 * Double check that the refcnt is zero on allocation and return.
1221 */
1222 static int
1223 tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused)
1224 {
1225 struct tcp_log_mem *tlm;
1226
1227 KASSERT(size >= sizeof(struct tcp_log_mem),
1228 ("%s: unexpectedly short (%d) allocation", __func__, size));
1229 tlm = (struct tcp_log_mem *)mem;
1230 if (tlm->tlm_refcnt != 0)
1231 panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
1232 __func__, __LINE__, tlm, tlm->tlm_refcnt);
1233 return (0);
1234 }
1235
1236 static void
1237 tcp_log_zone_dtor(void *mem, int size, void *args __unused)
1238 {
1239 struct tcp_log_mem *tlm;
1240
1241 KASSERT(size >= sizeof(struct tcp_log_mem),
1242 ("%s: unexpectedly short (%d) allocation", __func__, size));
1243 tlm = (struct tcp_log_mem *)mem;
1244 if (tlm->tlm_refcnt != 0)
1245 panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
1246 __func__, __LINE__, tlm, tlm->tlm_refcnt);
1247 }
1248 #endif /* TCPLOG_DEBUG_RINGBUF */
1249
1250 /* Do global initialization. */
1251 void
1252 tcp_log_init(void)
1253 {
1254
1255 tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
1256 #ifdef TCPLOG_DEBUG_RINGBUF
1257 tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init,
1258 #else
1259 NULL, NULL, NULL,
1260 #endif
1261 NULL, UMA_ALIGN_PTR, 0);
1262 (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT);
1263 tcp_log_id_bucket_zone = uma_zcreate("tcp_log_id_bucket",
1264 sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL,
1265 UMA_ALIGN_PTR, 0);
1266 tcp_log_id_node_zone = uma_zcreate("tcp_log_id_node",
1267 sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL,
1268 UMA_ALIGN_PTR, 0);
1269 #ifdef TCPLOG_DEBUG_COUNTERS
1270 tcp_log_queued = counter_u64_alloc(M_WAITOK);
1271 tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK);
1272 tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK);
1273 tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK);
1274 tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK);
1275 tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK);
1276 tcp_log_que_copyout = counter_u64_alloc(M_WAITOK);
1277 tcp_log_que_read = counter_u64_alloc(M_WAITOK);
1278 tcp_log_que_freed = counter_u64_alloc(M_WAITOK);
1279 #endif
1280 tcp_log_pcb_ids_cur = counter_u64_alloc(M_WAITOK);
1281 tcp_log_pcb_ids_tot = counter_u64_alloc(M_WAITOK);
1282
1283 rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW);
1284 mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF);
1285 callout_init(&tcp_log_expireq_callout, 1);
1286 }
1287
1288 /* Do per-TCPCB initialization. */
1289 void
1290 tcp_log_tcpcbinit(struct tcpcb *tp)
1291 {
1292
1293 /* A new TCPCB should start out zero-initialized. */
1294 STAILQ_INIT(&tp->t_logs);
1295
1296 /*
1297 * If we are doing auto-capturing, figure out whether we will capture
1298 * this session.
1299 */
1300 tp->t_loglimit = tcp_log_session_limit;
1301 if ((tcp_log_auto_all == true) &&
1302 tcp_log_auto_mode &&
1303 tcp_log_selectauto()) {
1304 tp->_t_logstate = tcp_log_auto_mode;
1305 tp->t_flags2 |= TF2_LOG_AUTO;
1306 }
1307 }
1308
1309 /* Remove entries */
1310 static void
1311 tcp_log_expire(void *unused __unused)
1312 {
1313 struct tcp_log_id_bucket *tlb;
1314 struct tcp_log_id_node *tln;
1315 sbintime_t expiry_limit;
1316 int tree_locked;
1317
1318 TCPLOG_EXPIREQ_LOCK();
1319 if (callout_pending(&tcp_log_expireq_callout)) {
1320 /* Callout was reset. */
1321 TCPLOG_EXPIREQ_UNLOCK();
1322 return;
1323 }
1324
1325 /*
1326 * Process entries until we reach one that expires too far in the
1327 * future. Look one second in the future.
1328 */
1329 expiry_limit = getsbinuptime() + SBT_1S;
1330 tree_locked = TREE_UNLOCKED;
1331
1332 while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL &&
1333 tln->tln_expiretime <= expiry_limit) {
1334 if (!callout_active(&tcp_log_expireq_callout)) {
1335 /*
1336 * Callout was stopped. I guess we should
1337 * just quit at this point.
1338 */
1339 TCPLOG_EXPIREQ_UNLOCK();
1340 return;
1341 }
1342
1343 /*
1344 * Remove the node from the head of the list and unlock
1345 * the list. Change the expiry time to SBT_MAX as a signal
1346 * to other threads that we now own this.
1347 */
1348 STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq);
1349 tln->tln_expiretime = SBT_MAX;
1350 TCPLOG_EXPIREQ_UNLOCK();
1351
1352 /*
1353 * Remove the node from the bucket.
1354 */
1355 tlb = tln->tln_bucket;
1356 TCPID_BUCKET_LOCK(tlb);
1357 if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) {
1358 tcp_log_id_validate_tree_lock(tree_locked);
1359 if (tree_locked == TREE_WLOCKED)
1360 TCPID_TREE_WUNLOCK();
1361 else
1362 TCPID_TREE_RUNLOCK();
1363 tree_locked = TREE_UNLOCKED;
1364 }
1365
1366 /* Drop the INP reference. */
1367 INP_WLOCK(tln->tln_inp);
1368 if (!in_pcbrele_wlocked(tln->tln_inp))
1369 INP_WUNLOCK(tln->tln_inp);
1370
1371 /* Free the log records. */
1372 tcp_log_free_entries(&tln->tln_entries, &tln->tln_count);
1373
1374 /* Free the node. */
1375 uma_zfree(tcp_log_id_node_zone, tln);
1376
1377 /* Relock the expiry queue. */
1378 TCPLOG_EXPIREQ_LOCK();
1379 }
1380
1381 /*
1382 * We've expired all the entries we can. Do we need to reschedule
1383 * ourselves?
1384 */
1385 callout_deactivate(&tcp_log_expireq_callout);
1386 if (tln != NULL) {
1387 /*
1388 * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and
1389 * set the next callout to that. (This helps ensure we generally
1390 * run the callout no more often than desired.)
1391 */
1392 expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL;
1393 if (expiry_limit < tln->tln_expiretime)
1394 expiry_limit = tln->tln_expiretime;
1395 callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit,
1396 SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE);
1397 }
1398
1399 /* We're done. */
1400 TCPLOG_EXPIREQ_UNLOCK();
1401 return;
1402 }
1403
1404 /*
1405 * Move log data from the TCPCB to a new node. This will reset the TCPCB log
1406 * entries and log count; however, it will not touch other things from the
1407 * TCPCB (e.g. t_lin, t_lib).
1408 *
1409 * NOTE: Must hold a lock on the INP.
1410 */
1411 static void
1412 tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln)
1413 {
1414 struct inpcb *inp = tptoinpcb(tp);
1415
1416 INP_WLOCK_ASSERT(inp);
1417
1418 tln->tln_ie = inp->inp_inc.inc_ie;
1419 if (inp->inp_inc.inc_flags & INC_ISIPV6)
1420 tln->tln_af = AF_INET6;
1421 else
1422 tln->tln_af = AF_INET;
1423 tln->tln_entries = tp->t_logs;
1424 tln->tln_count = tp->t_lognum;
1425 tln->tln_bucket = tp->t_lib;
1426
1427 /* Clear information from the PCB. */
1428 STAILQ_INIT(&tp->t_logs);
1429 tp->t_lognum = 0;
1430 }
1431
1432 /* Do per-TCPCB cleanup */
1433 void
1434 tcp_log_tcpcbfini(struct tcpcb *tp)
1435 {
1436 struct tcp_log_id_node *tln, *tln_first;
1437 struct tcp_log_mem *log_entry;
1438 sbintime_t callouttime;
1439
1440
1441 INP_WLOCK_ASSERT(tptoinpcb(tp));
1442 if (tp->_t_logstate) {
1443 union tcp_log_stackspecific log;
1444 struct timeval tv;
1445 #ifdef TCP_ACCOUNTING
1446 struct tcp_log_buffer *lgb;
1447 int i;
1448
1449 memset(&log, 0, sizeof(log));
1450 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
1451 for (i = 0; i < TCP_NUM_CNT_COUNTERS; i++) {
1452 log.u_raw.u64_flex[i] = tp->tcp_cnt_counters[i];
1453 }
1454 lgb = tcp_log_event(tp, NULL,
1455 NULL,
1456 NULL,
1457 TCP_LOG_ACCOUNTING, 0,
1458 0, &log, false, NULL, NULL, 0, &tv);
1459 if (lgb != NULL) {
1460 lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
1461 lgb->tlb_flex2 = 1;
1462 } else
1463 goto skip_out;
1464 for (i = 0; i<TCP_NUM_CNT_COUNTERS; i++) {
1465 log.u_raw.u64_flex[i] = tp->tcp_proc_time[i];
1466 }
1467 lgb = tcp_log_event(tp, NULL,
1468 NULL,
1469 NULL,
1470 TCP_LOG_ACCOUNTING, 0,
1471 0, &log, false, NULL, NULL, 0, &tv);
1472 if (lgb != NULL) {
1473 lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
1474 lgb->tlb_flex2 = 2;
1475 }
1476 }
1477 skip_out:
1478 #endif
1479 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1480 log.u_bbr.cur_del_rate = tp->t_end_info;
1481 (void)tcp_log_event(tp, NULL,
1482 NULL,
1483 NULL,
1484 TCP_LOG_CONNEND, 0,
1485 0, &log, false, NULL, NULL, 0, &tv);
1486 }
1487 /*
1488 * If we were gathering packets to be automatically dumped, try to do
1489 * it now. If this succeeds, the log information in the TCPCB will be
1490 * cleared. Otherwise, we'll handle the log information as we do
1491 * for other states.
1492 */
1493 switch(tp->_t_logstate) {
1494 case TCP_LOG_STATE_HEAD_AUTO:
1495 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
1496 M_NOWAIT, false);
1497 break;
1498 case TCP_LOG_STATE_TAIL_AUTO:
1499 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
1500 M_NOWAIT, false);
1501 break;
1502 case TCP_LOG_VIA_BBPOINTS:
1503 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
1504 M_NOWAIT, false);
1505 break;
1506 case TCP_LOG_STATE_CONTINUAL:
1507 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
1508 M_NOWAIT, false);
1509 break;
1510 }
1511
1512 /*
1513 * There are two ways we could keep logs: per-socket or per-ID. If
1514 * we are tracking logs with an ID, then the logs survive the
1515 * destruction of the TCPCB.
1516 *
1517 * If the TCPCB is associated with an ID node, move the logs from the
1518 * TCPCB to the ID node. In theory, this is safe, for reasons which I
1519 * will now explain for my own benefit when I next need to figure out
1520 * this code. :-)
1521 *
1522 * We own the INP lock. Therefore, no one else can change the contents
1523 * of this node (Rule C). Further, no one can remove this node from
1524 * the bucket while we hold the lock (Rule D). Basically, no one can
1525 * mess with this node. That leaves two states in which we could be:
1526 *
1527 * 1. Another thread is currently waiting to acquire the INP lock, with
1528 * plans to do something with this node. When we drop the INP lock,
1529 * they will have a chance to do that. They will recheck the
1530 * tln_closed field (see note to Rule C) and then acquire the
1531 * bucket lock before proceeding further.
1532 *
1533 * 2. Another thread will try to acquire a lock at some point in the
1534 * future. If they try to acquire a lock before we set the
1535 * tln_closed field, they will follow state #1. If they try to
1536 * acquire a lock after we set the tln_closed field, they will be
1537 * able to make changes to the node, at will, following Rule C.
1538 *
1539 * Therefore, we currently own this node and can make any changes
1540 * we want. But, as soon as we set the tln_closed field to true, we
1541 * have effectively dropped our lock on the node. (For this reason, we
1542 * also need to make sure our writes are ordered correctly. An atomic
1543 * operation with "release" semantics should be sufficient.)
1544 */
1545
1546 if (tp->t_lin != NULL) {
1547 struct inpcb *inp = tptoinpcb(tp);
1548
1549 /* Copy the relevant information to the log entry. */
1550 tln = tp->t_lin;
1551 KASSERT(tln->tln_inp == inp,
1552 ("%s: Mismatched inp (tln->tln_inp=%p, tp inpcb=%p)",
1553 __func__, tln->tln_inp, inp));
1554 tcp_log_move_tp_to_node(tp, tln);
1555
1556 /* Clear information from the PCB. */
1557 tp->t_lin = NULL;
1558 tp->t_lib = NULL;
1559
1560 /*
1561 * Take a reference on the INP. This ensures that the INP
1562 * remains valid while the node is on the expiry queue. This
1563 * ensures the INP is valid for other threads that may be
1564 * racing to lock this node when we move it to the expire
1565 * queue.
1566 */
1567 in_pcbref(inp);
1568
1569 /*
1570 * Store the entry on the expiry list. The exact behavior
1571 * depends on whether we have entries to keep. If so, we
1572 * put the entry at the tail of the list and expire in
1573 * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put
1574 * the entry at the head of the list. (Handling the cleanup
1575 * via the expiry timer lets us avoid locking messy-ness here.)
1576 */
1577 tln->tln_expiretime = getsbinuptime();
1578 TCPLOG_EXPIREQ_LOCK();
1579 if (tln->tln_count) {
1580 tln->tln_expiretime += TCP_LOG_EXPIRE_TIME;
1581 if (STAILQ_EMPTY(&tcp_log_expireq_head) &&
1582 !callout_active(&tcp_log_expireq_callout)) {
1583 /*
1584 * We are adding the first entry and a callout
1585 * is not currently scheduled; therefore, we
1586 * need to schedule one.
1587 */
1588 callout_reset_sbt(&tcp_log_expireq_callout,
1589 tln->tln_expiretime, SBT_1S, tcp_log_expire,
1590 NULL, C_ABSOLUTE);
1591 }
1592 STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln,
1593 tln_expireq);
1594 } else {
1595 callouttime = tln->tln_expiretime +
1596 TCP_LOG_EXPIRE_INTVL;
1597 tln_first = STAILQ_FIRST(&tcp_log_expireq_head);
1598
1599 if ((tln_first == NULL ||
1600 callouttime < tln_first->tln_expiretime) &&
1601 (callout_pending(&tcp_log_expireq_callout) ||
1602 !callout_active(&tcp_log_expireq_callout))) {
1603 /*
1604 * The list is empty, or we want to run the
1605 * expire code before the first entry's timer
1606 * fires. Also, we are in a case where a callout
1607 * is not actively running. We want to reset
1608 * the callout to occur sooner.
1609 */
1610 callout_reset_sbt(&tcp_log_expireq_callout,
1611 callouttime, SBT_1S, tcp_log_expire, NULL,
1612 C_ABSOLUTE);
1613 }
1614
1615 /*
1616 * Insert to the head, or just after the head, as
1617 * appropriate. (This might result in small
1618 * mis-orderings as a bunch of "expire now" entries
1619 * gather at the start of the list, but that should
1620 * not produce big problems, since the expire timer
1621 * will walk through all of them.)
1622 */
1623 if (tln_first == NULL ||
1624 tln->tln_expiretime < tln_first->tln_expiretime)
1625 STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln,
1626 tln_expireq);
1627 else
1628 STAILQ_INSERT_AFTER(&tcp_log_expireq_head,
1629 tln_first, tln, tln_expireq);
1630 }
1631 TCPLOG_EXPIREQ_UNLOCK();
1632
1633 /*
1634 * We are done messing with the tln. After this point, we
1635 * can't touch it. (Note that the "release" semantics should
1636 * be included with the TCPLOG_EXPIREQ_UNLOCK() call above.
1637 * Therefore, they should be unnecessary here. However, it
1638 * seems like a good idea to include them anyway, since we
1639 * really are releasing a lock here.)
1640 */
1641 atomic_store_rel_int(&tln->tln_closed, 1);
1642 } else {
1643 /* Remove log entries. */
1644 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1645 tcp_log_remove_log_head(tp, log_entry);
1646 KASSERT(tp->t_lognum == 0,
1647 ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
1648 __func__, tp->t_lognum));
1649 }
1650
1651 /*
1652 * Change the log state to off (just in case anything tries to sneak
1653 * in a last-minute log).
1654 */
1655 tp->_t_logstate = TCP_LOG_STATE_OFF;
1656 }
1657
1658 static void
1659 tcp_log_purge_tp_logbuf(struct tcpcb *tp)
1660 {
1661 struct tcp_log_mem *log_entry;
1662
1663 INP_WLOCK_ASSERT(tptoinpcb(tp));
1664 if (tp->t_lognum == 0)
1665 return;
1666
1667 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1668 tcp_log_remove_log_head(tp, log_entry);
1669 KASSERT(tp->t_lognum == 0,
1670 ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
1671 __func__, tp->t_lognum));
1672 tp->_t_logstate = TCP_LOG_STATE_OFF;
1673 }
1674
1675 /*
1676 * This logs an event for a TCP socket. Normally, this is called via
1677 * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for
1678 * TCP_LOG_EVENT().
1679 */
1680
1681 struct tcp_log_buffer *
1682 tcp_log_event(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
1683 struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
1684 union tcp_log_stackspecific *stackinfo, int th_hostorder,
1685 const char *output_caller, const char *func, int line, const struct timeval *itv)
1686 {
1687 struct tcp_log_mem *log_entry;
1688 struct tcp_log_buffer *log_buf;
1689 int attempt_count = 0;
1690 struct tcp_log_verbose *log_verbose;
1691 uint32_t logsn;
1692
1693 KASSERT((func == NULL && line == 0) || (func != NULL && line > 0),
1694 ("%s called with inconsistent func (%p) and line (%d) arguments",
1695 __func__, func, line));
1696
1697 INP_WLOCK_ASSERT(tptoinpcb(tp));
1698 if (tcp_disable_all_bb_logs) {
1699 /*
1700 * The global switch disabling all black box
1701 * logging has been thrown. Call the purge
1702 * function, which frees the log
1703 * entries and turns off logging for
1704 * this connection.
1705 */
1706 tcp_log_purge_tp_logbuf(tp);
1707 return (NULL);
1708 }
1709 KASSERT(tp->_t_logstate == TCP_LOG_STATE_HEAD ||
1710 tp->_t_logstate == TCP_LOG_STATE_TAIL ||
1711 tp->_t_logstate == TCP_LOG_STATE_CONTINUAL ||
1712 tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
1713 tp->_t_logstate == TCP_LOG_VIA_BBPOINTS ||
1714 tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO,
1715 ("%s called with unexpected tp->_t_logstate (%d)", __func__,
1716 tp->_t_logstate));
1717
1718 /*
1719 * Get the serial number. We do this early so it will
1720 * increment even if we end up skipping the log entry for some
1721 * reason.
1722 */
1723 logsn = tp->t_logsn++;
1724
1725 /*
1726 * Can we get a new log entry? If so, increment the lognum counter
1727 * here.
1728 */
1729 retry:
1730 if (tp->t_lognum < tp->t_loglimit) {
1731 if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL)
1732 tp->t_lognum++;
1733 } else
1734 log_entry = NULL;
1735
1736 /* Do we need to try to reuse? */
1737 if (log_entry == NULL) {
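/*
 * Allocation failed or this session is at its log limit.
 * Depending on the logging mode, we either stop logging for
 * the session, dump the existing buffers (retrying the
 * allocation once), or recycle the oldest entry below.
 */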
1738 /*
1739 * Sacrifice auto-logged sessions without a log ID if
1740 * tcp_log_auto_all is false. (If they don't have a log
1741 * ID by now, it is probable that either they won't get one
1742 * or we are resource-constrained.)
1743 */
1744 if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
1745 !tcp_log_auto_all) {
1746 if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) {
1747 #ifdef INVARIANTS
1748 panic("%s:%d: tcp_log_state_change() failed "
1749 "to set tp %p to TCP_LOG_STATE_CLEAR",
1750 __func__, __LINE__, tp);
1751 #endif
1752 tp->_t_logstate = TCP_LOG_STATE_OFF;
1753 }
1754 return (NULL);
1755 }
1756 /*
1757 * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump
1758 * the buffers. If successful, deactivate tracing. Otherwise,
1759 * leave it active so we will retry.
1760 */
1761 if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
1762 !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
1763 M_NOWAIT, false)) {
1764 tp->_t_logstate = TCP_LOG_STATE_OFF;
1765 return (NULL);
1766 } else if ((tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) &&
1767 !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
1768 M_NOWAIT, false)) {
1769 if (attempt_count == 0) {
1770 attempt_count++;
1771 goto retry;
1772 }
1773 #ifdef TCPLOG_DEBUG_COUNTERS
1774 counter_u64_add(tcp_log_que_fail4, 1);
1775 #endif
1776 return (NULL);
1777
1778 } else if ((tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) &&
1779 !tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
1780 M_NOWAIT, false)) {
1781 if (attempt_count == 0) {
1782 attempt_count++;
1783 goto retry;
1784 }
1785 #ifdef TCPLOG_DEBUG_COUNTERS
1786 counter_u64_add(tcp_log_que_fail4, 1);
1787 #endif
1788 return (NULL);
1789 } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO)
1790 return (NULL);
1791
1792 /* If in HEAD state, just deactivate the tracing and return. */
1793 if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
1794 tp->_t_logstate = TCP_LOG_STATE_OFF;
1795 return (NULL);
1796 }
1797 /*
1798 * Get a buffer to reuse. If that fails, just give up.
1799 * (We can't log anything without a buffer in which to
1800 * put it.)
1801 *
1802 * Note that we don't change the t_lognum counter
1803 * here. Because we are re-using the buffer, the total
1804 * number won't change.
1805 */
1806 if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL)
1807 return (NULL);
1808 STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
1809 tcp_log_entry_refcnt_rem(log_entry);
1810 }
1811
1812 KASSERT(log_entry != NULL,
1813 ("%s: log_entry unexpectedly NULL", __func__));
1814
1815 /* Extract the log buffer and verbose buffer pointers. */
1816 log_buf = &log_entry->tlm_buf;
1817 log_verbose = &log_entry->tlm_v;
1818
1819 /* Basic entries. */
1820 if (itv == NULL)
1821 microuptime(&log_buf->tlb_tv);
1822 else
1823 memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
1824 log_buf->tlb_ticks = ticks;
1825 log_buf->tlb_sn = logsn;
1826 log_buf->tlb_stackid = tp->t_fb->tfb_id;
1827 log_buf->tlb_eventid = eventid;
1828 log_buf->tlb_eventflags = 0;
1829 log_buf->tlb_errno = errornum;
1830
1831 /* Socket buffers */
1832 if (rxbuf != NULL) {
1833 log_buf->tlb_eventflags |= TLB_FLAG_RXBUF;
1834 log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
1835 log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
1836 log_buf->tlb_rxbuf.tls_sb_spare = 0;
1837 } else {
1838 log_buf->tlb_rxbuf.tls_sb_acc = 0;
1839 log_buf->tlb_rxbuf.tls_sb_ccc = 0;
1840 }
1841 if (txbuf != NULL) {
1842 log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
1843 log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
1844 log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
1845 log_buf->tlb_txbuf.tls_sb_spare = 0;
1846 } else {
1847 log_buf->tlb_txbuf.tls_sb_acc = 0;
1848 log_buf->tlb_txbuf.tls_sb_ccc = 0;
1849 }
1850 /* Copy values from tp to the log entry. */
1851 log_buf->tlb_state = tp->t_state;
1852 log_buf->tlb_starttime = tp->t_starttime;
1853 log_buf->tlb_iss = tp->iss;
1854 log_buf->tlb_flags = tp->t_flags;
1855 log_buf->tlb_snd_una = tp->snd_una;
1856 log_buf->tlb_snd_max = tp->snd_max;
1857 log_buf->tlb_snd_cwnd = tp->snd_cwnd;
1858 log_buf->tlb_snd_nxt = tp->snd_nxt;
1859 log_buf->tlb_snd_recover = tp->snd_recover;
1860 log_buf->tlb_snd_wnd = tp->snd_wnd;
1861 log_buf->tlb_snd_ssthresh = tp->snd_ssthresh;
1862 log_buf->tlb_srtt = tp->t_srtt;
1863 log_buf->tlb_rttvar = tp->t_rttvar;
1864 log_buf->tlb_rcv_up = tp->rcv_up;
1865 log_buf->tlb_rcv_adv = tp->rcv_adv;
1866 log_buf->tlb_flags2 = tp->t_flags2;
1867 log_buf->tlb_rcv_nxt = tp->rcv_nxt;
1868 log_buf->tlb_rcv_wnd = tp->rcv_wnd;
1869 log_buf->tlb_dupacks = tp->t_dupacks;
1870 log_buf->tlb_segqlen = tp->t_segqlen;
1871 log_buf->tlb_snd_numholes = tp->snd_numholes;
1872 log_buf->tlb_flex1 = 0;
1873 log_buf->tlb_flex2 = 0;
1874 log_buf->tlb_fbyte_in = tp->t_fbyte_in;
1875 log_buf->tlb_fbyte_out = tp->t_fbyte_out;
1876 log_buf->tlb_snd_scale = tp->snd_scale;
1877 log_buf->tlb_rcv_scale = tp->rcv_scale;
1878 log_buf->_pad[0] = 0;
1879 log_buf->_pad[1] = 0;
1880 log_buf->_pad[2] = 0;
1881 /* Copy stack-specific info. */
1882 if (stackinfo != NULL) {
1883 memcpy(&log_buf->tlb_stackinfo, stackinfo,
1884 sizeof(log_buf->tlb_stackinfo));
1885 log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO;
1886 }
1887
1888 /* The packet */
1889 log_buf->tlb_len = len;
1890 if (th) {
1891 int optlen;
1892
1893 log_buf->tlb_eventflags |= TLB_FLAG_HDR;
1894 log_buf->tlb_th = *th;
1895 if (th_hostorder)
1896 tcp_fields_to_net(&log_buf->tlb_th);
1897 optlen = (th->th_off << 2) - sizeof (struct tcphdr);
1898 if (optlen > 0)
1899 memcpy(log_buf->tlb_opts, th + 1, optlen);
1900 } else {
1901 memset(&log_buf->tlb_th, 0, sizeof(*th));
1902 }
1903
1904 /* Verbose information */
1905 if (func != NULL) {
1906 log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE;
1907 if (output_caller != NULL)
1908 strlcpy(log_verbose->tlv_snd_frm, output_caller,
1909 TCP_FUNC_LEN);
1910 else
1911 *log_verbose->tlv_snd_frm = 0;
1912 strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN);
1913 log_verbose->tlv_trace_line = line;
1914 }
1915
1916 /* Insert the new log at the tail. */
1917 STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue);
1918 tcp_log_entry_refcnt_add(log_entry);
1919 return (log_buf);
1920 }
1921
1922 /*
1923 * Change the logging state for a TCPCB. Returns 0 on success or an
1924 * error code on failure.
1925 */
1926 int
1927 tcp_log_state_change(struct tcpcb *tp, int state)
1928 {
1929 struct tcp_log_mem *log_entry;
1930 int rv;
1931
1932 INP_WLOCK_ASSERT(tptoinpcb(tp));
1933 rv = 0;
1934 switch (state) {
1935 case TCP_LOG_STATE_CLEAR:
1936 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1937 tcp_log_remove_log_head(tp, log_entry);
1938 /* FALLTHROUGH */
1939
1940 case TCP_LOG_STATE_OFF:
1941 tp->_t_logstate = TCP_LOG_STATE_OFF;
1942 break;
1943
1944 case TCP_LOG_STATE_TAIL:
1945 case TCP_LOG_STATE_HEAD:
1946 case TCP_LOG_STATE_CONTINUAL:
1947 case TCP_LOG_VIA_BBPOINTS:
1948 case TCP_LOG_STATE_HEAD_AUTO:
1949 case TCP_LOG_STATE_TAIL_AUTO:
1950 /*
1951 * When the RATIO_OFF state is set for the bucket, the log ID
1952 * this tp is associated with has been probabilistically opted
1953 * out of logging per tcp_log_apply_ratio().
1954 */
1955 if (tp->t_lib == NULL ||
1956 tp->t_lib->tlb_logstate != TCP_LOG_STATE_RATIO_OFF) {
1957 tp->_t_logstate = state;
1958 } else {
1959 rv = ECANCELED;
1960 tp->_t_logstate = TCP_LOG_STATE_OFF;
1961 }
1962 break;
1963
1964 default:
1965 return (EINVAL);
1966 }
1967 if (tcp_disable_all_bb_logs) {
1968 /* We are prohibited from doing any logs */
1969 tp->_t_logstate = TCP_LOG_STATE_OFF;
1970 rv = EBUSY;
1971 }
1972 tp->t_flags2 &= ~(TF2_LOG_AUTO);
1973
1974 return (rv);
1975 }
1976
1977 /* If tcp_drain() is called, flush half the log entries. */
1978 void
1979 tcp_log_drain(struct tcpcb *tp)
1980 {
1981 struct tcp_log_mem *log_entry, *next;
1982 int target, skip;
1983
1984 INP_WLOCK_ASSERT(tptoinpcb(tp));
1985 if ((target = tp->t_lognum / 2) == 0)
1986 return;
1987
1988 /*
1989 * XXXRRS: At this point I don't think it is wise
1990 * that we do this. All that a drain call means is that
1991 * we are hitting one of the system mbuf limits. BB
1992 * log buffers are not mbufs, so freeing them will not
1993 * produce any more mbufs and really has nothing to do with
1994 * the system running out of mbufs. For now I
1995 * am changing this to free any "AUTO" sessions by dumping
1996 * them out. But this should either be changed
1997 * so that it gets called when we hit the BB limit,
1998 * or it should just not get called (one of the two),
1999 * since I don't think the mbuf <-> BB log cleanup
2000 * is the right thing to do here.
2001 */
2002 /*
2003 * If we are logging the "head" packets, we want to discard
2004 * from the tail of the queue. Otherwise, we want to discard
2005 * from the head.
2006 */
2007 if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
2008 skip = tp->t_lognum - target;
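/*
 * Walk forward to the last entry we intend to keep; the loop
 * below then frees everything that follows it.
 */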
2009 STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue)
2010 if (!--skip)
2011 break;
2012 KASSERT(log_entry != NULL,
2013 ("%s: skipped through all entries!", __func__));
2014 if (log_entry == NULL)
2015 return;
2016 while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) {
2017 STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue);
2018 tcp_log_entry_refcnt_rem(next);
2019 tcp_log_remove_log_cleanup(tp, next);
2020 #ifdef INVARIANTS
2021 target--;
2022 #endif
2023 }
2024 KASSERT(target == 0,
2025 ("%s: After removing from tail, target was %d", __func__,
2026 target));
2027 } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
2028 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at drain",
2029 M_NOWAIT, false);
2030 } else if (tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO) {
2031 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at drain",
2032 M_NOWAIT, false);
2033 } else if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) {
2034 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
2035 M_NOWAIT, false);
2036 } else if (tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) {
2037 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
2038 M_NOWAIT, false);
2039 } else {
2040 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL &&
2041 target--)
2042 tcp_log_remove_log_head(tp, log_entry);
2043 KASSERT(target <= 0,
2044 ("%s: After removing from head, target was %d", __func__,
2045 target));
2046 KASSERT(tp->t_lognum > 0,
2047 ("%s: After removing from head, tp->t_lognum was %d",
2048 __func__, tp->t_lognum));
2049 KASSERT(log_entry != NULL,
2050 ("%s: After removing from head, the tailq was empty",
2051 __func__));
2052 }
2053 }
2054
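/*
 * Copy data to the destination described by the sockopt. If sopt_td is
 * set, the destination is a userspace buffer and copyout() is used;
 * otherwise (e.g., when called via tcp_log_expandlogbuf()), the
 * destination is a kernel buffer and a plain bcopy() suffices.
 */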
2055 static inline int
2056 tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len)
2057 {
2058
2059 if (sopt->sopt_td != NULL)
2060 return (copyout(src, dst, len));
2061 bcopy(src, dst, len);
2062 return (0);
2063 }
2064
2065 static int
2066 tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp,
2067 struct tcp_log_buffer **end, int count)
2068 {
2069 struct tcp_log_buffer *out_entry;
2070 struct tcp_log_mem *log_entry;
2071 size_t entrysize;
2072 int error;
2073 #ifdef INVARIANTS
2074 int orig_count = count;
2075 #endif
2076
2077 /* Copy the data out. */
2078 error = 0;
2079 out_entry = (struct tcp_log_buffer *) sopt->sopt_val;
2080 STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) {
2081 count--;
2082 KASSERT(count >= 0,
2083 ("%s:%d: Exceeded expected count (%d) processing list %p",
2084 __func__, __LINE__, orig_count, log_tailqp));
2085
2086 #ifdef TCPLOG_DEBUG_COUNTERS
2087 counter_u64_add(tcp_log_que_copyout, 1);
2088 #endif
2089
2090 /*
2091 * Skip copying out the header if it isn't present.
2092 * Instead, copy out zeros (to ensure we don't leak info).
2093 * TODO: Make sure we truly do zero everything we don't
2094 * explicitly set.
2095 */
2096 if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)
2097 entrysize = sizeof(struct tcp_log_buffer);
2098 else
2099 entrysize = offsetof(struct tcp_log_buffer, tlb_th);
2100 error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry,
2101 entrysize);
2102 if (error)
2103 break;
2104 if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) {
2105 error = tcp_log_copyout(sopt, zerobuf,
2106 ((uint8_t *)out_entry) + entrysize,
2107 sizeof(struct tcp_log_buffer) - entrysize);
2108 }
2109
2110 /*
2111 * Copy out the verbose bit, if needed. Either way,
2112 * increment the output pointer the correct amount.
2113 */
2114 if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) {
2115 error = tcp_log_copyout(sopt, &log_entry->tlm_v,
2116 out_entry->tlb_verbose,
2117 sizeof(struct tcp_log_verbose));
2118 if (error)
2119 break;
2120 out_entry = (struct tcp_log_buffer *)
2121 (((uint8_t *) (out_entry + 1)) +
2122 sizeof(struct tcp_log_verbose));
2123 } else
2124 out_entry++;
2125 }
2126 *end = out_entry;
2127 KASSERT(error || count == 0,
2128 ("%s:%d: Less than expected count (%d) processing list %p"
2129 " (%d remain)", __func__, __LINE__, orig_count,
2130 log_tailqp, count));
2131
2132 return (error);
2133 }
2134
2135 /*
2136 * Copy out the buffer. Note that we do incremental copying, so
2137 * sooptcopyout() won't work. However, the goal is to produce the same
2138 * end result as if we copied in the entire user buffer, updated it,
2139 * and then used sooptcopyout() to copy it out.
2140 *
2141 * NOTE: This should be called with a write lock on the PCB; however,
2142 * the function will drop it after it extracts the data from the TCPCB.
2143 */
2144 int
2145 tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp)
2146 {
2147 struct tcp_log_stailq log_tailq;
2148 struct tcp_log_mem *log_entry, *log_next;
2149 struct tcp_log_buffer *out_entry;
2150 struct inpcb *inp = tptoinpcb(tp);
2151 size_t outsize, entrysize;
2152 int error, outnum;
2153
2154 INP_WLOCK_ASSERT(inp);
2155
2156 /*
2157 * Determine which log entries will fit in the buffer. As an
2158 * optimization, skip this if all the entries will clearly fit
2159 * in the buffer. (However, get an exact size if we are using
2160 * INVARIANTS.)
2161 */
2162 #ifndef INVARIANTS
2163 if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) +
2164 sizeof(struct tcp_log_verbose)) >= tp->t_lognum) {
2165 log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue);
2166 log_next = NULL;
2167 outsize = 0;
2168 outnum = tp->t_lognum;
2169 } else {
2170 #endif
2171 outsize = outnum = 0;
2172 log_entry = NULL;
2173 STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) {
2174 entrysize = sizeof(struct tcp_log_buffer);
2175 if (log_next->tlm_buf.tlb_eventflags &
2176 TLB_FLAG_VERBOSE)
2177 entrysize += sizeof(struct tcp_log_verbose);
2178 if ((sopt->sopt_valsize - outsize) < entrysize)
2179 break;
2180 outsize += entrysize;
2181 outnum++;
2182 log_entry = log_next;
2183 }
2184 KASSERT(outsize <= sopt->sopt_valsize,
2185 ("%s: calculated output size (%zu) greater than available"
2186 "space (%zu)", __func__, outsize, sopt->sopt_valsize));
2187 #ifndef INVARIANTS
2188 }
2189 #endif
2190
2191 /*
2192 * Copy traditional sooptcopyout() behavior: if sopt->sopt_val
2193 * is NULL, silently skip the copy. However, in this case, we
2194 * will leave the list alone and return. Functionally, this
2195 * gives userspace a way to poll for an approximate buffer
2196 * size they will need to get the log entries.
2197 */
2198 if (sopt->sopt_val == NULL) {
2199 INP_WUNLOCK(inp);
2200 if (outsize == 0) {
2201 outsize = outnum * (sizeof(struct tcp_log_buffer) +
2202 sizeof(struct tcp_log_verbose));
2203 }
2204 if (sopt->sopt_valsize > outsize)
2205 sopt->sopt_valsize = outsize;
2206 return (0);
2207 }
2208
2209 /*
2210 * Break apart the list. We'll save the ones we want to copy
2211 * out locally and remove them from the TCPCB list. We can
2212 * then drop the INPCB lock while we do the copyout.
2213 *
2214 * There are roughly three cases:
2215 * 1. There was nothing to copy out. That's easy: drop the
2216 * lock and return.
2217 * 2. We are copying out the entire list. Again, that's easy:
2218 * move the whole list.
2219 * 3. We are copying out a partial list. That's harder. We
2220 * need to update the list book-keeping entries.
2221 */
2222 if (log_entry != NULL && log_next == NULL) {
2223 /* Move entire list. */
2224 KASSERT(outnum == tp->t_lognum,
2225 ("%s:%d: outnum (%d) should match tp->t_lognum (%d)",
2226 __func__, __LINE__, outnum, tp->t_lognum));
2227 log_tailq = tp->t_logs;
2228 tp->t_lognum = 0;
2229 STAILQ_INIT(&tp->t_logs);
2230 } else if (log_entry != NULL) {
2231 /* Move partial list. */
2232 KASSERT(outnum < tp->t_lognum,
2233 ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)",
2234 __func__, __LINE__, outnum, tp->t_lognum));
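/*
 * log_entry is the last entry that fits in the caller's buffer.
 * Split the queue after it: the first outnum entries move to
 * log_tailq and the remainder stay on tp->t_logs, fixing up the
 * head and tail pointers of both lists by hand.
 */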
2235 STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs);
2236 STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue);
2237 KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL,
2238 ("%s:%d: tp->t_logs is unexpectedly shorter than expected"
2239 "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)",
2240 __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum));
2241 STAILQ_NEXT(log_entry, tlm_queue) = NULL;
2242 log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue);
2243 tp->t_lognum -= outnum;
2244 } else
2245 STAILQ_INIT(&log_tailq);
2246
2247 /* Drop the PCB lock. */
2248 INP_WUNLOCK(inp);
2249
2250 /* Copy the data out. */
2251 error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum);
2252
2253 if (error) {
2254 /* Restore list */
2255 INP_WLOCK(inp);
2256 if ((inp->inp_flags & INP_DROPPED) == 0) {
2257 tp = intotcpcb(inp);
2258
2259 /* Merge the two lists. */
2260 STAILQ_CONCAT(&log_tailq, &tp->t_logs);
2261 tp->t_logs = log_tailq;
2262 tp->t_lognum += outnum;
2263 }
2264 INP_WUNLOCK(inp);
2265 } else {
2266 /* Sanity check entries */
2267 KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) ==
2268 outsize, ("%s: Actual output size (%zu) != "
2269 "calculated output size (%zu)", __func__,
2270 (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val),
2271 outsize));
2272
2273 /* Free the entries we just copied out. */
2274 STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) {
2275 tcp_log_entry_refcnt_rem(log_entry);
2276 uma_zfree(tcp_log_zone, log_entry);
2277 }
2278 }
2279
2280 sopt->sopt_valsize = (size_t)((caddr_t)out_entry -
2281 (caddr_t)sopt->sopt_val);
2282 return (error);
2283 }
2284
2285 static void
2286 tcp_log_free_queue(struct tcp_log_dev_queue *param)
2287 {
2288 struct tcp_log_dev_log_queue *entry;
2289
2290 KASSERT(param != NULL, ("%s: called with NULL param", __func__));
2291 if (param == NULL)
2292 return;
2293
2294 entry = (struct tcp_log_dev_log_queue *)param;
2295
2296 /* Free the entries. */
2297 tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
2298
2299 /* Free the buffer, if it is allocated. */
2300 if (entry->tldl_common.tldq_buf != NULL)
2301 free(entry->tldl_common.tldq_buf, M_TCPLOGDEV);
2302
2303 /* Free the queue entry. */
2304 free(entry, M_TCPLOGDEV);
2305 }
2306
2307 static struct tcp_log_common_header *
2308 tcp_log_expandlogbuf(struct tcp_log_dev_queue *param)
2309 {
2310 struct tcp_log_dev_log_queue *entry;
2311 struct tcp_log_header *hdr;
2312 uint8_t *end;
2313 struct sockopt sopt;
2314 int error;
2315
2316 entry = (struct tcp_log_dev_log_queue *)param;
2317
2318 /* Take a worst-case guess at space needs. */
2319 sopt.sopt_valsize = sizeof(struct tcp_log_header) +
2320 entry->tldl_count * (sizeof(struct tcp_log_buffer) +
2321 sizeof(struct tcp_log_verbose));
2322 hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT);
2323 if (hdr == NULL) {
2324 #ifdef TCPLOG_DEBUG_COUNTERS
2325 counter_u64_add(tcp_log_que_fail5, entry->tldl_count);
2326 #endif
2327 return (NULL);
2328 }
2329 sopt.sopt_val = hdr + 1;
2330 sopt.sopt_valsize -= sizeof(struct tcp_log_header);
2331 sopt.sopt_td = NULL;
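/*
 * A NULL sopt_td tells tcp_log_copyout() to bcopy() into this
 * kernel buffer rather than copyout() to userspace.
 */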
2332
2333 error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries,
2334 (struct tcp_log_buffer **)&end, entry->tldl_count);
2335 if (error) {
2336 free(hdr, M_TCPLOGDEV);
2337 return (NULL);
2338 }
2339
2340 /* Free the entries. */
2341 tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
2342 entry->tldl_count = 0;
2343
2344 memset(hdr, 0, sizeof(struct tcp_log_header));
2345 hdr->tlh_version = TCP_LOG_BUF_VER;
2346 hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR;
2347 hdr->tlh_length = end - (uint8_t *)hdr;
2348 hdr->tlh_ie = entry->tldl_ie;
2349 hdr->tlh_af = entry->tldl_af;
2350 getboottime(&hdr->tlh_offset);
2351 strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN);
2352 strlcpy(hdr->tlh_tag, entry->tldl_tag, TCP_LOG_TAG_LEN);
2353 strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN);
2354 return ((struct tcp_log_common_header *)hdr);
2355 }
2356
2357 /*
2358 * Queue the tcpcb's log buffer for transmission via the log buffer facility.
2359 *
2360 * NOTE: This should be called with a write lock on the PCB.
2361 *
2362 * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
2363 * and reacquire the INP lock if it needs to do so.
2364 *
2365 * If force is false, this will only dump auto-logged sessions if
2366 * tcp_log_auto_all is true or if there is a log ID defined for the session.
2367 */
2368 int
2369 tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force)
2370 {
2371 struct tcp_log_dev_log_queue *entry;
2372 struct inpcb *inp = tptoinpcb(tp);
2373 #ifdef TCPLOG_DEBUG_COUNTERS
2374 int num_entries;
2375 #endif
2376
2377 INP_WLOCK_ASSERT(inp);
2378
2379 /* If there are no log entries, there is nothing to do. */
2380 if (tp->t_lognum == 0)
2381 return (0);
2382
2383 /* Check for a log ID. */
2384 if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
2385 !tcp_log_auto_all && !force) {
2386 struct tcp_log_mem *log_entry;
2387
2388 /*
2389 * We needed a log ID and none was found. Free the log entries
2390 * and return success. Also, cancel further logging. If the
2391 * session doesn't have a log ID by now, we'll assume it isn't
2392 * going to get one.
2393 */
2394 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
2395 tcp_log_remove_log_head(tp, log_entry);
2396 KASSERT(tp->t_lognum == 0,
2397 ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
2398 __func__, tp->t_lognum));
2399 tp->_t_logstate = TCP_LOG_STATE_OFF;
2400 return (0);
2401 }
2402
2403 /*
2404 * Allocate memory. If we must wait, we'll need to drop the locks
2405 * and reacquire them (and do all the related business that goes
2406 * along with that).
2407 */
2408 entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
2409 M_NOWAIT);
2410 if (entry == NULL && (how & M_NOWAIT)) {
2411 #ifdef TCPLOG_DEBUG_COUNTERS
2412 counter_u64_add(tcp_log_que_fail3, 1);
2413 #endif
2414 return (ENOBUFS);
2415 }
2416 if (entry == NULL) {
2417 INP_WUNLOCK(inp);
2418 entry = malloc(sizeof(struct tcp_log_dev_log_queue),
2419 M_TCPLOGDEV, M_WAITOK);
2420 INP_WLOCK(inp);
2421 /*
2422 * Note that this check is slightly overly-restrictive in
2423 * that the TCB can survive either of these events.
2424 * However, there is currently not a good way to ensure
2425 * that is the case. So, if we hit this M_WAIT path, we
2426 * may end up dropping some entries. That seems like a
2427 * small price to pay for safety.
2428 */
2429 if (inp->inp_flags & INP_DROPPED) {
2430 free(entry, M_TCPLOGDEV);
2431 #ifdef TCPLOG_DEBUG_COUNTERS
2432 counter_u64_add(tcp_log_que_fail2, 1);
2433 #endif
2434 return (ECONNRESET);
2435 }
2436 tp = intotcpcb(inp);
2437 if (tp->t_lognum == 0) {
2438 free(entry, M_TCPLOGDEV);
2439 return (0);
2440 }
2441 }
2442
2443 /* Fill in the unique parts of the queue entry. */
2444 if (tp->t_lib != NULL) {
2445 strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
2446 strlcpy(entry->tldl_tag, tp->t_lib->tlb_tag, TCP_LOG_TAG_LEN);
2447 } else {
2448 strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN);
2449 strlcpy(entry->tldl_tag, "UNKNOWN", TCP_LOG_TAG_LEN);
2450 }
2451 if (reason != NULL)
2452 strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
2453 else
2454 strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
2455 entry->tldl_ie = inp->inp_inc.inc_ie;
2456 if (inp->inp_inc.inc_flags & INC_ISIPV6)
2457 entry->tldl_af = AF_INET6;
2458 else
2459 entry->tldl_af = AF_INET;
2460 entry->tldl_entries = tp->t_logs;
2461 entry->tldl_count = tp->t_lognum;
2462
2463 /* Fill in the common parts of the queue entry. */
2464 entry->tldl_common.tldq_buf = NULL;
2465 entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
2466 entry->tldl_common.tldq_dtor = tcp_log_free_queue;
2467
2468 /* Clear the log data from the TCPCB. */
2469 #ifdef TCPLOG_DEBUG_COUNTERS
2470 num_entries = tp->t_lognum;
2471 #endif
2472 tp->t_lognum = 0;
2473 STAILQ_INIT(&tp->t_logs);
2474
2475 /* Add the entry. If no one is listening, free the entry. */
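/*
 * Note the #ifdef placement below: without TCPLOG_DEBUG_COUNTERS,
 * the "else" branch and both counter updates compile away entirely.
 */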
2476 if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) {
2477 tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
2478 #ifdef TCPLOG_DEBUG_COUNTERS
2479 counter_u64_add(tcp_log_que_fail1, num_entries);
2480 } else {
2481 counter_u64_add(tcp_log_queued, num_entries);
2482 #endif
2483 }
2484 return (0);
2485 }
2486
2487 /*
2488 * Queue the log_id_node's log buffers for transmission via the log buffer
2489 * facility.
2490 *
2491 * NOTE: This should be called with the bucket locked and referenced.
2492 *
2493 * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
2494 * and reacquire the bucket lock if it needs to do so. (The caller must
2495 * ensure that the tln is no longer on any lists so no one else will mess
2496 * with this while the lock is dropped!)
2497 */
2498 static int
2499 tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how)
2500 {
2501 struct tcp_log_dev_log_queue *entry;
2502 struct tcp_log_id_bucket *tlb;
2503
2504 tlb = tln->tln_bucket;
2505 TCPID_BUCKET_LOCK_ASSERT(tlb);
2506 KASSERT(tlb->tlb_refcnt > 0,
2507 ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)",
2508 __func__, __LINE__, tln, tlb));
2509 KASSERT(tln->tln_closed,
2510 ("%s:%d: Called for node with tln_closed==false (tln=%p)",
2511 __func__, __LINE__, tln));
2512
2513 /* If there are no log entries, there is nothing to do. */
2514 if (tln->tln_count == 0)
2515 return (0);
2516
2517 /*
2518 * Allocate memory. If we must wait, we'll need to drop the locks
2519 * and reacquire them (and do all the related business that goes
2520 * along with that).
2521 */
2522 entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
2523 M_NOWAIT);
2524 if (entry == NULL && (how & M_NOWAIT))
2525 return (ENOBUFS);
2526 if (entry == NULL) {
2527 TCPID_BUCKET_UNLOCK(tlb);
2528 entry = malloc(sizeof(struct tcp_log_dev_log_queue),
2529 M_TCPLOGDEV, M_WAITOK);
2530 TCPID_BUCKET_LOCK(tlb);
2531 }
2532
2533 /* Fill in the common parts of the queue entry. */
2534 entry->tldl_common.tldq_buf = NULL;
2535 entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
2536 entry->tldl_common.tldq_dtor = tcp_log_free_queue;
2537
2538 /* Fill in the unique parts of the queue entry. */
2539 strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN);
2540 strlcpy(entry->tldl_tag, tlb->tlb_tag, TCP_LOG_TAG_LEN);
2541 if (reason != NULL)
2542 strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
2543 else
2544 strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
2545 entry->tldl_ie = tln->tln_ie;
2546 entry->tldl_entries = tln->tln_entries;
2547 entry->tldl_count = tln->tln_count;
2548 entry->tldl_af = tln->tln_af;
2549
2550 /* Add the entry. If no one is listening, free the entry. */
2551 if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry))
2552 tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
2553
2554 return (0);
2555 }
2556
2557 /*
2558 * Queue the log buffers for all sessions in a bucket for transmissions via
2559 * the log buffer facility.
2560 *
2561 * NOTE: This should be called with a locked bucket; however, the function
2562 * will drop the lock.
2563 */
2564 #define LOCAL_SAVE 10
2565 static void
2566 tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason)
2567 {
2568 struct tcp_log_id_node local_entries[LOCAL_SAVE];
2569 struct inpcb *inp;
2570 struct tcpcb *tp;
2571 struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln;
2572 int i, num_local_entries, tree_locked;
2573 bool expireq_locked;
2574
2575 TCPID_BUCKET_LOCK_ASSERT(tlb);
2576
2577 /*
2578 * Take a reference on the bucket to keep it from disappearing until
2579 * we are done.
2580 */
2581 TCPID_BUCKET_REF(tlb);
2582
2583 /*
2584 * We'll try to create these without dropping locks. However, we
2585 * might very well need to drop locks to get memory. If that's the
2586 * case, we'll save up to 10 on the stack, and sacrifice the rest.
2587 * (Otherwise, we need to worry about finding our place again in a
2588 * potentially changed list. It just doesn't seem worth the trouble
2589 * to do that.)
2590 */
2591 expireq_locked = false;
2592 num_local_entries = 0;
2593 prev_tln = NULL;
2594 tree_locked = TREE_UNLOCKED;
2595 SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) {
2596 /*
2597 * If this isn't associated with a TCPCB, we can pull it off
2598 * the list now. We need to be careful that the expire timer
2599 * hasn't already taken ownership (tln_expiretime == SBT_MAX).
2600 * If so, we let the expire timer code free the data.
2601 */
2602 if (cur_tln->tln_closed) {
2603 no_inp:
2604 /*
2605 * Get the expireq lock so we can get a consistent
2606 * read of tln_expiretime and so we can remove this
2607 * from the expireq.
2608 */
2609 if (!expireq_locked) {
2610 TCPLOG_EXPIREQ_LOCK();
2611 expireq_locked = true;
2612 }
2613
2614 /*
2615 * We ignore entries with tln_expiretime == SBT_MAX.
2616 * The expire timer code already owns those.
2617 */
2618 KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0,
2619 ("%s:%d: node on the expire queue without positive "
2620 "expire time", __func__, __LINE__));
2621 if (cur_tln->tln_expiretime == SBT_MAX) {
2622 prev_tln = cur_tln;
2623 continue;
2624 }
2625
2626 /* Remove the entry from the expireq. */
2627 STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln,
2628 tcp_log_id_node, tln_expireq);
2629
2630 /* Remove the entry from the bucket. */
2631 if (prev_tln != NULL)
2632 SLIST_REMOVE_AFTER(prev_tln, tln_list);
2633 else
2634 SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list);
2635
2636 /*
2637 * Drop the INP and bucket reference counts. Due to
2638 * lock-ordering rules, we need to drop the expire
2639 * queue lock.
2640 */
2641 TCPLOG_EXPIREQ_UNLOCK();
2642 expireq_locked = false;
2643
2644 /* Drop the INP reference. */
2645 INP_WLOCK(cur_tln->tln_inp);
2646 if (!in_pcbrele_wlocked(cur_tln->tln_inp))
2647 INP_WUNLOCK(cur_tln->tln_inp);
2648
2649 if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
2650 #ifdef INVARIANTS
2651 panic("%s: Bucket refcount unexpectedly 0.",
2652 __func__);
2653 #endif
2654 /*
2655 * Recover as best we can: free the entry we
2656 * own.
2657 */
2658 tcp_log_free_entries(&cur_tln->tln_entries,
2659 &cur_tln->tln_count);
2660 uma_zfree(tcp_log_id_node_zone, cur_tln);
2661 goto done;
2662 }
2663
2664 if (tcp_log_dump_node_logbuf(cur_tln, reason,
2665 M_NOWAIT)) {
2666 /*
2667 * If we have space, save the entries locally.
2668 * Otherwise, free them.
2669 */
2670 if (num_local_entries < LOCAL_SAVE) {
2671 local_entries[num_local_entries] =
2672 *cur_tln;
2673 num_local_entries++;
2674 } else {
2675 tcp_log_free_entries(
2676 &cur_tln->tln_entries,
2677 &cur_tln->tln_count);
2678 }
2679 }
2680
2681 /* No matter what, we are done with the node now. */
2682 uma_zfree(tcp_log_id_node_zone, cur_tln);
2683
2684 /*
2685 * Because we removed this entry from the list, prev_tln
2686 * (which tracks the previous entry still on the tlb
2687 * list) remains unchanged.
2688 */
2689 continue;
2690 }
2691
2692 /*
2693 * If we get to this point, the session data is still held in
2694 * the TCPCB. So, we need to pull the data out of that.
2695 *
2696 * We will need to drop the expireq lock so we can lock the INP.
2697 * We can then try to extract the data the "easy" way. If that
2698 * fails, we'll save the log entries for later.
2699 */
2700 if (expireq_locked) {
2701 TCPLOG_EXPIREQ_UNLOCK();
2702 expireq_locked = false;
2703 }
2704
2705 /* Lock the INP and then re-check the state. */
2706 inp = cur_tln->tln_inp;
2707 INP_WLOCK(inp);
2708 /*
2709 * If we caught this while it was transitioning, the data
2710 * might have moved from the TCPCB to the tln (signified by
2711 * setting tln_closed to true). If so, treat this like an
2712 * inactive connection.
2713 */
2714 if (cur_tln->tln_closed) {
2715 /*
2716 * It looks like we may have caught this connection
2717 * while it was transitioning from active to inactive.
2718 * Treat this like an inactive connection.
2719 */
2720 INP_WUNLOCK(inp);
2721 goto no_inp;
2722 }
2723
2724 /*
2725 * Try to dump the data from the tp without dropping the lock.
2726 * If this fails, try to save off the data locally.
2727 */
2728 tp = cur_tln->tln_tp;
2729 if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) &&
2730 num_local_entries < LOCAL_SAVE) {
2731 tcp_log_move_tp_to_node(tp,
2732 &local_entries[num_local_entries]);
2733 local_entries[num_local_entries].tln_closed = 1;
2734 KASSERT(local_entries[num_local_entries].tln_bucket ==
2735 tlb, ("%s: %d: bucket mismatch for node %p",
2736 __func__, __LINE__, cur_tln));
2737 num_local_entries++;
2738 }
2739
2740 INP_WUNLOCK(inp);
2741
2742 /*
2743 * We are going to leave the current tln on the list. It will
2744 * become the previous tln.
2745 */
2746 prev_tln = cur_tln;
2747 }
2748
2749 /* Drop our locks, if any. */
2750 KASSERT(tree_locked == TREE_UNLOCKED,
2751 ("%s: %d: tree unexpectedly locked", __func__, __LINE__));
2752 switch (tree_locked) {
2753 case TREE_WLOCKED:
2754 TCPID_TREE_WUNLOCK();
2755 tree_locked = TREE_UNLOCKED;
2756 break;
2757 case TREE_RLOCKED:
2758 TCPID_TREE_RUNLOCK();
2759 tree_locked = TREE_UNLOCKED;
2760 break;
2761 }
2762 if (expireq_locked) {
2763 TCPLOG_EXPIREQ_UNLOCK();
2764 expireq_locked = false;
2765 }
2766
2767 /*
2768 * Try again for any saved entries. tcp_log_dump_node_logbuf() is
2769 * guaranteed to free the log entries within the node. And, since
2770 * the node itself is on our stack, we don't need to free it.
2771 */
2772 for (i = 0; i < num_local_entries; i++)
2773 tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK);
2774
2775 /* Drop our reference. */
2776 if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
2777 TCPID_BUCKET_UNLOCK(tlb);
2778
2779 done:
2780 /* Drop our locks, if any. */
2781 switch (tree_locked) {
2782 case TREE_WLOCKED:
2783 TCPID_TREE_WUNLOCK();
2784 break;
2785 case TREE_RLOCKED:
2786 TCPID_TREE_RUNLOCK();
2787 break;
2788 }
2789 if (expireq_locked)
2790 TCPLOG_EXPIREQ_UNLOCK();
2791 }
2792 #undef LOCAL_SAVE
2793
2794 /*
2795 * Queue the log buffers for all sessions in a bucket for transmissions via
2796 * the log buffer facility.
2797 *
2798 * NOTE: This should be called with a locked INP; however, the function
2799 * will drop the lock.
2800 */
2801 void
2802 tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason)
2803 {
2804 struct inpcb *inp = tptoinpcb(tp);
2805 struct tcp_log_id_bucket *tlb;
2806 int tree_locked;
2807
2808 /* Figure out our bucket and lock it. */
2809 INP_WLOCK_ASSERT(inp);
2810 tlb = tp->t_lib;
2811 if (tlb == NULL) {
2812 /*
2813 * No bucket; treat this like a request to dump a single
2814 * session's traces.
2815 */
2816 (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true);
2817 INP_WUNLOCK(inp);
2818 return;
2819 }
2820 TCPID_BUCKET_REF(tlb);
2821 INP_WUNLOCK(inp);
2822 TCPID_BUCKET_LOCK(tlb);
2823
2824 /* If we are the last reference, we have nothing more to do here. */
2825 tree_locked = TREE_UNLOCKED;
2826 if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
2827 switch (tree_locked) {
2828 case TREE_WLOCKED:
2829 TCPID_TREE_WUNLOCK();
2830 break;
2831 case TREE_RLOCKED:
2832 TCPID_TREE_RUNLOCK();
2833 break;
2834 }
2835 return;
2836 }
2837
2838 /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */
2839 tcp_log_dumpbucketlogs(tlb, reason);
2840 }
2841
2842 /*
2843 * Mark the end of a flow with the current stack. A stack can add
2844 * stack-specific info to this trace event by overriding this
2845 * function (see bbr_log_flowend() for example).
2846 */
2847 void
2848 tcp_log_flowend(struct tcpcb *tp)
2849 {
2850 if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
2851 struct socket *so = tptosocket(tp);
2852 TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd,
2853 TCP_LOG_FLOWEND, 0, 0, NULL, false);
2854 }
2855 }
2856
2857 void
2858 tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
2859 {
2860 struct inpcb *inp;
2861 struct tcpcb *tp;
2862 #ifdef TCP_REQUEST_TRK
2863 struct tcp_sendfile_track *ent;
2864 int i, fnd;
2865 #endif
2866
2867 inp = sotoinpcb(so);
2868 KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
2869
2870 /* quick check to see if logging is enabled for this connection */
2871 tp = intotcpcb(inp);
2872 if ((inp->inp_flags & INP_DROPPED) ||
2873 (tp->_t_logstate == TCP_LOG_STATE_OFF)) {
2874 return;
2875 }
2876
2877 INP_WLOCK(inp);
2878 /* double check log state now that we have the lock */
2879 if (inp->inp_flags & INP_DROPPED)
2880 goto done;
2881 if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
2882 struct timeval tv;
2883 tcp_log_eventspecific_t log;
2884
2885 memset(&log, 0, sizeof(log));
2886 microuptime(&tv);
2887 log.u_sf.offset = offset;
2888 log.u_sf.length = nbytes;
2889 log.u_sf.flags = flags;
2890
2891 TCP_LOG_EVENTP(tp, NULL,
2892 &tptosocket(tp)->so_rcv,
2893 &tptosocket(tp)->so_snd,
2894 TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
2895 }
2896 #ifdef TCP_REQUEST_TRK
2897 if (tp->t_tcpreq_req == 0) {
2898 /* No http requests to track */
2899 goto done;
2900 }
2901 fnd = 0;
2902 if (tp->t_tcpreq_closed == 0) {
2903 /* No closed end req to track */
2904 goto skip_closed_req;
2905 }
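/*
 * First pass: match this sendfile range against the closed
 * (fully specified) requests being tracked. Open-ended requests
 * are handled in the second pass below.
 */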
2906 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
2907 /* Let's see if this one can be found */
2908 ent = &tp->t_tcpreq_info[i];
2909 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) {
2910 /* Not used */
2911 continue;
2912 }
2913 if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) {
2914 /* This pass does not consider open requests */
2915 continue;
2916 }
2917 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) {
2918 /* Don't look at what we have completed */
2919 continue;
2920 }
2921 /* If we reach here, it's an allocated closed-end request */
2922 if ((ent->start == offset) ||
2923 ((offset > ent->start) && (offset < ent->end))) {
2924 /* The offset falls within this request. */
2925 fnd = 1;
2926 }
2927 if (fnd) {
2928 /*
2929 * The send falls within this request, so it is the one to update.
2930 */
2931 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2932 /*
2933 * When an entry completes we can take (snd_una + sb_ccc) and know where
2934 * the end of the range really is. Note that this works since two
2935 * requests must be sequential and sendfile is now complete for *this* request.
2936 * We must use sb_ccc since the data may still be in-flight in TLS.
2937 *
2938 * We always cautiously move the end_seq only if our calculations
2939 * show it happened (just in case sendfile has the call to here at the wrong
2940 * place). When we go COMP we will stop coming here and hopefully be
2941 * left with the correct end_seq.
2942 */
2943 if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq))
2944 ent->end_seq = tp->snd_una + so->so_snd.sb_ccc;
2945 if ((offset + nbytes) >= ent->end) {
2946 ent->flags |= TCP_TRK_TRACK_FLG_COMP;
2947 tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_COMPLETE, offset, nbytes);
2948 } else {
2949 tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_MOREYET, offset, nbytes);
2950 }
2951 /* We assume that sendfile never sends overlapping requests */
2952 goto done;
2953 }
2954 }
2955 skip_closed_req:
2956 if (!fnd) {
2957 /* OK, now let's look for open requests */
2958 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
2959 ent = &tp->t_tcpreq_info[i];
2960 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) {
2961 /* Not used */
2962 continue;
2963 }
2964 if ((ent->flags & TCP_TRK_TRACK_FLG_OPEN) == 0)
2965 continue;
2966 /* If we reach here, it's an allocated open request */
2967 if (ent->start == offset) {
2968 /* It begins this request */
2969 ent->start_seq = tp->snd_una +
2970 tptosocket(tp)->so_snd.sb_ccc;
2971 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2972 break;
2973 } else if (offset > ent->start) {
2974 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2975 break;
2976 }
2977 }
2978 }
2979 #endif
2980 done:
2981 INP_WUNLOCK(inp);
2982 }
2983
2984 #ifdef DDB
2985 static void
2986 db_print_indent(int indent)
2987 {
2988 int i;
2989
2990 for (i = 0; i < indent; i++)
2991 db_printf(" ");
2992 }
2993
2994 static void
2995 db_print_tcphdr(struct tcp_log_buffer *tlm_buf)
2996 {
2997 struct sackblk sack;
2998 struct tcphdr *th;
2999 int cnt, i, j, opt, optlen, num_sacks;
3000 uint32_t val, ecr;
3001 uint16_t mss;
3002 uint16_t flags;
3003
3004 if ((tlm_buf->tlb_eventflags & TLB_FLAG_HDR) == 0) {
3005 return;
3006 }
3007 th = &tlm_buf->tlb_th;
3008 flags = tcp_get_flags(th);
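/* Print the TCP flags, one letter per flag ('.' for ACK). */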
3009 if (flags & TH_FIN) {
3010 db_printf("F");
3011 }
3012 if (flags & TH_SYN) {
3013 db_printf("S");
3014 }
3015 if (flags & TH_RST) {
3016 db_printf("R");
3017 }
3018 if (flags & TH_PUSH) {
3019 db_printf("P");
3020 }
3021 if (flags & TH_ACK) {
3022 db_printf(".");
3023 }
3024 if (flags & TH_URG) {
3025 db_printf("U");
3026 }
3027 if (flags & TH_ECE) {
3028 db_printf("E");
3029 }
3033 if (flags & TH_CWR) {
3034 db_printf("W");
3035 }
3039 if (flags & TH_AE) {
3040 db_printf("A");
3041 }
3042 db_printf(" %u:%u(%u)", ntohl(th->th_seq),
3043 ntohl(th->th_seq) + tlm_buf->tlb_len, tlm_buf->tlb_len);
3044 if (flags & TH_ACK) {
3045 db_printf(" ack %u", ntohl(th->th_ack));
3046 }
3047 db_printf(" win %u", ntohs(th->th_win));
3048 if (flags & TH_URG) {
3049 db_printf(" urg %u", ntohs(th->th_urp));
3050 }
3051 cnt = (th->th_off << 2) - sizeof(struct tcphdr);
3052 if (cnt > 0) {
3053 db_printf(" <");
3054 for (i = 0; i < cnt; i += optlen) {
3055 opt = tlm_buf->tlb_opts[i];
3056 if (opt == TCPOPT_EOL || opt == TCPOPT_NOP) {
3057 optlen = 1;
3058 } else {
3059 if (cnt - i < 2) {
3060 break;
3061 }
3062 optlen = tlm_buf->tlb_opts[i + 1];
3063 if (optlen < 2 || optlen > cnt - i) {
3064 break;
3065 }
3066 }
3067 if (i > 0) {
3068 db_printf(",");
3069 }
3070 switch (opt) {
3071 case TCPOPT_EOL:
3072 db_printf("eol");
3073 break;
3074 case TCPOPT_NOP:
3075 db_printf("nop");
3076 break;
3077 case TCPOPT_MAXSEG:
3078 if (optlen != TCPOLEN_MAXSEG) {
3079 break;
3080 }
3081 bcopy(tlm_buf->tlb_opts + i + 2, &mss,
3082 sizeof(uint16_t));
3083 db_printf("mss %u", ntohs(mss));
3084 break;
3085 case TCPOPT_WINDOW:
3086 if (optlen != TCPOLEN_WINDOW) {
3087 break;
3088 }
3089 db_printf("wscale %u",
3090 tlm_buf->tlb_opts[i + 2]);
3091 break;
3092 case TCPOPT_SACK_PERMITTED:
3093 if (optlen != TCPOLEN_SACK_PERMITTED) {
3094 break;
3095 }
3096 db_printf("sackOK");
3097 break;
3098 case TCPOPT_SACK:
3099 if (optlen == TCPOLEN_SACKHDR ||
3100 (optlen - 2) % TCPOLEN_SACK != 0) {
3101 break;
3102 }
3103 num_sacks = (optlen - 2) / TCPOLEN_SACK;
3104 db_printf("sack");
3105 for (j = 0; j < num_sacks; j++) {
3106 bcopy(tlm_buf->tlb_opts + i + 2 +
3107 j * TCPOLEN_SACK, &sack,
3108 TCPOLEN_SACK);
3109 db_printf(" %u:%u", ntohl(sack.start),
3110 ntohl(sack.end));
3111 }
3112 break;
3113 case TCPOPT_TIMESTAMP:
3114 if (optlen != TCPOLEN_TIMESTAMP) {
3115 break;
3116 }
3117 bcopy(tlm_buf->tlb_opts + i + 2, &val,
3118 sizeof(uint32_t));
3119 bcopy(tlm_buf->tlb_opts + i + 6, &ecr,
3120 sizeof(uint32_t));
3121 db_printf("TS val %u ecr %u", ntohl(val),
3122 ntohl(ecr));
3123 break;
3124 case TCPOPT_SIGNATURE:
3125 db_printf("md5");
3126 if (optlen > 2) {
3127 db_printf(" ");
3128 }
3129 for (j = 0; j < optlen - 2; j++) {
3130 db_printf("%02x",
3131 tlm_buf->tlb_opts[i + 2 + j]);
3132 }
3133 break;
3134 case TCPOPT_FAST_OPEN:
3135 db_printf("FO");
3136 if (optlen > 2) {
3137 db_printf(" ");
3138 }
3139 for (j = 0; j < optlen - 2; j++) {
3140 db_printf("%02x",
3141 tlm_buf->tlb_opts[i + 2 + j]);
3142 }
3143 break;
3144 default:
3145 db_printf("opt=%u len=%u", opt, optlen);
3146 break;
3147 }
3148 }
3149 db_printf(">");
3150 }
3151 }
3152 static void
3153 db_print_pru(struct tcp_log_buffer *tlm_buf)
3154 {
3155 switch (tlm_buf->tlb_flex1) {
3156 case PRU_ATTACH:
3157 db_printf("ATTACH");
3158 break;
3159 case PRU_DETACH:
3160 db_printf("DETACH");
3161 break;
3162 case PRU_BIND:
3163 db_printf("BIND");
3164 break;
3165 case PRU_LISTEN:
3166 db_printf("LISTEN");
3167 break;
3168 case PRU_CONNECT:
3169 db_printf("CONNECT");
3170 break;
3171 case PRU_ACCEPT:
3172 db_printf("ACCEPT");
3173 break;
3174 case PRU_DISCONNECT:
3175 db_printf("DISCONNECT");
3176 break;
3177 case PRU_SHUTDOWN:
3178 db_printf("SHUTDOWN");
3179 break;
3180 case PRU_RCVD:
3181 db_printf("RCVD");
3182 break;
3183 case PRU_SEND:
3184 db_printf("SEND");
3185 break;
3186 case PRU_ABORT:
3187 db_printf("ABORT");
3188 break;
3189 case PRU_CONTROL:
3190 db_printf("CONTROL");
3191 break;
3192 case PRU_SENSE:
3193 db_printf("SENSE");
3194 break;
3195 case PRU_RCVOOB:
3196 db_printf("RCVOOB");
3197 break;
3198 case PRU_SENDOOB:
3199 db_printf("SENDOOB");
3200 break;
3201 case PRU_SOCKADDR:
3202 db_printf("SOCKADDR");
3203 break;
3204 case PRU_PEERADDR:
3205 db_printf("PEERADDR");
3206 break;
3207 case PRU_CONNECT2:
3208 db_printf("CONNECT2");
3209 break;
3210 case PRU_FASTTIMO:
3211 db_printf("FASTTIMO");
3212 break;
3213 case PRU_SLOWTIMO:
3214 db_printf("SLOWTIMO");
3215 break;
3216 case PRU_PROTORCV:
3217 db_printf("PROTORCV");
3218 break;
3219 case PRU_PROTOSEND:
3220 db_printf("PROTOSEND");
3221 break;
3222 case PRU_SEND_EOF:
3223 db_printf("SEND_EOF");
3224 break;
3225 case PRU_SOSETLABEL:
3226 db_printf("SOSETLABEL");
3227 break;
3228 case PRU_CLOSE:
3229 db_printf("CLOSE");
3230 break;
3231 case PRU_FLUSH:
3232 db_printf("FLUSH");
3233 break;
3234 default:
3235 db_printf("Unknown PRU (%u)", tlm_buf->tlb_flex1);
3236 break;
3237 }
3238 if (tlm_buf->tlb_errno >= 0) {
3239 db_printf(", error: %d", tlm_buf->tlb_errno);
3240 }
3241 }
3242
3243 static void
3244 db_print_rto(struct tcp_log_buffer *tlm_buf)
3245 {
3246 tt_what what;
3247 tt_which which;
3248
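/*
 * tlb_flex1 packs the timer operation (tt_what) in its upper bits
 * and the timer type (tt_which) in its low byte; for TT_STARTING
 * events, tlb_flex2 carries the timeout in milliseconds.
 */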
3249 what = (tlm_buf->tlb_flex1 & 0xffffff00) >> 8;
3250 which = tlm_buf->tlb_flex1 & 0x000000ff;
3251 switch (what) {
3252 case TT_PROCESSING:
3253 db_printf("Processing ");
3254 break;
3255 case TT_PROCESSED:
3256 db_printf("Processed ");
3257 break;
3258 case TT_STARTING:
3259 db_printf("Starting ");
3260 break;
3261 case TT_STOPPING:
3262 db_printf("Stopping ");
3263 break;
3264 default:
3265 db_printf("Unknown operation (%u) for ", what);
3266 break;
3267 }
3268 switch (which) {
3269 case TT_REXMT:
3270 db_printf("Retransmission ");
3271 break;
3272 case TT_PERSIST:
3273 db_printf("Persist ");
3274 break;
3275 case TT_KEEP:
3276 db_printf("Keepalive ");
3277 break;
3278 case TT_2MSL:
3279 db_printf("2 MSL ");
3280 break;
3281 case TT_DELACK:
3282 db_printf("Delayed ACK ");
3283 break;
3284 default:
3285 db_printf("Unknown (%u) ", which);
3286 break;
3287 }
3288 db_printf("timer");
3289 if (what == TT_STARTING) {
3290 db_printf(": %u ms", tlm_buf->tlb_flex2);
3291 }
3292 }
3293
3294 static void
3295 db_print_usersend(struct tcp_log_buffer *tlm_buf)
3296 {
3297 if ((tlm_buf->tlb_eventflags & TLB_FLAG_RXBUF) == 0) {
3298 return;
3299 }
3300 if ((tlm_buf->tlb_eventflags & TLB_FLAG_TXBUF) == 0) {
3301 return;
3302 }
3303 db_printf("usersend: rcv.acc: %u rcv.ccc: %u snd.acc: %u snd.ccc: %u",
3304 tlm_buf->tlb_rxbuf.tls_sb_acc, tlm_buf->tlb_rxbuf.tls_sb_ccc,
3305 tlm_buf->tlb_txbuf.tls_sb_acc, tlm_buf->tlb_txbuf.tls_sb_ccc);
3306 }
3307
3308 void
3309 db_print_bblog_entries(struct tcp_log_stailq *log_entries, int indent)
3310 {
3311 struct tcp_log_mem *log_entry;
3312 struct tcp_log_buffer *tlm_buf, *prev_tlm_buf;
3313 int64_t delta_t;
3314
3315 indent += 2;
3316 prev_tlm_buf = NULL;
3317 STAILQ_FOREACH(log_entry, log_entries, tlm_queue) {
3318 db_print_indent(indent);
3319 tlm_buf = &log_entry->tlm_buf;
3320 if (prev_tlm_buf == NULL) {
3321 db_printf(" 0.000 ");
3322 } else {
3323 delta_t = sbttoms(tvtosbt(tlm_buf->tlb_tv) -
3324 tvtosbt(prev_tlm_buf->tlb_tv));
3325 db_printf("+%u.%03u ", (uint32_t)(delta_t / 1000),
3326 (uint32_t)(delta_t % 1000));
3327 }
3328 switch (tlm_buf->tlb_eventid) {
3329 case TCP_LOG_IN:
3330 db_printf("< ");
3331 db_print_tcphdr(tlm_buf);
3332 break;
3333 case TCP_LOG_OUT:
3334 db_printf("> ");
3335 db_print_tcphdr(tlm_buf);
3336 break;
3337 case TCP_LOG_RTO:
3338 db_print_rto(tlm_buf);
3339 break;
3340 case TCP_LOG_PRU:
3341 db_print_pru(tlm_buf);
3342 break;
3343 case TCP_LOG_USERSEND:
3344 db_print_usersend(tlm_buf);
3345 break;
3346 default:
3347 break;
3348 }
3349 db_printf("\n");
3350 prev_tlm_buf = tlm_buf;
3351 if (db_pager_quit)
3352 break;
3353 }
3354 }
3355 #endif
3356