xref: /freebsd/sys/dev/tcp_log/tcp_log_dev.c (revision 63a938566d524836885917d95bd491aa4400b181)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2016-2017
5  *	Netflix Inc.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/conf.h>
35 #include <sys/fcntl.h>
36 #include <sys/filio.h>
37 #include <sys/kernel.h>
38 #include <sys/lock.h>
39 #include <sys/malloc.h>
40 #include <sys/module.h>
41 #include <sys/poll.h>
42 #include <sys/queue.h>
43 #include <sys/refcount.h>
44 #include <sys/mutex.h>
45 #include <sys/selinfo.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/tree.h>
50 #include <sys/uio.h>
51 #include <machine/atomic.h>
52 #include <sys/counter.h>
53 
54 #include <dev/tcp_log/tcp_log_dev.h>
55 
56 #ifdef TCPLOG_DEBUG_COUNTERS
57 extern counter_u64_t tcp_log_que_read;
58 extern counter_u64_t tcp_log_que_freed;
59 #endif
60 
61 static struct cdev *tcp_log_dev;
62 static struct selinfo tcp_log_sel;
63 
64 static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
65 static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);
66 
67 MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");
68 
69 static int	tcp_log_dev_listeners = 0;
70 
71 static struct mtx tcp_log_dev_queue_lock;
72 
73 #define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
74 #define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
75 #define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
76 #define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
77 #define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
78 #define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))
79 
80 static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
81 static void	tcp_log_dev_clear_cdevpriv(void *data);
82 static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
83     int devtype __unused, struct thread *td __unused);
84 static int	tcp_log_dev_write(struct cdev *dev __unused,
85     struct uio *uio __unused, int flags __unused);
86 static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
87     int flags __unused);
88 static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
89     caddr_t data, int fflag __unused, struct thread *td __unused);
90 static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
91     struct thread *td);
92 
93 
/*
 * Tracks whether the current code path holds the queue mutex, so helpers
 * can acquire it on demand and callers know whether to release it.
 */
enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,	/* Queue mutex is not held. */
	QUEUE_LOCKED = 1,	/* Queue mutex is held. */
};
98 
99 static struct cdevsw tcp_log_cdevsw = {
100 	.d_version =	D_VERSION,
101 	.d_read =	tcp_log_dev_read,
102 	.d_open =	tcp_log_dev_open,
103 	.d_write =	tcp_log_dev_write,
104 	.d_poll =	tcp_log_dev_poll,
105 	.d_ioctl =	tcp_log_dev_ioctl,
106 #ifdef NOTYET
107 	.d_mmap =	tcp_log_dev_mmap,
108 #endif
109 	.d_name =	"tcp_log",
110 };
111 
/*
 * Sanity-check the caller's idea of the queue lock state.
 *
 * When INVARIANTS is defined, assert that the queue mutex is owned for
 * QUEUE_LOCKED, assert that it is not owned for QUEUE_UNLOCKED, and panic
 * for any other value.  Without INVARIANTS the body is empty.
 */
static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	else if (lockstate == QUEUE_UNLOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
	else
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
#endif
}
130 
131 /*
132  * Clear the refcount. If appropriate, it will remove the entry from the
133  * queue and call the destructor.
134  *
135  * This must be called with the queue lock held.
136  */
137 static void
138 tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
139 {
140 
141 	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));
142 
143 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
144 
145 	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
146 #ifdef TCPLOG_DEBUG_COUNTERS
147 		counter_u64_add(tcp_log_que_freed, 1);
148 #endif
149 		/* Remove the entry from the queue and call the destructor. */
150 		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
151 		    tldq_queue);
152 		(*entry->tldq_dtor)(entry);
153 	}
154 }
155 
156 static void
157 tcp_log_dev_clear_cdevpriv(void *data)
158 {
159 	struct tcp_log_dev_info *priv;
160 	struct tcp_log_dev_queue *entry, *entry_tmp;
161 
162 	priv = (struct tcp_log_dev_info *)data;
163 	if (priv == NULL)
164 		return;
165 
166 	/*
167 	 * Lock the queue and drop our references. We hold references to all
168 	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
169 	 * entries in the queue).
170 	 *
171 	 * Because we don't want anyone adding addition things to the queue
172 	 * while we are doing this, we lock the queue.
173 	 */
174 	TCP_LOG_DEV_QUEUE_LOCK();
175 	if (priv->tldi_head != NULL) {
176 		entry = priv->tldi_head;
177 		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
178 		    tldq_queue, entry_tmp) {
179 			tcp_log_dev_clear_refcount(entry);
180 		}
181 	}
182 	tcp_log_dev_listeners--;
183 	KASSERT(tcp_log_dev_listeners >= 0,
184 	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
185 	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
186 	    tldi_list);
187 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
188 	TCP_LOG_DEV_QUEUE_UNLOCK();
189 	free(priv, M_TCPLOGDEV);
190 }
191 
192 static int
193 tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
194     struct thread *td __unused)
195 {
196 	struct tcp_log_dev_info *priv;
197 	struct tcp_log_dev_queue *entry;
198 	int rv;
199 
200 	/*
201 	 * Ideally, we shouldn't see these because of file system
202 	 * permissions.
203 	 */
204 	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
205 		return (ENODEV);
206 
207 	/* Allocate space to hold information about where we are. */
208 	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
209 	    M_ZERO | M_WAITOK);
210 
211 	/* Stash the private data away. */
212 	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
213 	if (!rv) {
214 		/*
215 		 * Increase the listener count, add this reader to the list, and
216 		 * take references on all current queues.
217 		 */
218 		TCP_LOG_DEV_QUEUE_LOCK();
219 		tcp_log_dev_listeners++;
220 		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
221 		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
222 		if (priv->tldi_head != NULL)
223 			priv->tldi_cur = priv->tldi_head->tldq_buf;
224 		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
225 			TCP_LOG_DEV_QUEUE_REF(entry);
226 		TCP_LOG_DEV_QUEUE_UNLOCK();
227 	} else {
228 		/* Free the entry. */
229 		free(priv, M_TCPLOGDEV);
230 	}
231 	return (rv);
232 }
233 
234 static int
235 tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
236     int flags __unused)
237 {
238 
239 	return (ENODEV);
240 }
241 
242 static __inline void
243 tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
244 {
245 	struct tcp_log_dev_queue *entry;
246 
247 	KASSERT(priv->tldi_head != NULL,
248 	    ("%s:%d: priv->tldi_head unexpectedly NULL",
249 	    __func__, __LINE__));
250 	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
251 	    ("%s:%d: buffer mismatch (%p vs %p)",
252 	    __func__, __LINE__, priv->tldi_head->tldq_buf,
253 	    priv->tldi_cur));
254 	tcp_log_dev_queue_validate_lock(*lockstate);
255 
256 	if (*lockstate == QUEUE_UNLOCKED) {
257 		TCP_LOG_DEV_QUEUE_LOCK();
258 		*lockstate = QUEUE_LOCKED;
259 	}
260 	entry = priv->tldi_head;
261 	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
262 	tcp_log_dev_clear_refcount(entry);
263 	priv->tldi_cur = NULL;
264 }
265 
266 static int
267 tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
268 {
269 	struct tcp_log_common_header *buf;
270 	struct tcp_log_dev_info *priv;
271 	struct tcp_log_dev_queue *entry;
272 	ssize_t len;
273 	int lockstate, rv;
274 
275 	/* Get our private info. */
276 	rv = devfs_get_cdevpriv((void **)&priv);
277 	if (rv)
278 		return (rv);
279 
280 	lockstate = QUEUE_UNLOCKED;
281 
282 	/* Do we need to get a new buffer? */
283 	while (priv->tldi_cur == NULL ||
284 	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
285 		/* Did we somehow forget to rotate? */
286 		KASSERT(priv->tldi_cur == NULL,
287 		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
288 		    __LINE__));
289 		if (priv->tldi_cur != NULL)
290 			tcp_log_dev_rotate_bufs(priv, &lockstate);
291 
292 		/*
293 		 * Before we start looking at tldi_head, we need a lock on the
294 		 * queue to make sure tldi_head stays stable.
295 		 */
296 		if (lockstate == QUEUE_UNLOCKED) {
297 			TCP_LOG_DEV_QUEUE_LOCK();
298 			lockstate = QUEUE_LOCKED;
299 		}
300 
301 		/* We need the next buffer. Do we have one? */
302 		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
303 			rv = EAGAIN;
304 			goto done;
305 		}
306 		if (priv->tldi_head == NULL) {
307 			/* Sleep and wait for more things we can read. */
308 			rv = mtx_sleep(&tcp_log_dev_listeners,
309 			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
310 			if (rv)
311 				goto done;
312 			if (priv->tldi_head == NULL)
313 				continue;
314 		}
315 
316 		/*
317 		 * We have an entry to read. We want to try to create a
318 		 * buffer, if one doesn't already exist.
319 		 */
320 		entry = priv->tldi_head;
321 		if (entry->tldq_buf == NULL) {
322 			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
323 			buf = (*entry->tldq_xform)(entry);
324 			if (buf == NULL) {
325 				rv = EBUSY;
326 				goto done;
327 			}
328 			entry->tldq_buf = buf;
329 		}
330 
331 		priv->tldi_cur = entry->tldq_buf;
332 		priv->tldi_off = 0;
333 	}
334 
335 	/* Copy what we can from this buffer to the output buffer. */
336 	if (uio->uio_resid > 0) {
337 		/* Drop locks so we can take page faults. */
338 		if (lockstate == QUEUE_LOCKED)
339 			TCP_LOG_DEV_QUEUE_UNLOCK();
340 		lockstate = QUEUE_UNLOCKED;
341 
342 		KASSERT(priv->tldi_cur != NULL,
343 		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));
344 
345 		/* Copy as much as we can to this uio. */
346 		len = priv->tldi_cur->tlch_length - priv->tldi_off;
347 		if (len > uio->uio_resid)
348 			len = uio->uio_resid;
349 		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
350 		    len, uio);
351 		if (rv != 0)
352 			goto done;
353 		priv->tldi_off += len;
354 #ifdef TCPLOG_DEBUG_COUNTERS
355 		counter_u64_add(tcp_log_que_read, len);
356 #endif
357 	}
358 	/* Are we done with this buffer? If so, find the next one. */
359 	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
360 		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
361 		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
362 		    (uintmax_t)priv->tldi_off,
363 		    (uintmax_t)priv->tldi_cur->tlch_length));
364 		tcp_log_dev_rotate_bufs(priv, &lockstate);
365 	}
366 done:
367 	tcp_log_dev_queue_validate_lock(lockstate);
368 	if (lockstate == QUEUE_LOCKED)
369 		TCP_LOG_DEV_QUEUE_UNLOCK();
370 	return (rv);
371 }
372 
373 static int
374 tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
375     int fflag __unused, struct thread *td __unused)
376 {
377 	struct tcp_log_dev_info *priv;
378 	int rv;
379 
380 	/* Get our private info. */
381 	rv = devfs_get_cdevpriv((void **)&priv);
382 	if (rv)
383 		return (rv);
384 
385 	/*
386 	 * Set things. Here, we are most concerned about the non-blocking I/O
387 	 * flag.
388 	 */
389 	rv = 0;
390 	switch (cmd) {
391 	case FIONBIO:
392 		break;
393 	case FIOASYNC:
394 		if (*(int *)data != 0)
395 			rv = EINVAL;
396 		break;
397 	default:
398 		rv = ENOIOCTL;
399 	}
400 	return (rv);
401 }
402 
403 static int
404 tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
405 {
406 	struct tcp_log_dev_info *priv;
407 	int revents;
408 
409 	/*
410 	 * Get our private info. If this fails, claim that all events are
411 	 * ready. That should prod the user to do something that will
412 	 * make the error evident to them.
413 	 */
414 	if (devfs_get_cdevpriv((void **)&priv))
415 		return (events);
416 
417 	revents = 0;
418 	if (events & (POLLIN | POLLRDNORM)) {
419 		/*
420 		 * We can (probably) read right now if we are partway through
421 		 * a buffer or if we are just about to start a buffer.
422 		 * Because we are going to read tldi_head, we should acquire
423 		 * a read lock on the queue.
424 		 */
425 		TCP_LOG_DEV_QUEUE_LOCK();
426 		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
427 		    (priv->tldi_cur != NULL &&
428 		    priv->tldi_off < priv->tldi_cur->tlch_length))
429 			revents = events & (POLLIN | POLLRDNORM);
430 		else
431 			selrecord(td, &tcp_log_sel);
432 		TCP_LOG_DEV_QUEUE_UNLOCK();
433 	} else {
434 		/*
435 		 * It only makes sense to poll for reading. So, again, prod the
436 		 * user to do something that will make the error of their ways
437 		 * apparent.
438 		 */
439 		revents = events;
440 	}
441 	return (revents);
442 }
443 
444 int
445 tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
446 {
447 	struct tcp_log_dev_info *priv;
448 	int rv;
449 	bool wakeup_needed;
450 
451 	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
452 	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
453 	    __func__));
454 	KASSERT(entry->tldq_dtor != NULL,
455 	    ("%s: Called with tldq_dtor set to NULL", __func__));
456 
457 	/* Get a lock on the queue. */
458 	TCP_LOG_DEV_QUEUE_LOCK();
459 
460 	/* If no one is listening, tell the caller to free the resources. */
461 	if (tcp_log_dev_listeners == 0) {
462 		rv = ENXIO;
463 		goto done;
464 	}
465 
466 	/* Add this to the end of the tailq. */
467 	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);
468 
469 	/* Add references for all current listeners. */
470 	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);
471 
472 	/*
473 	 * If any listener is currently stuck on NULL, that means they are
474 	 * waiting. Point their head to this new entry.
475 	 */
476 	wakeup_needed = false;
477 	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
478 		if (priv->tldi_head == NULL) {
479 			priv->tldi_head = entry;
480 			wakeup_needed = true;
481 		}
482 
483 	if (wakeup_needed) {
484 		selwakeup(&tcp_log_sel);
485 		wakeup(&tcp_log_dev_listeners);
486 	}
487 
488 	rv = 0;
489 
490 done:
491 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
492 	TCP_LOG_DEV_QUEUE_UNLOCK();
493 	return (rv);
494 }
495 
496 static int
497 tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
498 {
499 
500 	/* TODO: Support intelligent unloading. */
501 	switch (type) {
502 	case MOD_LOAD:
503 		if (bootverbose)
504 			printf("tcp_log: tcp_log device\n");
505 		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
506 		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
507 		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
508 			 "tcp_log device queues", MTX_DEF);
509 		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
510 		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
511 		    "tcp_log");
512 		break;
513 	default:
514 		return (EOPNOTSUPP);
515 	}
516 
517 	return (0);
518 }
519 
520 DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
521 MODULE_VERSION(tcp_log_dev, 1);
522