xref: /freebsd/sys/dev/tcp_log/tcp_log_dev.c (revision 6683132d54bd6d589889e43dabdc53d35e38a028)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2016-2017 Netflix, Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/conf.h>
34 #include <sys/fcntl.h>
35 #include <sys/filio.h>
36 #include <sys/kernel.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/poll.h>
41 #include <sys/queue.h>
42 #include <sys/refcount.h>
43 #include <sys/mutex.h>
44 #include <sys/selinfo.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/tree.h>
49 #include <sys/uio.h>
50 #include <machine/atomic.h>
51 #include <sys/counter.h>
52 
53 #include <dev/tcp_log/tcp_log_dev.h>
54 
/*
 * Optional debug counters, defined outside this file.  Within this file
 * tcp_log_que_read is advanced by the number of bytes the read handler
 * copies out, and tcp_log_que_freed by one each time a queue entry's last
 * reference is dropped.
 */
55 #ifdef TCPLOG_DEBUG_COUNTERS
56 extern counter_u64_t tcp_log_que_read;
57 extern counter_u64_t tcp_log_que_freed;
58 #endif
59 
/* Device node created at module load; see tcp_log_dev_modevent(). */
60 static struct cdev *tcp_log_dev;
/* Recorded by the poll handler and woken by tcp_log_dev_add_log(). */
61 static struct selinfo tcp_log_sel;
62 
/*
 * The queue of log entries handed to tcp_log_dev_add_log(), and the list
 * of per-open reader records created by tcp_log_dev_open().
 */
63 static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
64 static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);
65 
66 MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");
67 
/*
 * Number of open readers: incremented in tcp_log_dev_open() and
 * decremented in tcp_log_dev_clear_cdevpriv(), both under the queue lock.
 * Its address is also the sleep/wakeup channel used by blocked readers.
 */
68 static int	tcp_log_dev_listeners = 0;
69 
/*
 * Mutex taken around accesses to the entry queue, the reader list and the
 * listener count in this file.
 */
70 static struct mtx tcp_log_dev_queue_lock;
71 
/*
 * Helpers for the queue mutex and for the per-entry reference count.
 * TCP_LOG_DEV_QUEUE_UNREF() evaluates to the result of refcount_release();
 * tcp_log_dev_clear_refcount() tests that result to decide whether to
 * unlink and destroy the entry.
 */
72 #define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
73 #define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
74 #define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
75 #define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
76 #define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
77 #define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))
78 
/*
 * Forward declarations: the two reference/cleanup helpers, followed by the
 * handlers that are installed in tcp_log_cdevsw below.
 *
 * NOTE(review): the tcp_log_dev_read() prototype marks "flags" as __unused,
 * but the definition does consult it (non-blocking check).  This is
 * harmless, though the annotation is misleading.
 */
79 static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
80 static void	tcp_log_dev_clear_cdevpriv(void *data);
81 static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
82     int devtype __unused, struct thread *td __unused);
83 static int	tcp_log_dev_write(struct cdev *dev __unused,
84     struct uio *uio __unused, int flags __unused);
85 static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
86     int flags __unused);
87 static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
88     caddr_t data, int fflag __unused, struct thread *td __unused);
89 static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
90     struct thread *td);
91 
92 
/*
 * Whether the calling code currently holds tcp_log_dev_queue_lock.  The
 * read handler tracks one of these values in a local variable and passes
 * it to tcp_log_dev_queue_validate_lock() and tcp_log_dev_rotate_bufs().
 */
93 enum tcp_log_dev_queue_lock_state {
94 	QUEUE_UNLOCKED = 0,
95 	QUEUE_LOCKED,
96 };
97 
/*
 * Entry points for the "tcp_log" character device.  The d_mmap slot is
 * only compiled when NOTYET is defined; no tcp_log_dev_mmap() is defined
 * in the visible part of this file.
 */
98 static struct cdevsw tcp_log_cdevsw = {
99 	.d_version =	D_VERSION,
100 	.d_read =	tcp_log_dev_read,
101 	.d_open =	tcp_log_dev_open,
102 	.d_write =	tcp_log_dev_write,
103 	.d_poll =	tcp_log_dev_poll,
104 	.d_ioctl =	tcp_log_dev_ioctl,
105 #ifdef NOTYET
106 	.d_mmap =	tcp_log_dev_mmap,
107 #endif
108 	.d_name =	"tcp_log",
109 };
110 
/*
 * Debug-only check that the caller's idea of the queue lock state matches
 * the mutex itself.
 *
 * lockstate is expected to be QUEUE_LOCKED or QUEUE_UNLOCKED.  On
 * INVARIANTS kernels the matching mutex assertion is evaluated and any
 * other value panics.  Without INVARIANTS this function does nothing.
 */
static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	if (lockstate == QUEUE_LOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	} else if (lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
	} else {
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}
129 
130 /*
131  * Clear the refcount. If appropriate, it will remove the entry from the
132  * queue and call the destructor.
133  *
134  * This must be called with the queue lock held.
135  */
136 static void
137 tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
138 {
139 
140 	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));
141 
142 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
143 
144 	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
145 #ifdef TCPLOG_DEBUG_COUNTERS
146 		counter_u64_add(tcp_log_que_freed, 1);
147 #endif
148 		/* Remove the entry from the queue and call the destructor. */
149 		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
150 		    tldq_queue);
151 		(*entry->tldq_dtor)(entry);
152 	}
153 }
154 
155 static void
156 tcp_log_dev_clear_cdevpriv(void *data)
157 {
158 	struct tcp_log_dev_info *priv;
159 	struct tcp_log_dev_queue *entry, *entry_tmp;
160 
161 	priv = (struct tcp_log_dev_info *)data;
162 	if (priv == NULL)
163 		return;
164 
165 	/*
166 	 * Lock the queue and drop our references. We hold references to all
167 	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
168 	 * entries in the queue).
169 	 *
170 	 * Because we don't want anyone adding addition things to the queue
171 	 * while we are doing this, we lock the queue.
172 	 */
173 	TCP_LOG_DEV_QUEUE_LOCK();
174 	if (priv->tldi_head != NULL) {
175 		entry = priv->tldi_head;
176 		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
177 		    tldq_queue, entry_tmp) {
178 			tcp_log_dev_clear_refcount(entry);
179 		}
180 	}
181 	tcp_log_dev_listeners--;
182 	KASSERT(tcp_log_dev_listeners >= 0,
183 	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
184 	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
185 	    tldi_list);
186 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
187 	TCP_LOG_DEV_QUEUE_UNLOCK();
188 	free(priv, M_TCPLOGDEV);
189 }
190 
191 static int
192 tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
193     struct thread *td __unused)
194 {
195 	struct tcp_log_dev_info *priv;
196 	struct tcp_log_dev_queue *entry;
197 	int rv;
198 
199 	/*
200 	 * Ideally, we shouldn't see these because of file system
201 	 * permissions.
202 	 */
203 	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
204 		return (ENODEV);
205 
206 	/* Allocate space to hold information about where we are. */
207 	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
208 	    M_ZERO | M_WAITOK);
209 
210 	/* Stash the private data away. */
211 	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
212 	if (!rv) {
213 		/*
214 		 * Increase the listener count, add this reader to the list, and
215 		 * take references on all current queues.
216 		 */
217 		TCP_LOG_DEV_QUEUE_LOCK();
218 		tcp_log_dev_listeners++;
219 		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
220 		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
221 		if (priv->tldi_head != NULL)
222 			priv->tldi_cur = priv->tldi_head->tldq_buf;
223 		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
224 			TCP_LOG_DEV_QUEUE_REF(entry);
225 		TCP_LOG_DEV_QUEUE_UNLOCK();
226 	} else {
227 		/* Free the entry. */
228 		free(priv, M_TCPLOGDEV);
229 	}
230 	return (rv);
231 }
232 
233 static int
234 tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
235     int flags __unused)
236 {
237 
238 	return (ENODEV);
239 }
240 
241 static __inline void
242 tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
243 {
244 	struct tcp_log_dev_queue *entry;
245 
246 	KASSERT(priv->tldi_head != NULL,
247 	    ("%s:%d: priv->tldi_head unexpectedly NULL",
248 	    __func__, __LINE__));
249 	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
250 	    ("%s:%d: buffer mismatch (%p vs %p)",
251 	    __func__, __LINE__, priv->tldi_head->tldq_buf,
252 	    priv->tldi_cur));
253 	tcp_log_dev_queue_validate_lock(*lockstate);
254 
255 	if (*lockstate == QUEUE_UNLOCKED) {
256 		TCP_LOG_DEV_QUEUE_LOCK();
257 		*lockstate = QUEUE_LOCKED;
258 	}
259 	entry = priv->tldi_head;
260 	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
261 	tcp_log_dev_clear_refcount(entry);
262 	priv->tldi_cur = NULL;
263 }
264 
265 static int
266 tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
267 {
268 	struct tcp_log_common_header *buf;
269 	struct tcp_log_dev_info *priv;
270 	struct tcp_log_dev_queue *entry;
271 	ssize_t len;
272 	int lockstate, rv;
273 
274 	/* Get our private info. */
275 	rv = devfs_get_cdevpriv((void **)&priv);
276 	if (rv)
277 		return (rv);
278 
279 	lockstate = QUEUE_UNLOCKED;
280 
281 	/* Do we need to get a new buffer? */
282 	while (priv->tldi_cur == NULL ||
283 	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
284 		/* Did we somehow forget to rotate? */
285 		KASSERT(priv->tldi_cur == NULL,
286 		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
287 		    __LINE__));
288 		if (priv->tldi_cur != NULL)
289 			tcp_log_dev_rotate_bufs(priv, &lockstate);
290 
291 		/*
292 		 * Before we start looking at tldi_head, we need a lock on the
293 		 * queue to make sure tldi_head stays stable.
294 		 */
295 		if (lockstate == QUEUE_UNLOCKED) {
296 			TCP_LOG_DEV_QUEUE_LOCK();
297 			lockstate = QUEUE_LOCKED;
298 		}
299 
300 		/* We need the next buffer. Do we have one? */
301 		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
302 			rv = EAGAIN;
303 			goto done;
304 		}
305 		if (priv->tldi_head == NULL) {
306 			/* Sleep and wait for more things we can read. */
307 			rv = mtx_sleep(&tcp_log_dev_listeners,
308 			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
309 			if (rv)
310 				goto done;
311 			if (priv->tldi_head == NULL)
312 				continue;
313 		}
314 
315 		/*
316 		 * We have an entry to read. We want to try to create a
317 		 * buffer, if one doesn't already exist.
318 		 */
319 		entry = priv->tldi_head;
320 		if (entry->tldq_buf == NULL) {
321 			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
322 			buf = (*entry->tldq_xform)(entry);
323 			if (buf == NULL) {
324 				rv = EBUSY;
325 				goto done;
326 			}
327 			entry->tldq_buf = buf;
328 		}
329 
330 		priv->tldi_cur = entry->tldq_buf;
331 		priv->tldi_off = 0;
332 	}
333 
334 	/* Copy what we can from this buffer to the output buffer. */
335 	if (uio->uio_resid > 0) {
336 		/* Drop locks so we can take page faults. */
337 		if (lockstate == QUEUE_LOCKED)
338 			TCP_LOG_DEV_QUEUE_UNLOCK();
339 		lockstate = QUEUE_UNLOCKED;
340 
341 		KASSERT(priv->tldi_cur != NULL,
342 		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));
343 
344 		/* Copy as much as we can to this uio. */
345 		len = priv->tldi_cur->tlch_length - priv->tldi_off;
346 		if (len > uio->uio_resid)
347 			len = uio->uio_resid;
348 		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
349 		    len, uio);
350 		if (rv != 0)
351 			goto done;
352 		priv->tldi_off += len;
353 #ifdef TCPLOG_DEBUG_COUNTERS
354 		counter_u64_add(tcp_log_que_read, len);
355 #endif
356 	}
357 	/* Are we done with this buffer? If so, find the next one. */
358 	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
359 		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
360 		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
361 		    (uintmax_t)priv->tldi_off,
362 		    (uintmax_t)priv->tldi_cur->tlch_length));
363 		tcp_log_dev_rotate_bufs(priv, &lockstate);
364 	}
365 done:
366 	tcp_log_dev_queue_validate_lock(lockstate);
367 	if (lockstate == QUEUE_LOCKED)
368 		TCP_LOG_DEV_QUEUE_UNLOCK();
369 	return (rv);
370 }
371 
372 static int
373 tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
374     int fflag __unused, struct thread *td __unused)
375 {
376 	struct tcp_log_dev_info *priv;
377 	int rv;
378 
379 	/* Get our private info. */
380 	rv = devfs_get_cdevpriv((void **)&priv);
381 	if (rv)
382 		return (rv);
383 
384 	/*
385 	 * Set things. Here, we are most concerned about the non-blocking I/O
386 	 * flag.
387 	 */
388 	rv = 0;
389 	switch (cmd) {
390 	case FIONBIO:
391 		break;
392 	case FIOASYNC:
393 		if (*(int *)data != 0)
394 			rv = EINVAL;
395 		break;
396 	default:
397 		rv = ENOIOCTL;
398 	}
399 	return (rv);
400 }
401 
402 static int
403 tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
404 {
405 	struct tcp_log_dev_info *priv;
406 	int revents;
407 
408 	/*
409 	 * Get our private info. If this fails, claim that all events are
410 	 * ready. That should prod the user to do something that will
411 	 * make the error evident to them.
412 	 */
413 	if (devfs_get_cdevpriv((void **)&priv))
414 		return (events);
415 
416 	revents = 0;
417 	if (events & (POLLIN | POLLRDNORM)) {
418 		/*
419 		 * We can (probably) read right now if we are partway through
420 		 * a buffer or if we are just about to start a buffer.
421 		 * Because we are going to read tldi_head, we should acquire
422 		 * a read lock on the queue.
423 		 */
424 		TCP_LOG_DEV_QUEUE_LOCK();
425 		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
426 		    (priv->tldi_cur != NULL &&
427 		    priv->tldi_off < priv->tldi_cur->tlch_length))
428 			revents = events & (POLLIN | POLLRDNORM);
429 		else
430 			selrecord(td, &tcp_log_sel);
431 		TCP_LOG_DEV_QUEUE_UNLOCK();
432 	} else {
433 		/*
434 		 * It only makes sense to poll for reading. So, again, prod the
435 		 * user to do something that will make the error of their ways
436 		 * apparent.
437 		 */
438 		revents = events;
439 	}
440 	return (revents);
441 }
442 
443 int
444 tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
445 {
446 	struct tcp_log_dev_info *priv;
447 	int rv;
448 	bool wakeup_needed;
449 
450 	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
451 	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
452 	    __func__));
453 	KASSERT(entry->tldq_dtor != NULL,
454 	    ("%s: Called with tldq_dtor set to NULL", __func__));
455 
456 	/* Get a lock on the queue. */
457 	TCP_LOG_DEV_QUEUE_LOCK();
458 
459 	/* If no one is listening, tell the caller to free the resources. */
460 	if (tcp_log_dev_listeners == 0) {
461 		rv = ENXIO;
462 		goto done;
463 	}
464 
465 	/* Add this to the end of the tailq. */
466 	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);
467 
468 	/* Add references for all current listeners. */
469 	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);
470 
471 	/*
472 	 * If any listener is currently stuck on NULL, that means they are
473 	 * waiting. Point their head to this new entry.
474 	 */
475 	wakeup_needed = false;
476 	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
477 		if (priv->tldi_head == NULL) {
478 			priv->tldi_head = entry;
479 			wakeup_needed = true;
480 		}
481 
482 	if (wakeup_needed) {
483 		selwakeup(&tcp_log_sel);
484 		wakeup(&tcp_log_dev_listeners);
485 	}
486 
487 	rv = 0;
488 
489 done:
490 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
491 	TCP_LOG_DEV_QUEUE_UNLOCK();
492 	return (rv);
493 }
494 
495 static int
496 tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
497 {
498 
499 	/* TODO: Support intelligent unloading. */
500 	switch (type) {
501 	case MOD_LOAD:
502 		if (bootverbose)
503 			printf("tcp_log: tcp_log device\n");
504 		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
505 		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
506 		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
507 			 "tcp_log device queues", MTX_DEF);
508 		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
509 		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
510 		    "tcp_log");
511 		break;
512 	default:
513 		return (EOPNOTSUPP);
514 	}
515 
516 	return (0);
517 }
518 
/*
 * Module glue: name the module "tcp_log_dev", route its events to
 * tcp_log_dev_modevent() (with a NULL argument), and declare version 1.
 */
519 DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
520 MODULE_VERSION(tcp_log_dev, 1);
521