xref: /freebsd/sys/dev/tcp_log/tcp_log_dev.c (revision 2e3f49888ec8851bafb22011533217487764fdb0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2016-2017 Netflix, Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  */
28 
29 #include <sys/param.h>
30 #include <sys/conf.h>
31 #include <sys/fcntl.h>
32 #include <sys/filio.h>
33 #include <sys/kernel.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/poll.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/mutex.h>
41 #include <sys/selinfo.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/sysctl.h>
45 #include <sys/tree.h>
46 #include <sys/uio.h>
47 #include <machine/atomic.h>
48 #include <sys/counter.h>
49 
50 #include <dev/tcp_log/tcp_log_dev.h>
51 
#ifdef TCPLOG_DEBUG_COUNTERS
/*
 * Debug counters defined outside this file: this driver bumps
 * tcp_log_que_freed once per destroyed queue entry and adds the number of
 * bytes copied out to tcp_log_que_read.
 */
extern counter_u64_t tcp_log_que_freed;
extern counter_u64_t tcp_log_que_read;
#endif /* TCPLOG_DEBUG_COUNTERS */
56 
57 static struct cdev *tcp_log_dev;
58 static struct selinfo tcp_log_sel;
59 
60 static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
61 static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);
62 
63 MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");
64 
65 static int	tcp_log_dev_listeners = 0;
66 
67 static struct mtx tcp_log_dev_queue_lock;
68 
69 #define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
70 #define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
71 #define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
72 #define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
73 #define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
74 #define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))
75 
76 static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
77 static void	tcp_log_dev_clear_cdevpriv(void *data);
78 static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
79     int devtype __unused, struct thread *td __unused);
80 static int	tcp_log_dev_write(struct cdev *dev __unused,
81     struct uio *uio __unused, int flags __unused);
82 static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
83     int flags __unused);
84 static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
85     caddr_t data, int fflag __unused, struct thread *td __unused);
86 static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
87     struct thread *td);
88 
/*
 * Tracks, within a single code path, whether that path currently holds the
 * queue lock.  Checked by tcp_log_dev_queue_validate_lock().
 */
enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,	/* Queue lock is not held. */
	QUEUE_LOCKED = 1,	/* Queue lock is held. */
};
93 
94 static struct cdevsw tcp_log_cdevsw = {
95 	.d_version =	D_VERSION,
96 	.d_read =	tcp_log_dev_read,
97 	.d_open =	tcp_log_dev_open,
98 	.d_write =	tcp_log_dev_write,
99 	.d_poll =	tcp_log_dev_poll,
100 	.d_ioctl =	tcp_log_dev_ioctl,
101 #ifdef NOTYET
102 	.d_mmap =	tcp_log_dev_mmap,
103 #endif
104 	.d_name =	"tcp_log",
105 };
106 
/*
 * Under INVARIANTS, assert that the queue mutex really is in the state the
 * caller believes it to be (QUEUE_LOCKED or QUEUE_UNLOCKED), and panic on
 * any other value.  Without INVARIANTS this compiles to nothing.
 */
static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	if (lockstate == QUEUE_LOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	} else if (lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
	} else {
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}
125 
126 /*
127  * Clear the refcount. If appropriate, it will remove the entry from the
128  * queue and call the destructor.
129  *
130  * This must be called with the queue lock held.
131  */
132 static void
133 tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
134 {
135 
136 	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));
137 
138 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
139 
140 	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
141 #ifdef TCPLOG_DEBUG_COUNTERS
142 		counter_u64_add(tcp_log_que_freed, 1);
143 #endif
144 		/* Remove the entry from the queue and call the destructor. */
145 		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
146 		    tldq_queue);
147 		(*entry->tldq_dtor)(entry);
148 	}
149 }
150 
151 static void
152 tcp_log_dev_clear_cdevpriv(void *data)
153 {
154 	struct tcp_log_dev_info *priv;
155 	struct tcp_log_dev_queue *entry, *entry_tmp;
156 
157 	priv = (struct tcp_log_dev_info *)data;
158 	if (priv == NULL)
159 		return;
160 
161 	/*
162 	 * Lock the queue and drop our references. We hold references to all
163 	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
164 	 * entries in the queue).
165 	 *
166 	 * Because we don't want anyone adding addition things to the queue
167 	 * while we are doing this, we lock the queue.
168 	 */
169 	TCP_LOG_DEV_QUEUE_LOCK();
170 	if (priv->tldi_head != NULL) {
171 		entry = priv->tldi_head;
172 		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
173 		    tldq_queue, entry_tmp) {
174 			tcp_log_dev_clear_refcount(entry);
175 		}
176 	}
177 	tcp_log_dev_listeners--;
178 	KASSERT(tcp_log_dev_listeners >= 0,
179 	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
180 	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
181 	    tldi_list);
182 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
183 	TCP_LOG_DEV_QUEUE_UNLOCK();
184 	free(priv, M_TCPLOGDEV);
185 }
186 
187 static int
188 tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
189     struct thread *td __unused)
190 {
191 	struct tcp_log_dev_info *priv;
192 	struct tcp_log_dev_queue *entry;
193 	int rv;
194 
195 	/*
196 	 * Ideally, we shouldn't see these because of file system
197 	 * permissions.
198 	 */
199 	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
200 		return (ENODEV);
201 
202 	/* Allocate space to hold information about where we are. */
203 	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
204 	    M_ZERO | M_WAITOK);
205 
206 	/* Stash the private data away. */
207 	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
208 	if (!rv) {
209 		/*
210 		 * Increase the listener count, add this reader to the list, and
211 		 * take references on all current queues.
212 		 */
213 		TCP_LOG_DEV_QUEUE_LOCK();
214 		tcp_log_dev_listeners++;
215 		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
216 		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
217 		if (priv->tldi_head != NULL)
218 			priv->tldi_cur = priv->tldi_head->tldq_buf;
219 		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
220 			TCP_LOG_DEV_QUEUE_REF(entry);
221 		TCP_LOG_DEV_QUEUE_UNLOCK();
222 	} else {
223 		/* Free the entry. */
224 		free(priv, M_TCPLOGDEV);
225 	}
226 	return (rv);
227 }
228 
229 static int
230 tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
231     int flags __unused)
232 {
233 
234 	return (ENODEV);
235 }
236 
237 static __inline void
238 tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
239 {
240 	struct tcp_log_dev_queue *entry;
241 
242 	KASSERT(priv->tldi_head != NULL,
243 	    ("%s:%d: priv->tldi_head unexpectedly NULL",
244 	    __func__, __LINE__));
245 	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
246 	    ("%s:%d: buffer mismatch (%p vs %p)",
247 	    __func__, __LINE__, priv->tldi_head->tldq_buf,
248 	    priv->tldi_cur));
249 	tcp_log_dev_queue_validate_lock(*lockstate);
250 
251 	if (*lockstate == QUEUE_UNLOCKED) {
252 		TCP_LOG_DEV_QUEUE_LOCK();
253 		*lockstate = QUEUE_LOCKED;
254 	}
255 	entry = priv->tldi_head;
256 	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
257 	tcp_log_dev_clear_refcount(entry);
258 	priv->tldi_cur = NULL;
259 }
260 
261 static int
262 tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
263 {
264 	struct tcp_log_common_header *buf;
265 	struct tcp_log_dev_info *priv;
266 	struct tcp_log_dev_queue *entry;
267 	ssize_t len;
268 	int lockstate, rv;
269 
270 	/* Get our private info. */
271 	rv = devfs_get_cdevpriv((void **)&priv);
272 	if (rv)
273 		return (rv);
274 
275 	lockstate = QUEUE_UNLOCKED;
276 
277 	/* Do we need to get a new buffer? */
278 	while (priv->tldi_cur == NULL ||
279 	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
280 		/* Did we somehow forget to rotate? */
281 		KASSERT(priv->tldi_cur == NULL,
282 		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
283 		    __LINE__));
284 		if (priv->tldi_cur != NULL)
285 			tcp_log_dev_rotate_bufs(priv, &lockstate);
286 
287 		/*
288 		 * Before we start looking at tldi_head, we need a lock on the
289 		 * queue to make sure tldi_head stays stable.
290 		 */
291 		if (lockstate == QUEUE_UNLOCKED) {
292 			TCP_LOG_DEV_QUEUE_LOCK();
293 			lockstate = QUEUE_LOCKED;
294 		}
295 
296 		/* We need the next buffer. Do we have one? */
297 		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
298 			rv = EAGAIN;
299 			goto done;
300 		}
301 		if (priv->tldi_head == NULL) {
302 			/* Sleep and wait for more things we can read. */
303 			rv = mtx_sleep(&tcp_log_dev_listeners,
304 			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
305 			if (rv)
306 				goto done;
307 			if (priv->tldi_head == NULL)
308 				continue;
309 		}
310 
311 		/*
312 		 * We have an entry to read. We want to try to create a
313 		 * buffer, if one doesn't already exist.
314 		 */
315 		entry = priv->tldi_head;
316 		if (entry->tldq_buf == NULL) {
317 			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
318 			buf = (*entry->tldq_xform)(entry);
319 			if (buf == NULL) {
320 				rv = EBUSY;
321 				goto done;
322 			}
323 			entry->tldq_buf = buf;
324 		}
325 
326 		priv->tldi_cur = entry->tldq_buf;
327 		priv->tldi_off = 0;
328 	}
329 
330 	/* Copy what we can from this buffer to the output buffer. */
331 	if (uio->uio_resid > 0) {
332 		/* Drop locks so we can take page faults. */
333 		if (lockstate == QUEUE_LOCKED)
334 			TCP_LOG_DEV_QUEUE_UNLOCK();
335 		lockstate = QUEUE_UNLOCKED;
336 
337 		KASSERT(priv->tldi_cur != NULL,
338 		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));
339 
340 		/* Copy as much as we can to this uio. */
341 		len = priv->tldi_cur->tlch_length - priv->tldi_off;
342 		if (len > uio->uio_resid)
343 			len = uio->uio_resid;
344 		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
345 		    len, uio);
346 		if (rv != 0)
347 			goto done;
348 		priv->tldi_off += len;
349 #ifdef TCPLOG_DEBUG_COUNTERS
350 		counter_u64_add(tcp_log_que_read, len);
351 #endif
352 	}
353 	/* Are we done with this buffer? If so, find the next one. */
354 	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
355 		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
356 		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
357 		    (uintmax_t)priv->tldi_off,
358 		    (uintmax_t)priv->tldi_cur->tlch_length));
359 		tcp_log_dev_rotate_bufs(priv, &lockstate);
360 	}
361 done:
362 	tcp_log_dev_queue_validate_lock(lockstate);
363 	if (lockstate == QUEUE_LOCKED)
364 		TCP_LOG_DEV_QUEUE_UNLOCK();
365 	return (rv);
366 }
367 
368 static int
369 tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
370     int fflag __unused, struct thread *td __unused)
371 {
372 	struct tcp_log_dev_info *priv;
373 	int rv;
374 
375 	/* Get our private info. */
376 	rv = devfs_get_cdevpriv((void **)&priv);
377 	if (rv)
378 		return (rv);
379 
380 	/*
381 	 * Set things. Here, we are most concerned about the non-blocking I/O
382 	 * flag.
383 	 */
384 	rv = 0;
385 	switch (cmd) {
386 	case FIONBIO:
387 		break;
388 	case FIOASYNC:
389 		if (*(int *)data != 0)
390 			rv = EINVAL;
391 		break;
392 	default:
393 		rv = ENOIOCTL;
394 	}
395 	return (rv);
396 }
397 
398 static int
399 tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
400 {
401 	struct tcp_log_dev_info *priv;
402 	int revents;
403 
404 	/*
405 	 * Get our private info. If this fails, claim that all events are
406 	 * ready. That should prod the user to do something that will
407 	 * make the error evident to them.
408 	 */
409 	if (devfs_get_cdevpriv((void **)&priv))
410 		return (events);
411 
412 	revents = 0;
413 	if (events & (POLLIN | POLLRDNORM)) {
414 		/*
415 		 * We can (probably) read right now if we are partway through
416 		 * a buffer or if we are just about to start a buffer.
417 		 * Because we are going to read tldi_head, we should acquire
418 		 * a read lock on the queue.
419 		 */
420 		TCP_LOG_DEV_QUEUE_LOCK();
421 		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
422 		    (priv->tldi_cur != NULL &&
423 		    priv->tldi_off < priv->tldi_cur->tlch_length))
424 			revents = events & (POLLIN | POLLRDNORM);
425 		else
426 			selrecord(td, &tcp_log_sel);
427 		TCP_LOG_DEV_QUEUE_UNLOCK();
428 	} else {
429 		/*
430 		 * It only makes sense to poll for reading. So, again, prod the
431 		 * user to do something that will make the error of their ways
432 		 * apparent.
433 		 */
434 		revents = events;
435 	}
436 	return (revents);
437 }
438 
439 int
440 tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
441 {
442 	struct tcp_log_dev_info *priv;
443 	int rv;
444 	bool wakeup_needed;
445 
446 	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
447 	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
448 	    __func__));
449 	KASSERT(entry->tldq_dtor != NULL,
450 	    ("%s: Called with tldq_dtor set to NULL", __func__));
451 
452 	/* Get a lock on the queue. */
453 	TCP_LOG_DEV_QUEUE_LOCK();
454 
455 	/* If no one is listening, tell the caller to free the resources. */
456 	if (tcp_log_dev_listeners == 0) {
457 		rv = ENXIO;
458 		goto done;
459 	}
460 
461 	/* Add this to the end of the tailq. */
462 	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);
463 
464 	/* Add references for all current listeners. */
465 	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);
466 
467 	/*
468 	 * If any listener is currently stuck on NULL, that means they are
469 	 * waiting. Point their head to this new entry.
470 	 */
471 	wakeup_needed = false;
472 	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
473 		if (priv->tldi_head == NULL) {
474 			priv->tldi_head = entry;
475 			wakeup_needed = true;
476 		}
477 
478 	if (wakeup_needed) {
479 		selwakeup(&tcp_log_sel);
480 		wakeup(&tcp_log_dev_listeners);
481 	}
482 
483 	rv = 0;
484 
485 done:
486 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
487 	TCP_LOG_DEV_QUEUE_UNLOCK();
488 	return (rv);
489 }
490 
491 static int
492 tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
493 {
494 
495 	/* TODO: Support intelligent unloading. */
496 	switch (type) {
497 	case MOD_LOAD:
498 		if (bootverbose)
499 			printf("tcp_log: tcp_log device\n");
500 		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
501 		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
502 		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
503 			 "tcp_log device queues", MTX_DEF);
504 		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
505 		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
506 		    "tcp_log");
507 		break;
508 	default:
509 		return (EOPNOTSUPP);
510 	}
511 
512 	return (0);
513 }
514 
515 DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
516 MODULE_VERSION(tcp_log_dev, 1);
517