xref: /freebsd/sys/dev/tcp_log/tcp_log_dev.c (revision 63f537551380d2dab29fa402ad1269feae17e594)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2016-2017 Netflix, Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/conf.h>
32 #include <sys/fcntl.h>
33 #include <sys/filio.h>
34 #include <sys/kernel.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/poll.h>
39 #include <sys/queue.h>
40 #include <sys/refcount.h>
41 #include <sys/mutex.h>
42 #include <sys/selinfo.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/sysctl.h>
46 #include <sys/tree.h>
47 #include <sys/uio.h>
48 #include <machine/atomic.h>
49 #include <sys/counter.h>
50 
51 #include <dev/tcp_log/tcp_log_dev.h>
52 
#ifdef TCPLOG_DEBUG_COUNTERS
/*
 * Debug counters defined outside this file. Here, tcp_log_que_read is
 * bumped by the number of bytes copied out on each read, and
 * tcp_log_que_freed is bumped once for every queue entry whose last
 * reference is dropped.
 */
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

/* Device node created when the module is loaded. */
static struct cdev *tcp_log_dev;
/* Single selinfo shared by every poller of the device. */
static struct selinfo tcp_log_sel;

/* Queue of log entries; new entries are appended at the tail. */
static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
/* List of readers; one tcp_log_dev_info is added per successful open. */
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

/*
 * Number of readers that currently have the device open. The address of
 * this variable is also the sleep channel that blocked readers wait on.
 */
static int	tcp_log_dev_listeners = 0;

/*
 * Held in this file while touching the entry queue, the reader list, the
 * listener count, and each reader's tldi_head.
 */
static struct mtx tcp_log_dev_queue_lock;

/* Lock/unlock, ownership assertions, and per-entry refcount helpers. */
#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))
76 
77 static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
78 static void	tcp_log_dev_clear_cdevpriv(void *data);
79 static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
80     int devtype __unused, struct thread *td __unused);
81 static int	tcp_log_dev_write(struct cdev *dev __unused,
82     struct uio *uio __unused, int flags __unused);
83 static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
84     int flags __unused);
85 static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
86     caddr_t data, int fflag __unused, struct thread *td __unused);
87 static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
88     struct thread *td);
89 
/*
 * Records whether the current thread believes it holds
 * tcp_log_dev_queue_lock. The read path keeps one of these values in a
 * local variable and passes it to the helpers that may take the lock.
 */
enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};
94 
/*
 * Entry-point table for the device. It is handed to make_dev_credf() when
 * the device node is created at module load.
 */
static struct cdevsw tcp_log_cdevsw = {
	.d_version =	D_VERSION,
	.d_read =	tcp_log_dev_read,
	.d_open =	tcp_log_dev_open,
	.d_write =	tcp_log_dev_write,
	.d_poll =	tcp_log_dev_poll,
	.d_ioctl =	tcp_log_dev_ioctl,
#ifdef NOTYET
	/* Placeholder: no tcp_log_dev_mmap is visible in this file. */
	.d_mmap =	tcp_log_dev_mmap,
#endif
	.d_name =	"tcp_log",
};
107 
/*
 * Debug-only sanity check: assert that the queue mutex is in the state
 * the caller believes it to be in. lockstate is one of the
 * tcp_log_dev_queue_lock_state values. The body is compiled only when
 * INVARIANTS is defined.
 */
static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	if (lockstate == QUEUE_LOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	} else if (lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
	} else {
		/* Neither known state: the caller's bookkeeping is broken. */
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}
126 
127 /*
128  * Clear the refcount. If appropriate, it will remove the entry from the
129  * queue and call the destructor.
130  *
131  * This must be called with the queue lock held.
132  */
133 static void
134 tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
135 {
136 
137 	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));
138 
139 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
140 
141 	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
142 #ifdef TCPLOG_DEBUG_COUNTERS
143 		counter_u64_add(tcp_log_que_freed, 1);
144 #endif
145 		/* Remove the entry from the queue and call the destructor. */
146 		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
147 		    tldq_queue);
148 		(*entry->tldq_dtor)(entry);
149 	}
150 }
151 
152 static void
153 tcp_log_dev_clear_cdevpriv(void *data)
154 {
155 	struct tcp_log_dev_info *priv;
156 	struct tcp_log_dev_queue *entry, *entry_tmp;
157 
158 	priv = (struct tcp_log_dev_info *)data;
159 	if (priv == NULL)
160 		return;
161 
162 	/*
163 	 * Lock the queue and drop our references. We hold references to all
164 	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
165 	 * entries in the queue).
166 	 *
167 	 * Because we don't want anyone adding addition things to the queue
168 	 * while we are doing this, we lock the queue.
169 	 */
170 	TCP_LOG_DEV_QUEUE_LOCK();
171 	if (priv->tldi_head != NULL) {
172 		entry = priv->tldi_head;
173 		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
174 		    tldq_queue, entry_tmp) {
175 			tcp_log_dev_clear_refcount(entry);
176 		}
177 	}
178 	tcp_log_dev_listeners--;
179 	KASSERT(tcp_log_dev_listeners >= 0,
180 	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
181 	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
182 	    tldi_list);
183 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
184 	TCP_LOG_DEV_QUEUE_UNLOCK();
185 	free(priv, M_TCPLOGDEV);
186 }
187 
188 static int
189 tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
190     struct thread *td __unused)
191 {
192 	struct tcp_log_dev_info *priv;
193 	struct tcp_log_dev_queue *entry;
194 	int rv;
195 
196 	/*
197 	 * Ideally, we shouldn't see these because of file system
198 	 * permissions.
199 	 */
200 	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
201 		return (ENODEV);
202 
203 	/* Allocate space to hold information about where we are. */
204 	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
205 	    M_ZERO | M_WAITOK);
206 
207 	/* Stash the private data away. */
208 	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
209 	if (!rv) {
210 		/*
211 		 * Increase the listener count, add this reader to the list, and
212 		 * take references on all current queues.
213 		 */
214 		TCP_LOG_DEV_QUEUE_LOCK();
215 		tcp_log_dev_listeners++;
216 		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
217 		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
218 		if (priv->tldi_head != NULL)
219 			priv->tldi_cur = priv->tldi_head->tldq_buf;
220 		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
221 			TCP_LOG_DEV_QUEUE_REF(entry);
222 		TCP_LOG_DEV_QUEUE_UNLOCK();
223 	} else {
224 		/* Free the entry. */
225 		free(priv, M_TCPLOGDEV);
226 	}
227 	return (rv);
228 }
229 
/*
 * Write entry point. Writing is not supported: every call is refused
 * with ENODEV and none of the arguments are examined.
 */
static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}
237 
238 static __inline void
239 tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
240 {
241 	struct tcp_log_dev_queue *entry;
242 
243 	KASSERT(priv->tldi_head != NULL,
244 	    ("%s:%d: priv->tldi_head unexpectedly NULL",
245 	    __func__, __LINE__));
246 	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
247 	    ("%s:%d: buffer mismatch (%p vs %p)",
248 	    __func__, __LINE__, priv->tldi_head->tldq_buf,
249 	    priv->tldi_cur));
250 	tcp_log_dev_queue_validate_lock(*lockstate);
251 
252 	if (*lockstate == QUEUE_UNLOCKED) {
253 		TCP_LOG_DEV_QUEUE_LOCK();
254 		*lockstate = QUEUE_LOCKED;
255 	}
256 	entry = priv->tldi_head;
257 	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
258 	tcp_log_dev_clear_refcount(entry);
259 	priv->tldi_cur = NULL;
260 }
261 
/*
 * Read entry point: copy bytes from the reader's current log buffer to
 * the caller's uio.
 *
 * A single call copies from at most one buffer. When the reader has no
 * current buffer, the entry at tldi_head is selected first. If no entry
 * is queued, the call fails with EAGAIN when FNONBLOCK is set in flags;
 * otherwise it sleeps (interruptible, PCATCH) on the listener-count
 * channel until woken. An entry without a buffer is converted through its
 * tldq_xform callback, and EBUSY is returned if the callback yields NULL.
 * Once the buffer has been consumed completely, the reader moves on and
 * drops its reference on that entry.
 *
 * lockstate tracks whether this function currently holds the queue lock,
 * so the common exit path releases it only when it is held.
 *
 * Returns 0 on success or an errno value (from devfs_get_cdevpriv(),
 * mtx_sleep(), uiomove(), or the cases above).
 */
static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		/* Without INVARIANTS, recover by rotating now. */
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			/* Woken but still nothing for us: wait again. */
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		/* On a failed copy the offset is left where it was. */
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	/* Common exit: release the queue lock only if we still hold it. */
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}
368 
369 static int
370 tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
371     int fflag __unused, struct thread *td __unused)
372 {
373 	struct tcp_log_dev_info *priv;
374 	int rv;
375 
376 	/* Get our private info. */
377 	rv = devfs_get_cdevpriv((void **)&priv);
378 	if (rv)
379 		return (rv);
380 
381 	/*
382 	 * Set things. Here, we are most concerned about the non-blocking I/O
383 	 * flag.
384 	 */
385 	rv = 0;
386 	switch (cmd) {
387 	case FIONBIO:
388 		break;
389 	case FIOASYNC:
390 		if (*(int *)data != 0)
391 			rv = EINVAL;
392 		break;
393 	default:
394 		rv = ENOIOCTL;
395 	}
396 	return (rv);
397 }
398 
399 static int
400 tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
401 {
402 	struct tcp_log_dev_info *priv;
403 	int revents;
404 
405 	/*
406 	 * Get our private info. If this fails, claim that all events are
407 	 * ready. That should prod the user to do something that will
408 	 * make the error evident to them.
409 	 */
410 	if (devfs_get_cdevpriv((void **)&priv))
411 		return (events);
412 
413 	revents = 0;
414 	if (events & (POLLIN | POLLRDNORM)) {
415 		/*
416 		 * We can (probably) read right now if we are partway through
417 		 * a buffer or if we are just about to start a buffer.
418 		 * Because we are going to read tldi_head, we should acquire
419 		 * a read lock on the queue.
420 		 */
421 		TCP_LOG_DEV_QUEUE_LOCK();
422 		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
423 		    (priv->tldi_cur != NULL &&
424 		    priv->tldi_off < priv->tldi_cur->tlch_length))
425 			revents = events & (POLLIN | POLLRDNORM);
426 		else
427 			selrecord(td, &tcp_log_sel);
428 		TCP_LOG_DEV_QUEUE_UNLOCK();
429 	} else {
430 		/*
431 		 * It only makes sense to poll for reading. So, again, prod the
432 		 * user to do something that will make the error of their ways
433 		 * apparent.
434 		 */
435 		revents = events;
436 	}
437 	return (revents);
438 }
439 
440 int
441 tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
442 {
443 	struct tcp_log_dev_info *priv;
444 	int rv;
445 	bool wakeup_needed;
446 
447 	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
448 	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
449 	    __func__));
450 	KASSERT(entry->tldq_dtor != NULL,
451 	    ("%s: Called with tldq_dtor set to NULL", __func__));
452 
453 	/* Get a lock on the queue. */
454 	TCP_LOG_DEV_QUEUE_LOCK();
455 
456 	/* If no one is listening, tell the caller to free the resources. */
457 	if (tcp_log_dev_listeners == 0) {
458 		rv = ENXIO;
459 		goto done;
460 	}
461 
462 	/* Add this to the end of the tailq. */
463 	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);
464 
465 	/* Add references for all current listeners. */
466 	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);
467 
468 	/*
469 	 * If any listener is currently stuck on NULL, that means they are
470 	 * waiting. Point their head to this new entry.
471 	 */
472 	wakeup_needed = false;
473 	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
474 		if (priv->tldi_head == NULL) {
475 			priv->tldi_head = entry;
476 			wakeup_needed = true;
477 		}
478 
479 	if (wakeup_needed) {
480 		selwakeup(&tcp_log_sel);
481 		wakeup(&tcp_log_dev_listeners);
482 	}
483 
484 	rv = 0;
485 
486 done:
487 	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
488 	TCP_LOG_DEV_QUEUE_UNLOCK();
489 	return (rv);
490 }
491 
492 static int
493 tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
494 {
495 
496 	/* TODO: Support intelligent unloading. */
497 	switch (type) {
498 	case MOD_LOAD:
499 		if (bootverbose)
500 			printf("tcp_log: tcp_log device\n");
501 		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
502 		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
503 		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
504 			 "tcp_log device queues", MTX_DEF);
505 		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
506 		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
507 		    "tcp_log");
508 		break;
509 	default:
510 		return (EOPNOTSUPP);
511 	}
512 
513 	return (0);
514 }
515 
/*
 * Declare the tcp_log_dev module with tcp_log_dev_modevent() as its
 * event handler, and declare its module version as 1.
 */
DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);
518