/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2016-2017 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int tcp_log_dev_listeners = 0;

static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))
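
/*
 * Locking and lifetime notes: the queue mutex protects the global entry
 * queue, the reader list, and each reader's tldi_head pointer.  Each
 * entry carries one reference per listener that has not yet consumed it;
 * when the last reference is released, the entry is removed from the
 * queue and its destructor runs.
 */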

static void tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void tcp_log_dev_clear_cdevpriv(void *data);
static int tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);

enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version =	D_VERSION,
	.d_read =	tcp_log_dev_read,
	.d_open =	tcp_log_dev_open,
	.d_write =	tcp_log_dev_write,
	.d_poll =	tcp_log_dev_poll,
	.d_ioctl =	tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap =	tcp_log_dev_mmap,
#endif
	.d_name =	"tcp_log",
};

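/*
 * Under INVARIANTS, assert that the queue lock state matches what the
 * caller believes it to be; this compiles to a no-op in non-debug kernels.
 */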
static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Clear the refcount. If appropriate, it will remove the entry from the
 * queue and call the destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

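/*
 * devfs cdevpriv destructor: runs when the last reference to an open
 * instance goes away.  Drops this reader's references on all queue
 * entries it had not yet consumed and removes it from the reader list.
 */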
static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}

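/*
 * Open routine for the device: reject anything other than read-only
 * access, allocate per-open state, register this reader, and take a
 * reference on every entry already in the queue.
 */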
static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list, and
		 * take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

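/* The device is read-only; any write attempt fails with ENODEV. */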
static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}

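/*
 * Advance a reader past its current buffer: drop the reference on the
 * entry just consumed and point tldi_head at the next queue entry.
 * Acquires the queue lock if the caller does not already hold it, and
 * updates *lockstate to match.
 */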
static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}

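/*
 * Read routine: hand the reader as much of the current buffer as fits in
 * the uio, materializing the buffer via the entry's tldq_xform callback
 * the first time any reader touches it.  Blocks (unless FNONBLOCK is set)
 * while the queue is empty, and rotates to the next entry once a buffer
 * has been fully consumed.
 */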
static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}
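
/*
 * A minimal sketch of a userland consumer, assuming the reader parses the
 * producer-defined records itself (parse_records() is hypothetical):
 *
 *	int fd = open("/dev/tcp_log", O_RDONLY);
 *	char buf[65536];
 *	ssize_t n;
 *
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		parse_records(buf, n);
 *	close(fd);
 *
 * Readers that do not want to block may open with O_NONBLOCK (read then
 * returns EAGAIN when the queue is empty) and use poll(2) to wait for
 * POLLIN.
 */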

static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

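/*
 * Poll routine: report POLLIN/POLLRDNORM when the reader is partway
 * through a buffer or has an unconsumed entry queued; otherwise record
 * the thread for a later selwakeup().  Polling for anything other than
 * reading returns the requested events unchanged.
 */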
static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we need to acquire
		 * the queue lock.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}

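/*
 * Add a new entry to the log queue on behalf of a producer.  The entry
 * must supply a destructor and either a pre-built buffer or a tldq_xform
 * callback to build one on demand.  Returns ENXIO, leaving cleanup to the
 * caller, when no readers are listening; otherwise, takes one reference
 * per current listener and wakes any readers waiting for data.
 */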
int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * If any listener is currently stuck on NULL, that means they are
	 * waiting. Point their head to this new entry.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}
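
/*
 * A sketch of the producer side, assuming the caller allocates the queue
 * entry itself (my_xform and my_dtor are hypothetical callbacks; real
 * producers typically embed struct tcp_log_dev_queue in a larger record):
 *
 *	struct tcp_log_dev_queue *entry;
 *
 *	entry = malloc(sizeof(*entry), M_TCPLOGDEV, M_WAITOK | M_ZERO);
 *	entry->tldq_buf = NULL;		// built lazily by my_xform
 *	entry->tldq_xform = my_xform;	// returns a tcp_log_common_header
 *	entry->tldq_dtor = my_dtor;	// frees entry on last reference
 *	if (tcp_log_dev_add_log(entry) != 0)
 *		my_dtor(entry);		// no listeners; clean up ourselves
 */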

static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
		    "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);