/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2016-2017
 *	Netflix Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int tcp_log_dev_listeners = 0;

static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);

enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version =	D_VERSION,
	.d_read =	tcp_log_dev_read,
	.d_open =	tcp_log_dev_open,
	.d_write =	tcp_log_dev_write,
	.d_poll =	tcp_log_dev_poll,
	.d_ioctl =	tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap =	tcp_log_dev_mmap,
#endif
	.d_name =	"tcp_log",
};

static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Clear the refcount. If appropriate, it will remove the entry from the
 * queue and call the destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}

static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
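	/*
	 * Note: M_WAITOK means this allocation may sleep but will not fail,
	 * and M_ZERO returns the memory already zeroed, so no NULL check or
	 * explicit initialization of the new structure is needed below.
	 */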
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list, and
		 * take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}

static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}

static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
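		/*
		 * The transform callback below runs with the queue lock held
		 * and materializes the buffer for this entry; if it cannot,
		 * we report EBUSY rather than block here.
		 */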
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we should acquire
		 * the queue lock.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}

int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * If any listener is currently stuck on NULL, that means they are
	 * waiting. Point their head to this new entry.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
		    "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);
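/*
 * For reference, a minimal userland reader of this device might look roughly
 * like the sketch below (illustrative only; error handling, partial-read
 * handling, and record parsing are omitted, the buffer size is arbitrary,
 * and handle_chunk() is a hypothetical consumer).  As the read path above
 * suggests, each chunk the driver returns begins with a struct
 * tcp_log_common_header whose tlch_length gives the chunk's total length.
 *
 *	int fd = open("/dev/tcp_log", O_RDONLY);
 *	char buf[256 * 1024];
 *	ssize_t n;
 *
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		handle_chunk(buf, n);
 *	close(fd);
 */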