/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2016-2017 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int tcp_log_dev_listeners = 0;

static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);
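
/*
 * Overview of the device model, as implemented in this file:
 *
 * The "tcp_log" character device gives userland read-only access to queued
 * TCP log data.  Producers append reference-counted entries
 * (struct tcp_log_dev_queue) to a single global queue via
 * tcp_log_dev_add_log().  Each reader that has the device open is tracked by
 * a struct tcp_log_dev_info, which records the next queue entry the reader
 * has yet to consume (tldi_head), the buffer it is currently copying out
 * (tldi_cur), and its offset within that buffer (tldi_off).  An entry's
 * reference count equals the number of readers that still need to see it;
 * when the last reader moves past an entry, the entry is unlinked and its
 * destructor (tldq_dtor) runs.  The global queue, the reader list, and the
 * listener count are all protected by tcp_log_dev_queue_lock.
 */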

enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version =	D_VERSION,
	.d_read =	tcp_log_dev_read,
	.d_open =	tcp_log_dev_open,
	.d_write =	tcp_log_dev_write,
	.d_poll =	tcp_log_dev_poll,
	.d_ioctl =	tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap =	tcp_log_dev_mmap,
#endif
	.d_name =	"tcp_log",
};

static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Drop a reference on an entry. If this drops the last reference, remove the
 * entry from the queue and call the destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}

static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list,
		 * and take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}
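
/*
 * Reading from the device: tcp_log_dev_read() walks the global queue on
 * behalf of one reader.  Each entry's buffer is materialized lazily: if
 * tldq_buf is still NULL when a reader reaches the entry, the entry's
 * tldq_xform callback is invoked (with the queue lock held) to build a
 * buffer that begins with a struct tcp_log_common_header.  The reader
 * copies the buffer out in tlch_length-bounded chunks; once the whole
 * buffer has been consumed, tcp_log_dev_rotate_bufs() advances tldi_head
 * to the next entry and drops this reader's reference on the old one.
 * If the queue is empty, the read either fails with EAGAIN (non-blocking)
 * or sleeps until tcp_log_dev_add_log() posts new data.
 */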

static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}

static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}
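
/*
 * Only a minimal set of ioctls is recognized: FIONBIO is accepted (the
 * non-blocking behavior itself reaches the read path via the FNONBLOCK
 * I/O flag), and FIOASYNC is accepted only when async I/O is being
 * disabled.  Everything else returns ENOIOCTL.
 */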

static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we should acquire
		 * a read lock on the queue.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}
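
/*
 * tcp_log_dev_add_log() is the producer-side entry point: it hands a
 * reference-counted entry to every current reader.  The entry must have a
 * destructor (tldq_dtor) and either a prebuilt buffer (tldq_buf) or an
 * xform callback (tldq_xform) that can build one on demand; the buffer
 * must start with a struct tcp_log_common_header whose tlch_length covers
 * the whole buffer.  If no reader has the device open, ENXIO is returned
 * and the caller remains responsible for freeing the entry.
 *
 * Illustrative caller sketch (the entry allocation and the helper names
 * my_xform/my_dtor are hypothetical, not part of this file):
 *
 *	entry->tldq_buf = NULL;		(or point it at a finished buffer)
 *	entry->tldq_xform = my_xform;	(builds the buffer when first read)
 *	entry->tldq_dtor = my_dtor;	(frees the entry and its data)
 *	if (tcp_log_dev_add_log(entry) != 0)
 *		(no readers; the caller must clean up the entry itself)
 */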

int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * If any listener is currently stuck on NULL, that means they are
	 * waiting. Point their head to this new entry.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
		    "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);