/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2016-2017 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int tcp_log_dev_listeners = 0;
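
/*
 * As used throughout this file, tcp_log_dev_queue_lock protects the queue of
 * pending log entries (tcp_log_dev_queue_head), the list of readers
 * (tcp_log_dev_reader_head), the listener count, and the per-entry reference
 * counts.
 */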
static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);

enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version = D_VERSION,
	.d_read = tcp_log_dev_read,
	.d_open = tcp_log_dev_open,
	.d_write = tcp_log_dev_write,
	.d_poll = tcp_log_dev_poll,
	.d_ioctl = tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap = tcp_log_dev_mmap,
#endif
	.d_name = "tcp_log",
};

static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Clear the refcount. If appropriate, it will remove the entry from the
 * queue and call the destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}
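
/*
 * Device open: each reader gets a tcp_log_dev_info hung off its descriptor
 * via devfs_set_cdevpriv(). tcp_log_dev_clear_cdevpriv() above runs when the
 * descriptor is finally closed and drops the references taken here.
 */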
static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list,
		 * and take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}

static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}
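
/*
 * Read path: the reader consumes the buffer at tldi_cur, tracking its
 * progress in tldi_off. When a buffer is exhausted it is rotated out
 * (dropping the queue entry's reference) and the next entry's buffer is
 * built on demand via that entry's tldq_xform callback. If there are no
 * pending entries, the read blocks unless the descriptor is non-blocking.
 */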
static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we should acquire
		 * a read lock on the queue.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}
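
/*
 * tcp_log_dev_add_log() is the producer-side entry point: a subsystem that
 * wants to publish a log buffer hands in a queue entry with either a
 * pre-built buffer (tldq_buf) or an on-demand builder (tldq_xform), plus a
 * destructor (tldq_dtor). A minimal sketch; the my_xform/my_dtor callbacks
 * and the allocation below are illustrative assumptions, not taken from
 * this file:
 *
 *	struct tcp_log_dev_queue *entry;
 *
 *	entry = malloc(sizeof(*entry), M_TCPLOGDEV, M_WAITOK | M_ZERO);
 *	entry->tldq_buf = NULL;		// built lazily by my_xform()
 *	entry->tldq_xform = my_xform;	// returns struct tcp_log_common_header *
 *	entry->tldq_dtor = my_dtor;	// frees the entry and its buffer
 *	if (tcp_log_dev_add_log(entry) != 0)
 *		my_dtor(entry);		// no listeners: caller cleans up
 */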
int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * If any listener is currently stuck on NULL, that means they are
	 * waiting. Point their head to this new entry.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
		    "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);