/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2016-2017 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int tcp_log_dev_listeners = 0;

static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);

enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version =	D_VERSION,
	.d_read =	tcp_log_dev_read,
	.d_open =	tcp_log_dev_open,
	.d_write =	tcp_log_dev_write,
	.d_poll =	tcp_log_dev_poll,
	.d_ioctl =	tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap =	tcp_log_dev_mmap,
#endif
	.d_name =	"tcp_log",
};

static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Clear the refcount. If appropriate, it will remove the entry from the
 * queue and call the destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}

static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
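	/*
	 * devfs_set_cdevpriv() associates this per-open state with the
	 * file descriptor being opened and registers
	 * tcp_log_dev_clear_cdevpriv() as the destructor to run when that
	 * descriptor goes away.
	 */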
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list, and
		 * take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}

static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}

static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we need to
		 * acquire the queue lock.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}

int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * If any listener is currently stuck on NULL, that means they are
	 * waiting. Point their head to this new entry.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
		    "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);
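/*
 * Example producer usage (illustrative sketch only, based on the contract
 * asserted in tcp_log_dev_add_log() above). The M_EXAMPLE malloc type and
 * the example_* callbacks are hypothetical names, not symbols defined in
 * this file or its header.
 *
 *	struct tcp_log_dev_queue *entry;
 *
 *	entry = malloc(sizeof(*entry), M_EXAMPLE, M_NOWAIT | M_ZERO);
 *	if (entry == NULL)
 *		return;
 *	entry->tldq_buf = NULL;			// built lazily by tldq_xform
 *	entry->tldq_xform = example_xform;	// returns a tcp_log_common_header buffer
 *	entry->tldq_dtor = example_dtor;	// frees the entry once all readers drop it
 *	if (tcp_log_dev_add_log(entry) != 0)
 *		example_dtor(entry);		// no listeners; caller frees the resources
 */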