/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2016-2017 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int tcp_log_dev_listeners = 0;

static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);

enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version = D_VERSION,
	.d_read = tcp_log_dev_read,
	.d_open = tcp_log_dev_open,
	.d_write = tcp_log_dev_write,
	.d_poll = tcp_log_dev_poll,
	.d_ioctl = tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap = tcp_log_dev_mmap,
#endif
	.d_name = "tcp_log",
};

static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Clear the refcount. If appropriate, it will remove the entry from the
 * queue and call the destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}

static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list, and
		 * take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}

static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}

static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we should acquire
		 * a read lock on the queue.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}
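
/*
 * Producer-side usage sketch (illustrative only; the helper and variable
 * names below are hypothetical and do not come from this file).  A producer
 * that wants to publish a log buffer to readers of /dev/tcp_log allocates a
 * struct tcp_log_dev_queue, points tldq_buf at a finished buffer (or supplies
 * a tldq_xform callback that can build one on demand), sets a tldq_dtor to
 * release the memory, and calls tcp_log_dev_add_log().  On success the queue
 * owns the entry and the destructor runs, with the queue lock held, once the
 * last reader drops its reference; on failure (for example ENXIO when there
 * are no listeners) ownership stays with the caller.
 *
 *	static void
 *	my_dtor(struct tcp_log_dev_queue *entry)
 *	{
 *		// Free whatever tldq_buf points at, then the entry itself.
 *		free(entry, M_TCPLOGDEV);
 *	}
 *
 *	entry = malloc(sizeof(*entry), M_TCPLOGDEV, M_ZERO | M_WAITOK);
 *	entry->tldq_buf = buf;		// struct tcp_log_common_header *
 *	entry->tldq_xform = NULL;	// not needed when tldq_buf is set
 *	entry->tldq_dtor = my_dtor;
 *	if (tcp_log_dev_add_log(entry) != 0)
 *		free(entry, M_TCPLOGDEV);	// no listeners; clean up ourselves
 */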

int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * If any listener is currently stuck on NULL, that means they are
	 * waiting. Point their head to this new entry.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
		    "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);
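
/*
 * Reader-side sketch (illustrative only; not part of this module).  A
 * userland consumer opens /dev/tcp_log (created above as a root-owned,
 * mode 0400 node) and reads back-to-back records.  Each record starts with
 * a struct tcp_log_common_header whose tlch_length field gives the total
 * record length, header included.  A single read(2) returns at most the
 * remainder of the current record, so a record may need to be reassembled
 * from several reads when the caller's buffer is small; the buffer size
 * below is an arbitrary choice.
 *
 *	#include <dev/tcp_log/tcp_log_dev.h>
 *
 *	char buf[65536];
 *	ssize_t n;
 *	int fd;
 *
 *	fd = open("/dev/tcp_log", O_RDONLY);
 *	while ((n = read(fd, buf, sizeof(buf))) > 0) {
 *		// Append the bytes to the record currently being assembled;
 *		// once tlch_length bytes have accumulated, the record is
 *		// complete and the next one begins.
 *	}
 *	close(fd);
 */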