1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 #include <sys/types.h> 30 #include <sys/sysmacros.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/errno.h> 34 #include <sys/signal.h> 35 #include <sys/cred.h> 36 #include <sys/user.h> 37 #include <sys/conf.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/pathname.h> 41 #include <sys/file.h> 42 #include <sys/proc.h> 43 #include <sys/var.h> 44 #include <sys/cpuvar.h> 45 #include <sys/open.h> 46 #include <sys/cmn_err.h> 47 #include <sys/priocntl.h> 48 #include <sys/procset.h> 49 #include <sys/prsystm.h> 50 #include <sys/debug.h> 51 #include <sys/kmem.h> 52 #include <sys/atomic.h> 53 #include <sys/fcntl.h> 54 #include <sys/poll.h> 55 #include <sys/rctl.h> 56 #include <sys/port_impl.h> 57 58 #include <c2/audit.h> 59 #include <sys/nbmlock.h> 60 61 #ifdef DEBUG 62 63 static uint32_t afd_maxfd; /* # of entries in maximum allocated array */ 64 static uint32_t afd_alloc; /* count of kmem_alloc()s */ 65 static uint32_t afd_free; /* count of kmem_free()s */ 66 static uint32_t afd_wait; /* count of waits on non-zero ref count */ 67 #define MAXFD(x) (afd_maxfd = ((afd_maxfd >= (x))? afd_maxfd : (x))) 68 #define COUNT(x) atomic_add_32(&x, 1) 69 70 #else /* DEBUG */ 71 72 #define MAXFD(x) 73 #define COUNT(x) 74 75 #endif /* DEBUG */ 76 77 kmem_cache_t *file_cache; 78 79 static void port_close_fd(portfd_t *); 80 81 /* 82 * File descriptor allocation. 83 * 84 * fd_find(fip, minfd) finds the first available descriptor >= minfd. 85 * The most common case is open(2), in which minfd = 0, but we must also 86 * support fcntl(fd, F_DUPFD, minfd). 87 * 88 * The algorithm is as follows: we keep all file descriptors in an infix 89 * binary tree in which each node records the number of descriptors 90 * allocated in its right subtree, including itself. Starting at minfd, 91 * we ascend the tree until we find a non-fully allocated right subtree. 92 * We then descend that subtree in a binary search for the smallest fd. 93 * Finally, we ascend the tree again to increment the allocation count 94 * of every subtree containing the newly-allocated fd. Freeing an fd 95 * requires only the last step: we ascend the tree to decrement allocation 96 * counts. Each of these three steps (ascent to find non-full subtree, 97 * descent to find lowest fd, ascent to update allocation counts) is 98 * O(log n), thus the algorithm as a whole is O(log n). 99 * 100 * We don't implement the fd tree using the customary left/right/parent 101 * pointers, but instead take advantage of the glorious mathematics of 102 * full infix binary trees. For reference, here's an illustration of the 103 * logical structure of such a tree, rooted at 4 (binary 100), covering 104 * the range 1-7 (binary 001-111). Our canonical trees do not include 105 * fd 0; we'll deal with that later. 106 * 107 * 100 108 * / \ 109 * / \ 110 * 010 110 111 * / \ / \ 112 * 001 011 101 111 113 * 114 * We make the following observations, all of which are easily proven by 115 * induction on the depth of the tree: 116 * 117 * (T1) The least-significant bit (LSB) of any node is equal to its level 118 * in the tree. In our example, nodes 001, 011, 101 and 111 are at 119 * level 0; nodes 010 and 110 are at level 1; and node 100 is at level 2. 120 * 121 * (T2) The child size (CSIZE) of node N -- that is, the total number of 122 * right-branch descendants in a child of node N, including itself -- is 123 * given by clearing all but the least significant bit of N. This 124 * follows immediately from (T1). Applying this rule to our example, we 125 * see that CSIZE(100) = 100, CSIZE(x10) = 10, and CSIZE(xx1) = 1. 126 * 127 * (T3) The nearest left ancestor (LPARENT) of node N -- that is, the nearest 128 * ancestor containing node N in its right child -- is given by clearing 129 * the LSB of N. For example, LPARENT(111) = 110 and LPARENT(110) = 100. 130 * Clearing the LSB of nodes 001, 010 or 100 yields zero, reflecting 131 * the fact that these are leftmost nodes. Note that this algorithm 132 * automatically skips generations as necessary. For example, the parent 133 * of node 101 is 110, which is a *right* ancestor (not what we want); 134 * but its grandparent is 100, which is a left ancestor. Clearing the LSB 135 * of 101 gets us to 100 directly, skipping right past the uninteresting 136 * generation (110). 137 * 138 * Note that since LPARENT clears the LSB, whereas CSIZE clears all *but* 139 * the LSB, we can express LPARENT() nicely in terms of CSIZE(): 140 * 141 * LPARENT(N) = N - CSIZE(N) 142 * 143 * (T4) The nearest right ancestor (RPARENT) of node N is given by: 144 * 145 * RPARENT(N) = N + CSIZE(N) 146 * 147 * (T5) For every interior node, the children differ from their parent by 148 * CSIZE(parent) / 2. In our example, CSIZE(100) / 2 = 2 = 10 binary, 149 * and indeed, the children of 100 are 100 +/- 10 = 010 and 110. 150 * 151 * Next, we'll need a few two's-complement math tricks. Suppose a number, 152 * N, has the following form: 153 * 154 * N = xxxx10...0 155 * 156 * That is, the binary representation of N consists of some string of bits, 157 * then a 1, then all zeroes. This amounts to nothing more than saying that 158 * N has a least-significant bit, which is true for any N != 0. If we look 159 * at N and N - 1 together, we see that we can combine them in useful ways: 160 * 161 * N = xxxx10...0 162 * N - 1 = xxxx01...1 163 * ------------------------ 164 * N & (N - 1) = xxxx000000 165 * N | (N - 1) = xxxx111111 166 * N ^ (N - 1) = 111111 167 * 168 * In particular, this suggests several easy ways to clear all but the LSB, 169 * which by (T2) is exactly what we need to determine CSIZE(N) = 10...0. 170 * We'll opt for this formulation: 171 * 172 * (C1) CSIZE(N) = (N - 1) ^ (N | (N - 1)) 173 * 174 * Similarly, we have an easy way to determine LPARENT(N), which requires 175 * that we clear the LSB of N: 176 * 177 * (L1) LPARENT(N) = N & (N - 1) 178 * 179 * We note in the above relations that (N | (N - 1)) - N = CSIZE(N) - 1. 180 * When combined with (T4), this yields an easy way to compute RPARENT(N): 181 * 182 * (R1) RPARENT(N) = (N | (N - 1)) + 1 183 * 184 * Finally, to accommodate fd 0 we must adjust all of our results by +/-1 to 185 * move the fd range from [1, 2^n) to [0, 2^n - 1). This is straightforward, 186 * so there's no need to belabor the algebra; the revised relations become: 187 * 188 * (C1a) CSIZE(N) = N ^ (N | (N + 1)) 189 * 190 * (L1a) LPARENT(N) = (N & (N + 1)) - 1 191 * 192 * (R1a) RPARENT(N) = N | (N + 1) 193 * 194 * This completes the mathematical framework. We now have all the tools 195 * we need to implement fd_find() and fd_reserve(). 196 * 197 * fd_find(fip, minfd) finds the smallest available file descriptor >= minfd. 198 * It does not actually allocate the descriptor; that's done by fd_reserve(). 199 * fd_find() proceeds in two steps: 200 * 201 * (1) Find the leftmost subtree that contains a descriptor >= minfd. 202 * We start at the right subtree rooted at minfd. If this subtree is 203 * not full -- if fip->fi_list[minfd].uf_alloc != CSIZE(minfd) -- then 204 * step 1 is done. Otherwise, we know that all fds in this subtree 205 * are taken, so we ascend to RPARENT(minfd) using (R1a). We repeat 206 * this process until we either find a candidate subtree or exceed 207 * fip->fi_nfiles. We use (C1a) to compute CSIZE(). 208 * 209 * (2) Find the smallest fd in the subtree discovered by step 1. 210 * Starting at the root of this subtree, we descend to find the 211 * smallest available fd. Since the left children have the smaller 212 * fds, we will descend rightward only when the left child is full. 213 * 214 * We begin by comparing the number of allocated fds in the root 215 * to the number of allocated fds in its right child; if they differ 216 * by exactly CSIZE(child), we know the left subtree is full, so we 217 * descend right; that is, the right child becomes the search root. 218 * Otherwise we leave the root alone and start following the right 219 * child's left children. As fortune would have it, this is very 220 * simple computationally: by (T5), the right child of fd is just 221 * fd + size, where size = CSIZE(fd) / 2. Applying (T5) again, 222 * we find that the right child's left child is fd + size - (size / 2) = 223 * fd + (size / 2); *its* left child is fd + (size / 2) - (size / 4) = 224 * fd + (size / 4), and so on. In general, fd's right child's 225 * leftmost nth descendant is fd + (size >> n). Thus, to follow 226 * the right child's left descendants, we just halve the size in 227 * each iteration of the search. 228 * 229 * When we descend leftward, we must keep track of the number of fds 230 * that were allocated in all the right subtrees we rejected, so we 231 * know how many of the root fd's allocations are in the remaining 232 * (as yet unexplored) leftmost part of its right subtree. When we 233 * encounter a fully-allocated left child -- that is, when we find 234 * that fip->fi_list[fd].uf_alloc == ralloc + size -- we descend right 235 * (as described earlier), resetting ralloc to zero. 236 * 237 * fd_reserve(fip, fd, incr) either allocates or frees fd, depending 238 * on whether incr is 1 or -1. Starting at fd, fd_reserve() ascends 239 * the leftmost ancestors (see (T3)) and updates the allocation counts. 240 * At each step we use (L1a) to compute LPARENT(), the next left ancestor. 241 * 242 * flist_minsize() finds the minimal tree that still covers all 243 * used fds; as long as the allocation count of a root node is zero, we 244 * don't need that node or its right subtree. 245 * 246 * flist_nalloc() counts the number of allocated fds in the tree, by starting 247 * at the top of the tree and summing the right-subtree allocation counts as 248 * it descends leftwards. 249 * 250 * Note: we assume that flist_grow() will keep fip->fi_nfiles of the form 251 * 2^n - 1. This ensures that the fd trees are always full, which saves 252 * quite a bit of boundary checking. 253 */ 254 static int 255 fd_find(uf_info_t *fip, int minfd) 256 { 257 int size, ralloc, fd; 258 259 ASSERT(MUTEX_HELD(&fip->fi_lock)); 260 ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0); 261 262 for (fd = minfd; (uint_t)fd < fip->fi_nfiles; fd |= fd + 1) { 263 size = fd ^ (fd | (fd + 1)); 264 if (fip->fi_list[fd].uf_alloc == size) 265 continue; 266 for (ralloc = 0, size >>= 1; size != 0; size >>= 1) { 267 ralloc += fip->fi_list[fd + size].uf_alloc; 268 if (fip->fi_list[fd].uf_alloc == ralloc + size) { 269 fd += size; 270 ralloc = 0; 271 } 272 } 273 return (fd); 274 } 275 return (-1); 276 } 277 278 static void 279 fd_reserve(uf_info_t *fip, int fd, int incr) 280 { 281 int pfd; 282 uf_entry_t *ufp = &fip->fi_list[fd]; 283 284 ASSERT((uint_t)fd < fip->fi_nfiles); 285 ASSERT((ufp->uf_busy == 0 && incr == 1) || 286 (ufp->uf_busy == 1 && incr == -1)); 287 ASSERT(MUTEX_HELD(&ufp->uf_lock)); 288 ASSERT(MUTEX_HELD(&fip->fi_lock)); 289 290 for (pfd = fd; pfd >= 0; pfd = (pfd & (pfd + 1)) - 1) 291 fip->fi_list[pfd].uf_alloc += incr; 292 293 ufp->uf_busy += incr; 294 } 295 296 static int 297 flist_minsize(uf_info_t *fip) 298 { 299 int fd; 300 301 /* 302 * We'd like to ASSERT(MUTEX_HELD(&fip->fi_lock)), but we're called 303 * by flist_fork(), which relies on other mechanisms for mutual 304 * exclusion. 305 */ 306 ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0); 307 308 for (fd = fip->fi_nfiles; fd != 0; fd >>= 1) 309 if (fip->fi_list[fd >> 1].uf_alloc != 0) 310 break; 311 312 return (fd); 313 } 314 315 static int 316 flist_nalloc(uf_info_t *fip) 317 { 318 int fd; 319 int nalloc = 0; 320 321 ASSERT(MUTEX_HELD(&fip->fi_lock)); 322 ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0); 323 324 for (fd = fip->fi_nfiles; fd != 0; fd >>= 1) 325 nalloc += fip->fi_list[fd >> 1].uf_alloc; 326 327 return (nalloc); 328 } 329 330 /* 331 * Increase size of the fi_list array to accommodate at least maxfd. 332 * We keep the size of the form 2^n - 1 for benefit of fd_find(). 333 */ 334 static void 335 flist_grow(int maxfd) 336 { 337 uf_info_t *fip = P_FINFO(curproc); 338 int newcnt, oldcnt; 339 uf_entry_t *src, *dst, *newlist, *oldlist, *newend, *oldend; 340 uf_rlist_t *urp; 341 342 for (newcnt = 1; newcnt <= maxfd; newcnt = (newcnt << 1) | 1) 343 continue; 344 345 newlist = kmem_zalloc(newcnt * sizeof (uf_entry_t), KM_SLEEP); 346 347 mutex_enter(&fip->fi_lock); 348 oldcnt = fip->fi_nfiles; 349 if (newcnt <= oldcnt) { 350 mutex_exit(&fip->fi_lock); 351 kmem_free(newlist, newcnt * sizeof (uf_entry_t)); 352 return; 353 } 354 ASSERT((newcnt & (newcnt + 1)) == 0); 355 oldlist = fip->fi_list; 356 oldend = oldlist + oldcnt; 357 newend = newlist + oldcnt; /* no need to lock beyond old end */ 358 359 /* 360 * fi_list and fi_nfiles cannot change while any uf_lock is held, 361 * so we must grab all the old locks *and* the new locks up to oldcnt. 362 * (Locks beyond the end of oldcnt aren't visible until we store 363 * the new fi_nfiles, which is the last thing we do before dropping 364 * all the locks, so there's no need to acquire these locks). 365 * Holding the new locks is necessary because when fi_list changes 366 * to point to the new list, fi_nfiles won't have been stored yet. 367 * If we *didn't* hold the new locks, someone doing a UF_ENTER() 368 * could see the new fi_list, grab the new uf_lock, and then see 369 * fi_nfiles change while the lock is held -- in violation of 370 * UF_ENTER() semantics. 371 */ 372 for (src = oldlist; src < oldend; src++) 373 mutex_enter(&src->uf_lock); 374 375 for (dst = newlist; dst < newend; dst++) 376 mutex_enter(&dst->uf_lock); 377 378 for (src = oldlist, dst = newlist; src < oldend; src++, dst++) { 379 dst->uf_file = src->uf_file; 380 dst->uf_fpollinfo = src->uf_fpollinfo; 381 dst->uf_refcnt = src->uf_refcnt; 382 dst->uf_alloc = src->uf_alloc; 383 dst->uf_flag = src->uf_flag; 384 dst->uf_busy = src->uf_busy; 385 dst->uf_portfd = src->uf_portfd; 386 } 387 388 /* 389 * As soon as we store the new flist, future locking operations 390 * will use it. Therefore, we must ensure that all the state 391 * we've just established reaches global visibility before the 392 * new flist does. 393 */ 394 membar_producer(); 395 fip->fi_list = newlist; 396 397 /* 398 * Routines like getf() make an optimistic check on the validity 399 * of the supplied file descriptor: if it's less than the current 400 * value of fi_nfiles -- examined without any locks -- then it's 401 * safe to attempt a UF_ENTER() on that fd (which is a valid 402 * assumption because fi_nfiles only increases). Therefore, it 403 * is critical that the new value of fi_nfiles not reach global 404 * visibility until after the new fi_list: if it happened the 405 * other way around, getf() could see the new fi_nfiles and attempt 406 * a UF_ENTER() on the old fi_list, which would write beyond its 407 * end if the fd exceeded the old fi_nfiles. 408 */ 409 membar_producer(); 410 fip->fi_nfiles = newcnt; 411 412 /* 413 * The new state is consistent now, so we can drop all the locks. 414 */ 415 for (dst = newlist; dst < newend; dst++) 416 mutex_exit(&dst->uf_lock); 417 418 for (src = oldlist; src < oldend; src++) { 419 /* 420 * If any threads are blocked on the old cvs, wake them. 421 * This will force them to wake up, discover that fi_list 422 * has changed, and go back to sleep on the new cvs. 423 */ 424 cv_broadcast(&src->uf_wanted_cv); 425 cv_broadcast(&src->uf_closing_cv); 426 mutex_exit(&src->uf_lock); 427 } 428 429 mutex_exit(&fip->fi_lock); 430 431 /* 432 * Retire the old flist. We can't actually kmem_free() it now 433 * because someone may still have a pointer to it. Instead, 434 * we link it onto a list of retired flists. The new flist 435 * is at least double the size of the previous flist, so the 436 * total size of all retired flists will be less than the size 437 * of the current one (to prove, consider the sum of a geometric 438 * series in powers of 2). exit() frees the retired flists. 439 */ 440 urp = kmem_zalloc(sizeof (uf_rlist_t), KM_SLEEP); 441 urp->ur_list = oldlist; 442 urp->ur_nfiles = oldcnt; 443 444 mutex_enter(&fip->fi_lock); 445 urp->ur_next = fip->fi_rlist; 446 fip->fi_rlist = urp; 447 mutex_exit(&fip->fi_lock); 448 } 449 450 /* 451 * Utility functions for keeping track of the active file descriptors. 452 */ 453 void 454 clear_stale_fd() /* called from post_syscall() */ 455 { 456 afd_t *afd = &curthread->t_activefd; 457 int i; 458 459 /* uninitialized is ok here, a_nfd is then zero */ 460 for (i = 0; i < afd->a_nfd; i++) { 461 /* assert that this should not be necessary */ 462 ASSERT(afd->a_fd[i] == -1); 463 afd->a_fd[i] = -1; 464 } 465 afd->a_stale = 0; 466 } 467 468 void 469 free_afd(afd_t *afd) /* called below and from thread_free() */ 470 { 471 int i; 472 473 /* free the buffer if it was kmem_alloc()ed */ 474 if (afd->a_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) { 475 COUNT(afd_free); 476 kmem_free(afd->a_fd, afd->a_nfd * sizeof (afd->a_fd[0])); 477 } 478 479 /* (re)initialize the structure */ 480 afd->a_fd = &afd->a_buf[0]; 481 afd->a_nfd = sizeof (afd->a_buf) / sizeof (afd->a_buf[0]); 482 afd->a_stale = 0; 483 for (i = 0; i < afd->a_nfd; i++) 484 afd->a_fd[i] = -1; 485 } 486 487 static void 488 set_active_fd(int fd) 489 { 490 afd_t *afd = &curthread->t_activefd; 491 int i; 492 int *old_fd; 493 int old_nfd; 494 int *new_fd; 495 int new_nfd; 496 497 if (afd->a_nfd == 0) { /* first time initialization */ 498 ASSERT(fd == -1); 499 mutex_enter(&afd->a_fdlock); 500 free_afd(afd); 501 mutex_exit(&afd->a_fdlock); 502 } 503 504 /* insert fd into vacant slot, if any */ 505 for (i = 0; i < afd->a_nfd; i++) { 506 if (afd->a_fd[i] == -1) { 507 afd->a_fd[i] = fd; 508 return; 509 } 510 } 511 512 /* 513 * Reallocate the a_fd[] array to add one more slot. 514 */ 515 ASSERT(fd == -1); 516 old_nfd = afd->a_nfd; 517 old_fd = afd->a_fd; 518 new_nfd = old_nfd + 1; 519 new_fd = kmem_alloc(new_nfd * sizeof (afd->a_fd[0]), KM_SLEEP); 520 MAXFD(new_nfd); 521 COUNT(afd_alloc); 522 523 mutex_enter(&afd->a_fdlock); 524 afd->a_fd = new_fd; 525 afd->a_nfd = new_nfd; 526 for (i = 0; i < old_nfd; i++) 527 afd->a_fd[i] = old_fd[i]; 528 afd->a_fd[i] = fd; 529 mutex_exit(&afd->a_fdlock); 530 531 if (old_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) { 532 COUNT(afd_free); 533 kmem_free(old_fd, old_nfd * sizeof (afd->a_fd[0])); 534 } 535 } 536 537 void 538 clear_active_fd(int fd) /* called below and from aio.c */ 539 { 540 afd_t *afd = &curthread->t_activefd; 541 int i; 542 543 for (i = 0; i < afd->a_nfd; i++) { 544 if (afd->a_fd[i] == fd) { 545 afd->a_fd[i] = -1; 546 break; 547 } 548 } 549 ASSERT(i < afd->a_nfd); /* not found is not ok */ 550 } 551 552 /* 553 * Does this thread have this fd active? 554 */ 555 static int 556 is_active_fd(kthread_t *t, int fd) 557 { 558 afd_t *afd = &t->t_activefd; 559 int i; 560 561 ASSERT(t != curthread); 562 mutex_enter(&afd->a_fdlock); 563 /* uninitialized is ok here, a_nfd is then zero */ 564 for (i = 0; i < afd->a_nfd; i++) { 565 if (afd->a_fd[i] == fd) { 566 mutex_exit(&afd->a_fdlock); 567 return (1); 568 } 569 } 570 mutex_exit(&afd->a_fdlock); 571 return (0); 572 } 573 574 /* 575 * Convert a user supplied file descriptor into a pointer to a file 576 * structure. Only task is to check range of the descriptor (soft 577 * resource limit was enforced at open time and shouldn't be checked 578 * here). 579 */ 580 file_t * 581 getf(int fd) 582 { 583 uf_info_t *fip = P_FINFO(curproc); 584 uf_entry_t *ufp; 585 file_t *fp; 586 587 if ((uint_t)fd >= fip->fi_nfiles) 588 return (NULL); 589 590 /* 591 * Reserve a slot in the active fd array now so we can call 592 * set_active_fd(fd) for real below, while still inside UF_ENTER(). 593 */ 594 set_active_fd(-1); 595 596 UF_ENTER(ufp, fip, fd); 597 598 if ((fp = ufp->uf_file) == NULL) { 599 UF_EXIT(ufp); 600 601 if (fd == fip->fi_badfd && fip->fi_action > 0) 602 tsignal(curthread, fip->fi_action); 603 604 return (NULL); 605 } 606 ufp->uf_refcnt++; 607 608 set_active_fd(fd); /* record the active file descriptor */ 609 610 UF_EXIT(ufp); 611 612 return (fp); 613 } 614 615 /* 616 * Close whatever file currently occupies the file descriptor slot 617 * and install the new file, usually NULL, in the file descriptor slot. 618 * The close must complete before we release the file descriptor slot. 619 * If newfp != NULL we only return an error if we can't allocate the 620 * slot so the caller knows that it needs to free the filep; 621 * in the other cases we return the error number from closef(). 622 */ 623 int 624 closeandsetf(int fd, file_t *newfp) 625 { 626 proc_t *p = curproc; 627 uf_info_t *fip = P_FINFO(p); 628 uf_entry_t *ufp; 629 file_t *fp; 630 fpollinfo_t *fpip; 631 portfd_t *pfd; 632 int error; 633 634 if ((uint_t)fd >= fip->fi_nfiles) { 635 if (newfp == NULL) 636 return (EBADF); 637 flist_grow(fd); 638 } 639 640 if (newfp != NULL) { 641 /* 642 * If ufp is reserved but has no file pointer, it's in the 643 * transition between ufalloc() and setf(). We must wait 644 * for this transition to complete before assigning the 645 * new non-NULL file pointer. 646 */ 647 mutex_enter(&fip->fi_lock); 648 if (fd == fip->fi_badfd) { 649 mutex_exit(&fip->fi_lock); 650 if (fip->fi_action > 0) 651 tsignal(curthread, fip->fi_action); 652 return (EBADF); 653 } 654 UF_ENTER(ufp, fip, fd); 655 while (ufp->uf_busy && ufp->uf_file == NULL) { 656 mutex_exit(&fip->fi_lock); 657 cv_wait_stop(&ufp->uf_wanted_cv, &ufp->uf_lock, 250); 658 UF_EXIT(ufp); 659 mutex_enter(&fip->fi_lock); 660 UF_ENTER(ufp, fip, fd); 661 } 662 if ((fp = ufp->uf_file) == NULL) { 663 ASSERT(ufp->uf_fpollinfo == NULL); 664 ASSERT(ufp->uf_flag == 0); 665 fd_reserve(fip, fd, 1); 666 ufp->uf_file = newfp; 667 UF_EXIT(ufp); 668 mutex_exit(&fip->fi_lock); 669 return (0); 670 } 671 mutex_exit(&fip->fi_lock); 672 } else { 673 UF_ENTER(ufp, fip, fd); 674 if ((fp = ufp->uf_file) == NULL) { 675 UF_EXIT(ufp); 676 return (EBADF); 677 } 678 } 679 680 ASSERT(ufp->uf_busy); 681 ufp->uf_file = NULL; 682 ufp->uf_flag = 0; 683 684 /* 685 * If the file descriptor reference count is non-zero, then 686 * some other lwp in the process is performing system call 687 * activity on the file. To avoid blocking here for a long 688 * time (the other lwp might be in a long term sleep in its 689 * system call), we scan all other lwps in the process to 690 * find the ones with this fd as one of their active fds, 691 * set their a_stale flag, and set them running if they 692 * are in an interruptible sleep so they will emerge from 693 * their system calls immediately. post_syscall() will 694 * test the a_stale flag and set errno to EBADF. 695 */ 696 ASSERT(ufp->uf_refcnt == 0 || p->p_lwpcnt > 1); 697 if (ufp->uf_refcnt > 0) { 698 kthread_t *t; 699 700 /* 701 * We call sprlock_proc(p) to ensure that the thread 702 * list will not change while we are scanning it. 703 * To do this, we must drop ufp->uf_lock and then 704 * reacquire it (so we are not holding both p->p_lock 705 * and ufp->uf_lock at the same time). ufp->uf_lock 706 * must be held for is_active_fd() to be correct 707 * (set_active_fd() is called while holding ufp->uf_lock). 708 * 709 * This is a convoluted dance, but it is better than 710 * the old brute-force method of stopping every thread 711 * in the process by calling holdlwps(SHOLDFORK1). 712 */ 713 714 UF_EXIT(ufp); 715 COUNT(afd_wait); 716 717 mutex_enter(&p->p_lock); 718 sprlock_proc(p); 719 mutex_exit(&p->p_lock); 720 721 UF_ENTER(ufp, fip, fd); 722 ASSERT(ufp->uf_file == NULL); 723 724 if (ufp->uf_refcnt > 0) { 725 for (t = curthread->t_forw; 726 t != curthread; 727 t = t->t_forw) { 728 if (is_active_fd(t, fd)) { 729 thread_lock(t); 730 t->t_activefd.a_stale = 1; 731 t->t_post_sys = 1; 732 if (ISWAKEABLE(t)) 733 setrun_locked(t); 734 thread_unlock(t); 735 } 736 } 737 } 738 739 UF_EXIT(ufp); 740 741 mutex_enter(&p->p_lock); 742 sprunlock(p); 743 744 UF_ENTER(ufp, fip, fd); 745 ASSERT(ufp->uf_file == NULL); 746 } 747 748 /* 749 * Wait for other lwps to stop using this file descriptor. 750 */ 751 while (ufp->uf_refcnt > 0) { 752 cv_wait_stop(&ufp->uf_closing_cv, &ufp->uf_lock, 250); 753 /* 754 * cv_wait_stop() drops ufp->uf_lock, so the file list 755 * can change. Drop the lock on our (possibly) stale 756 * ufp and let UF_ENTER() find and lock the current ufp. 757 */ 758 UF_EXIT(ufp); 759 UF_ENTER(ufp, fip, fd); 760 } 761 762 #ifdef DEBUG 763 /* 764 * catch a watchfd on device's pollhead list but not on fpollinfo list 765 */ 766 if (ufp->uf_fpollinfo != NULL) 767 checkwfdlist(fp->f_vnode, ufp->uf_fpollinfo); 768 #endif /* DEBUG */ 769 770 /* 771 * We may need to cleanup some cached poll states in t_pollstate 772 * before the fd can be reused. It is important that we don't 773 * access a stale thread structure. We will do the cleanup in two 774 * phases to avoid deadlock and holding uf_lock for too long. 775 * In phase 1, hold the uf_lock and call pollblockexit() to set 776 * state in t_pollstate struct so that a thread does not exit on 777 * us. In phase 2, we drop the uf_lock and call pollcacheclean(). 778 */ 779 pfd = ufp->uf_portfd; 780 ufp->uf_portfd = NULL; 781 fpip = ufp->uf_fpollinfo; 782 ufp->uf_fpollinfo = NULL; 783 if (fpip != NULL) 784 pollblockexit(fpip); 785 UF_EXIT(ufp); 786 if (fpip != NULL) 787 pollcacheclean(fpip, fd); 788 if (pfd) 789 port_close_fd(pfd); 790 791 /* 792 * Keep the file descriptor entry reserved across the closef(). 793 */ 794 error = closef(fp); 795 796 setf(fd, newfp); 797 798 /* Only return closef() error when closing is all we do */ 799 return (newfp == NULL ? error : 0); 800 } 801 802 /* 803 * Decrement uf_refcnt; wakeup anyone waiting to close the file. 804 */ 805 void 806 releasef(int fd) 807 { 808 uf_info_t *fip = P_FINFO(curproc); 809 uf_entry_t *ufp; 810 811 UF_ENTER(ufp, fip, fd); 812 ASSERT(ufp->uf_refcnt > 0); 813 clear_active_fd(fd); /* clear the active file descriptor */ 814 if (--ufp->uf_refcnt == 0) 815 cv_broadcast(&ufp->uf_closing_cv); 816 UF_EXIT(ufp); 817 } 818 819 /* 820 * Identical to releasef() but can be called from another process. 821 */ 822 void 823 areleasef(int fd, uf_info_t *fip) 824 { 825 uf_entry_t *ufp; 826 827 UF_ENTER(ufp, fip, fd); 828 ASSERT(ufp->uf_refcnt > 0); 829 if (--ufp->uf_refcnt == 0) 830 cv_broadcast(&ufp->uf_closing_cv); 831 UF_EXIT(ufp); 832 } 833 834 /* 835 * Duplicate all file descriptors across a fork. 836 */ 837 void 838 flist_fork(uf_info_t *pfip, uf_info_t *cfip) 839 { 840 int fd, nfiles; 841 uf_entry_t *pufp, *cufp; 842 843 mutex_init(&cfip->fi_lock, NULL, MUTEX_DEFAULT, NULL); 844 cfip->fi_rlist = NULL; 845 846 /* 847 * We don't need to hold fi_lock because all other lwp's in the 848 * parent have been held. 849 */ 850 cfip->fi_nfiles = nfiles = flist_minsize(pfip); 851 852 cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); 853 854 for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; 855 fd++, pufp++, cufp++) { 856 cufp->uf_file = pufp->uf_file; 857 cufp->uf_alloc = pufp->uf_alloc; 858 cufp->uf_flag = pufp->uf_flag; 859 cufp->uf_busy = pufp->uf_busy; 860 if (pufp->uf_file == NULL) { 861 ASSERT(pufp->uf_flag == 0); 862 if (pufp->uf_busy) { 863 /* 864 * Grab locks to appease ASSERTs in fd_reserve 865 */ 866 mutex_enter(&cfip->fi_lock); 867 mutex_enter(&cufp->uf_lock); 868 fd_reserve(cfip, fd, -1); 869 mutex_exit(&cufp->uf_lock); 870 mutex_exit(&cfip->fi_lock); 871 } 872 } 873 } 874 } 875 876 /* 877 * Close all open file descriptors for the current process. 878 * This is only called from exit(), which is single-threaded, 879 * so we don't need any locking. 880 */ 881 void 882 closeall(uf_info_t *fip) 883 { 884 int fd; 885 file_t *fp; 886 uf_entry_t *ufp; 887 888 ufp = fip->fi_list; 889 for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) { 890 if ((fp = ufp->uf_file) != NULL) { 891 ufp->uf_file = NULL; 892 if (ufp->uf_portfd != NULL) { 893 portfd_t *pfd; 894 /* remove event port association */ 895 pfd = ufp->uf_portfd; 896 ufp->uf_portfd = NULL; 897 port_close_fd(pfd); 898 } 899 ASSERT(ufp->uf_fpollinfo == NULL); 900 (void) closef(fp); 901 } 902 } 903 904 kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t)); 905 fip->fi_list = NULL; 906 fip->fi_nfiles = 0; 907 while (fip->fi_rlist != NULL) { 908 uf_rlist_t *urp = fip->fi_rlist; 909 fip->fi_rlist = urp->ur_next; 910 kmem_free(urp->ur_list, urp->ur_nfiles * sizeof (uf_entry_t)); 911 kmem_free(urp, sizeof (uf_rlist_t)); 912 } 913 } 914 915 /* 916 * Internal form of close. Decrement reference count on file 917 * structure. Decrement reference count on the vnode following 918 * removal of the referencing file structure. 919 */ 920 int 921 closef(file_t *fp) 922 { 923 vnode_t *vp; 924 int error; 925 int count; 926 int flag; 927 offset_t offset; 928 929 /* 930 * audit close of file (may be exit) 931 */ 932 if (AU_AUDITING()) 933 audit_closef(fp); 934 ASSERT(MUTEX_NOT_HELD(&P_FINFO(curproc)->fi_lock)); 935 936 mutex_enter(&fp->f_tlock); 937 938 ASSERT(fp->f_count > 0); 939 940 count = fp->f_count--; 941 flag = fp->f_flag; 942 offset = fp->f_offset; 943 944 vp = fp->f_vnode; 945 946 error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); 947 948 if (count > 1) { 949 mutex_exit(&fp->f_tlock); 950 return (error); 951 } 952 ASSERT(fp->f_count == 0); 953 mutex_exit(&fp->f_tlock); 954 955 VN_RELE(vp); 956 /* 957 * deallocate resources to audit_data 958 */ 959 if (audit_active) 960 audit_unfalloc(fp); 961 crfree(fp->f_cred); 962 kmem_cache_free(file_cache, fp); 963 return (error); 964 } 965 966 /* 967 * This is a combination of ufalloc() and setf(). 968 */ 969 int 970 ufalloc_file(int start, file_t *fp) 971 { 972 proc_t *p = curproc; 973 uf_info_t *fip = P_FINFO(p); 974 int filelimit; 975 uf_entry_t *ufp; 976 int nfiles; 977 int fd; 978 979 /* 980 * Assertion is to convince the correctness of the following 981 * assignment for filelimit after casting to int. 982 */ 983 ASSERT(p->p_fno_ctl <= INT_MAX); 984 filelimit = (int)p->p_fno_ctl; 985 986 for (;;) { 987 mutex_enter(&fip->fi_lock); 988 fd = fd_find(fip, start); 989 if (fd >= 0 && fd == fip->fi_badfd) { 990 start = fd + 1; 991 mutex_exit(&fip->fi_lock); 992 continue; 993 } 994 if ((uint_t)fd < filelimit) 995 break; 996 if (fd >= filelimit) { 997 mutex_exit(&fip->fi_lock); 998 mutex_enter(&p->p_lock); 999 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 1000 p->p_rctls, p, RCA_SAFE); 1001 mutex_exit(&p->p_lock); 1002 return (-1); 1003 } 1004 /* fd_find() returned -1 */ 1005 nfiles = fip->fi_nfiles; 1006 mutex_exit(&fip->fi_lock); 1007 flist_grow(MAX(start, nfiles)); 1008 } 1009 1010 UF_ENTER(ufp, fip, fd); 1011 fd_reserve(fip, fd, 1); 1012 ASSERT(ufp->uf_file == NULL); 1013 ufp->uf_file = fp; 1014 UF_EXIT(ufp); 1015 mutex_exit(&fip->fi_lock); 1016 return (fd); 1017 } 1018 1019 /* 1020 * Allocate a user file descriptor greater than or equal to "start". 1021 */ 1022 int 1023 ufalloc(int start) 1024 { 1025 return (ufalloc_file(start, NULL)); 1026 } 1027 1028 /* 1029 * Check that a future allocation of count fds on proc p has a good 1030 * chance of succeeding. If not, do rctl processing as if we'd failed 1031 * the allocation. 1032 * 1033 * Our caller must guarantee that p cannot disappear underneath us. 1034 */ 1035 int 1036 ufcanalloc(proc_t *p, uint_t count) 1037 { 1038 uf_info_t *fip = P_FINFO(p); 1039 int filelimit; 1040 int current; 1041 1042 if (count == 0) 1043 return (1); 1044 1045 ASSERT(p->p_fno_ctl <= INT_MAX); 1046 filelimit = (int)p->p_fno_ctl; 1047 1048 mutex_enter(&fip->fi_lock); 1049 current = flist_nalloc(fip); /* # of in-use descriptors */ 1050 mutex_exit(&fip->fi_lock); 1051 1052 /* 1053 * If count is a positive integer, the worst that can happen is 1054 * an overflow to a negative value, which is caught by the >= 0 check. 1055 */ 1056 current += count; 1057 if (count <= INT_MAX && current >= 0 && current <= filelimit) 1058 return (1); 1059 1060 mutex_enter(&p->p_lock); 1061 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 1062 p->p_rctls, p, RCA_SAFE); 1063 mutex_exit(&p->p_lock); 1064 return (0); 1065 } 1066 1067 /* 1068 * Allocate a user file descriptor and a file structure. 1069 * Initialize the descriptor to point at the file structure. 1070 * If fdp is NULL, the user file descriptor will not be allocated. 1071 */ 1072 int 1073 falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp) 1074 { 1075 file_t *fp; 1076 int fd; 1077 1078 if (fdp) { 1079 if ((fd = ufalloc(0)) == -1) 1080 return (EMFILE); 1081 } 1082 fp = kmem_cache_alloc(file_cache, KM_SLEEP); 1083 /* 1084 * Note: falloc returns the fp locked 1085 */ 1086 mutex_enter(&fp->f_tlock); 1087 fp->f_count = 1; 1088 fp->f_flag = (ushort_t)flag; 1089 fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16; 1090 fp->f_vnode = vp; 1091 fp->f_offset = 0; 1092 fp->f_audit_data = 0; 1093 crhold(fp->f_cred = CRED()); 1094 /* 1095 * allocate resources to audit_data 1096 */ 1097 if (audit_active) 1098 audit_falloc(fp); 1099 *fpp = fp; 1100 if (fdp) 1101 *fdp = fd; 1102 return (0); 1103 } 1104 1105 /*ARGSUSED*/ 1106 static int 1107 file_cache_constructor(void *buf, void *cdrarg, int kmflags) 1108 { 1109 file_t *fp = buf; 1110 1111 mutex_init(&fp->f_tlock, NULL, MUTEX_DEFAULT, NULL); 1112 return (0); 1113 } 1114 1115 /*ARGSUSED*/ 1116 static void 1117 file_cache_destructor(void *buf, void *cdrarg) 1118 { 1119 file_t *fp = buf; 1120 1121 mutex_destroy(&fp->f_tlock); 1122 } 1123 1124 void 1125 finit() 1126 { 1127 file_cache = kmem_cache_create("file_cache", sizeof (file_t), 0, 1128 file_cache_constructor, file_cache_destructor, NULL, NULL, NULL, 0); 1129 } 1130 1131 void 1132 unfalloc(file_t *fp) 1133 { 1134 ASSERT(MUTEX_HELD(&fp->f_tlock)); 1135 if (--fp->f_count <= 0) { 1136 /* 1137 * deallocate resources to audit_data 1138 */ 1139 if (audit_active) 1140 audit_unfalloc(fp); 1141 crfree(fp->f_cred); 1142 mutex_exit(&fp->f_tlock); 1143 kmem_cache_free(file_cache, fp); 1144 } else 1145 mutex_exit(&fp->f_tlock); 1146 } 1147 1148 /* 1149 * Given a file descriptor, set the user's 1150 * file pointer to the given parameter. 1151 */ 1152 void 1153 setf(int fd, file_t *fp) 1154 { 1155 uf_info_t *fip = P_FINFO(curproc); 1156 uf_entry_t *ufp; 1157 1158 if (AU_AUDITING()) 1159 audit_setf(fp, fd); 1160 1161 if (fp == NULL) { 1162 mutex_enter(&fip->fi_lock); 1163 UF_ENTER(ufp, fip, fd); 1164 fd_reserve(fip, fd, -1); 1165 mutex_exit(&fip->fi_lock); 1166 } else { 1167 UF_ENTER(ufp, fip, fd); 1168 ASSERT(ufp->uf_busy); 1169 } 1170 ASSERT(ufp->uf_fpollinfo == NULL); 1171 ASSERT(ufp->uf_flag == 0); 1172 ufp->uf_file = fp; 1173 cv_broadcast(&ufp->uf_wanted_cv); 1174 UF_EXIT(ufp); 1175 } 1176 1177 /* 1178 * Given a file descriptor, return the file table flags, plus, 1179 * if this is a socket in asynchronous mode, the FASYNC flag. 1180 * getf() may or may not have been called before calling f_getfl(). 1181 */ 1182 int 1183 f_getfl(int fd, int *flagp) 1184 { 1185 uf_info_t *fip = P_FINFO(curproc); 1186 uf_entry_t *ufp; 1187 file_t *fp; 1188 int error; 1189 1190 if ((uint_t)fd >= fip->fi_nfiles) 1191 error = EBADF; 1192 else { 1193 UF_ENTER(ufp, fip, fd); 1194 if ((fp = ufp->uf_file) == NULL) 1195 error = EBADF; 1196 else { 1197 vnode_t *vp = fp->f_vnode; 1198 int flag = fp->f_flag | (fp->f_flag2 << 16); 1199 1200 /* 1201 * BSD fcntl() FASYNC compatibility. 1202 */ 1203 if (vp->v_type == VSOCK) 1204 flag |= sock_getfasync(vp); 1205 *flagp = flag; 1206 error = 0; 1207 } 1208 UF_EXIT(ufp); 1209 } 1210 1211 return (error); 1212 } 1213 1214 /* 1215 * Given a file descriptor, return the user's file flags. 1216 * Force the FD_CLOEXEC flag for writable self-open /proc files. 1217 * getf() may or may not have been called before calling f_getfd_error(). 1218 */ 1219 int 1220 f_getfd_error(int fd, int *flagp) 1221 { 1222 uf_info_t *fip = P_FINFO(curproc); 1223 uf_entry_t *ufp; 1224 file_t *fp; 1225 int flag; 1226 int error; 1227 1228 if ((uint_t)fd >= fip->fi_nfiles) 1229 error = EBADF; 1230 else { 1231 UF_ENTER(ufp, fip, fd); 1232 if ((fp = ufp->uf_file) == NULL) 1233 error = EBADF; 1234 else { 1235 flag = ufp->uf_flag; 1236 if ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode)) 1237 flag |= FD_CLOEXEC; 1238 *flagp = flag; 1239 error = 0; 1240 } 1241 UF_EXIT(ufp); 1242 } 1243 1244 return (error); 1245 } 1246 1247 /* 1248 * getf() must have been called before calling f_getfd(). 1249 */ 1250 char 1251 f_getfd(int fd) 1252 { 1253 int flag = 0; 1254 (void) f_getfd_error(fd, &flag); 1255 return ((char)flag); 1256 } 1257 1258 /* 1259 * Given a file descriptor and file flags, set the user's file flags. 1260 * At present, the only valid flag is FD_CLOEXEC. 1261 * getf() may or may not have been called before calling f_setfd_error(). 1262 */ 1263 int 1264 f_setfd_error(int fd, int flags) 1265 { 1266 uf_info_t *fip = P_FINFO(curproc); 1267 uf_entry_t *ufp; 1268 int error; 1269 1270 if ((uint_t)fd >= fip->fi_nfiles) 1271 error = EBADF; 1272 else { 1273 UF_ENTER(ufp, fip, fd); 1274 if (ufp->uf_file == NULL) 1275 error = EBADF; 1276 else { 1277 ufp->uf_flag = flags & FD_CLOEXEC; 1278 error = 0; 1279 } 1280 UF_EXIT(ufp); 1281 } 1282 return (error); 1283 } 1284 1285 void 1286 f_setfd(int fd, char flags) 1287 { 1288 (void) f_setfd_error(fd, flags); 1289 } 1290 1291 #define BADFD_MIN 3 1292 #define BADFD_MAX 255 1293 1294 /* 1295 * Attempt to allocate a file descriptor which is bad and which 1296 * is "poison" to the application. It cannot be closed (except 1297 * on exec), allocated for a different use, etc. 1298 */ 1299 int 1300 f_badfd(int start, int *fdp, int action) 1301 { 1302 int fdr; 1303 int badfd; 1304 uf_info_t *fip = P_FINFO(curproc); 1305 1306 #ifdef _LP64 1307 /* No restrictions on 64 bit _file */ 1308 if (get_udatamodel() != DATAMODEL_ILP32) 1309 return (EINVAL); 1310 #endif 1311 1312 if (start > BADFD_MAX || start < BADFD_MIN) 1313 return (EINVAL); 1314 1315 if (action >= NSIG || action < 0) 1316 return (EINVAL); 1317 1318 mutex_enter(&fip->fi_lock); 1319 badfd = fip->fi_badfd; 1320 mutex_exit(&fip->fi_lock); 1321 1322 if (badfd != -1) 1323 return (EAGAIN); 1324 1325 fdr = ufalloc(start); 1326 1327 if (fdr > BADFD_MAX) { 1328 setf(fdr, NULL); 1329 return (EMFILE); 1330 } 1331 if (fdr < 0) 1332 return (EMFILE); 1333 1334 mutex_enter(&fip->fi_lock); 1335 if (fip->fi_badfd != -1) { 1336 /* Lost race */ 1337 mutex_exit(&fip->fi_lock); 1338 setf(fdr, NULL); 1339 return (EAGAIN); 1340 } 1341 fip->fi_action = action; 1342 fip->fi_badfd = fdr; 1343 mutex_exit(&fip->fi_lock); 1344 setf(fdr, NULL); 1345 1346 *fdp = fdr; 1347 1348 return (0); 1349 } 1350 1351 /* 1352 * Allocate a file descriptor and assign it to the vnode "*vpp", 1353 * performing the usual open protocol upon it and returning the 1354 * file descriptor allocated. It is the responsibility of the 1355 * caller to dispose of "*vpp" if any error occurs. 1356 */ 1357 int 1358 fassign(vnode_t **vpp, int mode, int *fdp) 1359 { 1360 file_t *fp; 1361 int error; 1362 int fd; 1363 1364 if (error = falloc((vnode_t *)NULL, mode, &fp, &fd)) 1365 return (error); 1366 if (error = VOP_OPEN(vpp, mode, fp->f_cred, NULL)) { 1367 setf(fd, NULL); 1368 unfalloc(fp); 1369 return (error); 1370 } 1371 fp->f_vnode = *vpp; 1372 mutex_exit(&fp->f_tlock); 1373 /* 1374 * Fill in the slot falloc reserved. 1375 */ 1376 setf(fd, fp); 1377 *fdp = fd; 1378 return (0); 1379 } 1380 1381 /* 1382 * When a process forks it must increment the f_count of all file pointers 1383 * since there is a new process pointing at them. fcnt_add(fip, 1) does this. 1384 * Since we are called when there is only 1 active lwp we don't need to 1385 * hold fi_lock or any uf_lock. If the fork fails, fork_fail() calls 1386 * fcnt_add(fip, -1) to restore the counts. 1387 */ 1388 void 1389 fcnt_add(uf_info_t *fip, int incr) 1390 { 1391 int i; 1392 uf_entry_t *ufp; 1393 file_t *fp; 1394 1395 ufp = fip->fi_list; 1396 for (i = 0; i < fip->fi_nfiles; i++, ufp++) { 1397 if ((fp = ufp->uf_file) != NULL) { 1398 mutex_enter(&fp->f_tlock); 1399 ASSERT((incr == 1 && fp->f_count >= 1) || 1400 (incr == -1 && fp->f_count >= 2)); 1401 fp->f_count += incr; 1402 mutex_exit(&fp->f_tlock); 1403 } 1404 } 1405 } 1406 1407 /* 1408 * This is called from exec to close all fd's that have the FD_CLOEXEC flag 1409 * set and also to close all self-open for write /proc file descriptors. 1410 */ 1411 void 1412 close_exec(uf_info_t *fip) 1413 { 1414 int fd; 1415 file_t *fp; 1416 fpollinfo_t *fpip; 1417 uf_entry_t *ufp; 1418 portfd_t *pfd; 1419 1420 ufp = fip->fi_list; 1421 for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) { 1422 if ((fp = ufp->uf_file) != NULL && 1423 ((ufp->uf_flag & FD_CLOEXEC) || 1424 ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode)))) { 1425 fpip = ufp->uf_fpollinfo; 1426 mutex_enter(&fip->fi_lock); 1427 mutex_enter(&ufp->uf_lock); 1428 fd_reserve(fip, fd, -1); 1429 mutex_exit(&fip->fi_lock); 1430 ufp->uf_file = NULL; 1431 ufp->uf_fpollinfo = NULL; 1432 ufp->uf_flag = 0; 1433 /* 1434 * We may need to cleanup some cached poll states 1435 * in t_pollstate before the fd can be reused. It 1436 * is important that we don't access a stale thread 1437 * structure. We will do the cleanup in two 1438 * phases to avoid deadlock and holding uf_lock for 1439 * too long. In phase 1, hold the uf_lock and call 1440 * pollblockexit() to set state in t_pollstate struct 1441 * so that a thread does not exit on us. In phase 2, 1442 * we drop the uf_lock and call pollcacheclean(). 1443 */ 1444 pfd = ufp->uf_portfd; 1445 ufp->uf_portfd = NULL; 1446 if (fpip != NULL) 1447 pollblockexit(fpip); 1448 mutex_exit(&ufp->uf_lock); 1449 if (fpip != NULL) 1450 pollcacheclean(fpip, fd); 1451 if (pfd) 1452 port_close_fd(pfd); 1453 (void) closef(fp); 1454 } 1455 } 1456 1457 /* Reset bad fd */ 1458 fip->fi_badfd = -1; 1459 fip->fi_action = -1; 1460 } 1461 1462 /* 1463 * Utility function called by most of the *at() system call interfaces. 1464 * 1465 * Generate a starting vnode pointer for an (fd, path) pair where 'fd' 1466 * is an open file descriptor for a directory to be used as the starting 1467 * point for the lookup of the relative pathname 'path' (or, if path is 1468 * NULL, generate a vnode pointer for the direct target of the operation). 1469 * 1470 * If we successfully return a non-NULL startvp, it has been the target 1471 * of VN_HOLD() and the caller must call VN_RELE() on it. 1472 */ 1473 int 1474 fgetstartvp(int fd, char *path, vnode_t **startvpp) 1475 { 1476 vnode_t *startvp; 1477 file_t *startfp; 1478 char startchar; 1479 1480 if (fd == AT_FDCWD && path == NULL) 1481 return (EFAULT); 1482 1483 if (fd == AT_FDCWD) { 1484 /* 1485 * Start from the current working directory. 1486 */ 1487 startvp = NULL; 1488 } else { 1489 if (path == NULL) 1490 startchar = '\0'; 1491 else if (copyin(path, &startchar, sizeof (char))) 1492 return (EFAULT); 1493 1494 if (startchar == '/') { 1495 /* 1496 * 'path' is an absolute pathname. 1497 */ 1498 startvp = NULL; 1499 } else { 1500 /* 1501 * 'path' is a relative pathname or we will 1502 * be applying the operation to 'fd' itself. 1503 */ 1504 if ((startfp = getf(fd)) == NULL) 1505 return (EBADF); 1506 startvp = startfp->f_vnode; 1507 VN_HOLD(startvp); 1508 releasef(fd); 1509 } 1510 } 1511 *startvpp = startvp; 1512 return (0); 1513 } 1514 1515 /* 1516 * Called from fchownat() and fchmodat() to set ownership and mode. 1517 * The contents of *vap must be set before calling here. 1518 */ 1519 int 1520 fsetattrat(int fd, char *path, int flags, struct vattr *vap) 1521 { 1522 vnode_t *startvp; 1523 vnode_t *vp; 1524 int error; 1525 1526 /* 1527 * Since we are never called to set the size of a file, we don't 1528 * need to check for non-blocking locks (via nbl_need_check(vp)). 1529 */ 1530 ASSERT(!(vap->va_mask & AT_SIZE)); 1531 1532 if ((error = fgetstartvp(fd, path, &startvp)) != 0) 1533 return (error); 1534 if (AU_AUDITING() && startvp != NULL) 1535 audit_setfsat_path(1); 1536 1537 /* 1538 * Do lookup for fchownat/fchmodat when path not NULL 1539 */ 1540 if (path != NULL) { 1541 if (error = lookupnameat(path, UIO_USERSPACE, 1542 (flags == AT_SYMLINK_NOFOLLOW) ? 1543 NO_FOLLOW : FOLLOW, 1544 NULLVPP, &vp, startvp)) { 1545 if (startvp != NULL) 1546 VN_RELE(startvp); 1547 return (error); 1548 } 1549 } else { 1550 vp = startvp; 1551 ASSERT(vp); 1552 VN_HOLD(vp); 1553 } 1554 1555 if (vn_is_readonly(vp)) { 1556 error = EROFS; 1557 } else { 1558 error = VOP_SETATTR(vp, vap, 0, CRED(), NULL); 1559 } 1560 1561 if (startvp != NULL) 1562 VN_RELE(startvp); 1563 VN_RELE(vp); 1564 1565 return (error); 1566 } 1567 1568 /* 1569 * Return true if the given vnode is referenced by any 1570 * entry in the current process's file descriptor table. 1571 */ 1572 int 1573 fisopen(vnode_t *vp) 1574 { 1575 int fd; 1576 file_t *fp; 1577 vnode_t *ovp; 1578 uf_info_t *fip = P_FINFO(curproc); 1579 uf_entry_t *ufp; 1580 1581 mutex_enter(&fip->fi_lock); 1582 for (fd = 0; fd < fip->fi_nfiles; fd++) { 1583 UF_ENTER(ufp, fip, fd); 1584 if ((fp = ufp->uf_file) != NULL && 1585 (ovp = fp->f_vnode) != NULL && VN_CMP(vp, ovp)) { 1586 UF_EXIT(ufp); 1587 mutex_exit(&fip->fi_lock); 1588 return (1); 1589 } 1590 UF_EXIT(ufp); 1591 } 1592 mutex_exit(&fip->fi_lock); 1593 return (0); 1594 } 1595 1596 /* 1597 * Return zero if at least one file currently open (by curproc) shouldn't be 1598 * allowed to change zones. 1599 */ 1600 int 1601 files_can_change_zones(void) 1602 { 1603 int fd; 1604 file_t *fp; 1605 uf_info_t *fip = P_FINFO(curproc); 1606 uf_entry_t *ufp; 1607 1608 mutex_enter(&fip->fi_lock); 1609 for (fd = 0; fd < fip->fi_nfiles; fd++) { 1610 UF_ENTER(ufp, fip, fd); 1611 if ((fp = ufp->uf_file) != NULL && 1612 !vn_can_change_zones(fp->f_vnode)) { 1613 UF_EXIT(ufp); 1614 mutex_exit(&fip->fi_lock); 1615 return (0); 1616 } 1617 UF_EXIT(ufp); 1618 } 1619 mutex_exit(&fip->fi_lock); 1620 return (1); 1621 } 1622 1623 #ifdef DEBUG 1624 1625 /* 1626 * The following functions are only used in ASSERT()s elsewhere. 1627 * They do not modify the state of the system. 1628 */ 1629 1630 /* 1631 * Return true (1) if the current thread is in the fpollinfo 1632 * list for this file descriptor, else false (0). 1633 */ 1634 static int 1635 curthread_in_plist(uf_entry_t *ufp) 1636 { 1637 fpollinfo_t *fpip; 1638 1639 ASSERT(MUTEX_HELD(&ufp->uf_lock)); 1640 for (fpip = ufp->uf_fpollinfo; fpip; fpip = fpip->fp_next) 1641 if (fpip->fp_thread == curthread) 1642 return (1); 1643 return (0); 1644 } 1645 1646 /* 1647 * Sanity check to make sure that after lwp_exit(), 1648 * curthread does not appear on any fd's fpollinfo list. 1649 */ 1650 void 1651 checkfpollinfo(void) 1652 { 1653 int fd; 1654 uf_info_t *fip = P_FINFO(curproc); 1655 uf_entry_t *ufp; 1656 1657 mutex_enter(&fip->fi_lock); 1658 for (fd = 0; fd < fip->fi_nfiles; fd++) { 1659 UF_ENTER(ufp, fip, fd); 1660 ASSERT(!curthread_in_plist(ufp)); 1661 UF_EXIT(ufp); 1662 } 1663 mutex_exit(&fip->fi_lock); 1664 } 1665 1666 /* 1667 * Return true (1) if the current thread is in the fpollinfo 1668 * list for this file descriptor, else false (0). 1669 * This is the same as curthread_in_plist(), 1670 * but is called w/o holding uf_lock. 1671 */ 1672 int 1673 infpollinfo(int fd) 1674 { 1675 uf_info_t *fip = P_FINFO(curproc); 1676 uf_entry_t *ufp; 1677 int rc; 1678 1679 UF_ENTER(ufp, fip, fd); 1680 rc = curthread_in_plist(ufp); 1681 UF_EXIT(ufp); 1682 return (rc); 1683 } 1684 1685 #endif /* DEBUG */ 1686 1687 /* 1688 * Add the curthread to fpollinfo list, meaning this fd is currently in the 1689 * thread's poll cache. Each lwp polling this file descriptor should call 1690 * this routine once. 1691 */ 1692 void 1693 addfpollinfo(int fd) 1694 { 1695 struct uf_entry *ufp; 1696 fpollinfo_t *fpip; 1697 uf_info_t *fip = P_FINFO(curproc); 1698 1699 fpip = kmem_zalloc(sizeof (fpollinfo_t), KM_SLEEP); 1700 fpip->fp_thread = curthread; 1701 UF_ENTER(ufp, fip, fd); 1702 /* 1703 * Assert we are not already on the list, that is, that 1704 * this lwp did not call addfpollinfo twice for the same fd. 1705 */ 1706 ASSERT(!curthread_in_plist(ufp)); 1707 /* 1708 * addfpollinfo is always done inside the getf/releasef pair. 1709 */ 1710 ASSERT(ufp->uf_refcnt >= 1); 1711 fpip->fp_next = ufp->uf_fpollinfo; 1712 ufp->uf_fpollinfo = fpip; 1713 UF_EXIT(ufp); 1714 } 1715 1716 /* 1717 * Delete curthread from fpollinfo list if it is there. 1718 */ 1719 void 1720 delfpollinfo(int fd) 1721 { 1722 struct uf_entry *ufp; 1723 struct fpollinfo *fpip; 1724 struct fpollinfo **fpipp; 1725 uf_info_t *fip = P_FINFO(curproc); 1726 1727 UF_ENTER(ufp, fip, fd); 1728 for (fpipp = &ufp->uf_fpollinfo; 1729 (fpip = *fpipp) != NULL; 1730 fpipp = &fpip->fp_next) { 1731 if (fpip->fp_thread == curthread) { 1732 *fpipp = fpip->fp_next; 1733 kmem_free(fpip, sizeof (fpollinfo_t)); 1734 break; 1735 } 1736 } 1737 /* 1738 * Assert that we are not still on the list, that is, that 1739 * this lwp did not call addfpollinfo twice for the same fd. 1740 */ 1741 ASSERT(!curthread_in_plist(ufp)); 1742 UF_EXIT(ufp); 1743 } 1744 1745 /* 1746 * fd is associated with a port. pfd is a pointer to the fd entry in the 1747 * cache of the port. 1748 */ 1749 1750 void 1751 addfd_port(int fd, portfd_t *pfd) 1752 { 1753 struct uf_entry *ufp; 1754 uf_info_t *fip = P_FINFO(curproc); 1755 1756 UF_ENTER(ufp, fip, fd); 1757 /* 1758 * addfd_port is always done inside the getf/releasef pair. 1759 */ 1760 ASSERT(ufp->uf_refcnt >= 1); 1761 if (ufp->uf_portfd == NULL) { 1762 /* first entry */ 1763 ufp->uf_portfd = pfd; 1764 pfd->pfd_next = NULL; 1765 } else { 1766 pfd->pfd_next = ufp->uf_portfd; 1767 ufp->uf_portfd = pfd; 1768 pfd->pfd_next->pfd_prev = pfd; 1769 } 1770 UF_EXIT(ufp); 1771 } 1772 1773 void 1774 delfd_port(int fd, portfd_t *pfd) 1775 { 1776 struct uf_entry *ufp; 1777 uf_info_t *fip = P_FINFO(curproc); 1778 1779 UF_ENTER(ufp, fip, fd); 1780 /* 1781 * delfd_port is always done inside the getf/releasef pair. 1782 */ 1783 ASSERT(ufp->uf_refcnt >= 1); 1784 if (ufp->uf_portfd == pfd) { 1785 /* remove first entry */ 1786 ufp->uf_portfd = pfd->pfd_next; 1787 } else { 1788 pfd->pfd_prev->pfd_next = pfd->pfd_next; 1789 if (pfd->pfd_next != NULL) 1790 pfd->pfd_next->pfd_prev = pfd->pfd_prev; 1791 } 1792 UF_EXIT(ufp); 1793 } 1794 1795 static void 1796 port_close_fd(portfd_t *pfd) 1797 { 1798 portfd_t *pfdn; 1799 1800 /* 1801 * At this point, no other thread should access 1802 * the portfd_t list for this fd. The uf_file, uf_portfd 1803 * pointers in the uf_entry_t struct for this fd would 1804 * be set to NULL. 1805 */ 1806 for (; pfd != NULL; pfd = pfdn) { 1807 pfdn = pfd->pfd_next; 1808 port_close_pfd(pfd); 1809 } 1810 } 1811