/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/conf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/var.h>
#include <sys/cpuvar.h>
#include <sys/open.h>
#include <sys/cmn_err.h>
#include <sys/priocntl.h>
#include <sys/procset.h>
#include <sys/prsystm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <sys/poll.h>
#include <sys/rctl.h>
#include <sys/port_impl.h>

#include <c2/audit.h>
#include <sys/nbmlock.h>

#ifdef DEBUG

static uint32_t afd_maxfd;	/* # of entries in maximum allocated array */
static uint32_t afd_alloc;	/* count of kmem_alloc()s */
static uint32_t afd_free;	/* count of kmem_free()s */
static uint32_t afd_wait;	/* count of waits on non-zero ref count */
#define	MAXFD(x)	(afd_maxfd = ((afd_maxfd >= (x))? afd_maxfd : (x)))
#define	COUNT(x)	atomic_add_32(&x, 1)

#else	/* DEBUG */

#define	MAXFD(x)
#define	COUNT(x)

#endif	/* DEBUG */

kmem_cache_t *file_cache;

static int vpsetattr(vnode_t *, vattr_t *, int);

static void port_close_fd(portfd_t *);

/*
 * File descriptor allocation.
 *
 * fd_find(fip, minfd) finds the first available descriptor >= minfd.
 * The most common case is open(2), in which minfd = 0, but we must also
 * support fcntl(fd, F_DUPFD, minfd).
 *
 * The algorithm is as follows: we keep all file descriptors in an infix
 * binary tree in which each node records the number of descriptors
 * allocated in its right subtree, including itself.  Starting at minfd,
 * we ascend the tree until we find a non-fully allocated right subtree.
 * We then descend that subtree in a binary search for the smallest fd.
 * Finally, we ascend the tree again to increment the allocation count
 * of every subtree containing the newly-allocated fd.  Freeing an fd
 * requires only the last step: we ascend the tree to decrement allocation
 * counts.  Each of these three steps (ascent to find non-full subtree,
 * descent to find lowest fd, ascent to update allocation counts) is
 * O(log n), thus the algorithm as a whole is O(log n).
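 *
 * As a concrete preview (an illustrative allocation pattern, not data
 * from the code), consider the example tree drawn below (fds 1-7,
 * 1-based for the moment) with fds 1, 2 and 3 allocated.  Each node's
 * count of "descriptors allocated in its right subtree, including
 * itself" would be:
 *
 *	node 001: 1   node 010: 2   node 011: 1   node 100: 0
 *
 * (fds 1-3 all lie in 100's *left* subtree, so 100's count is 0.)
 * Starting at minfd = 1, the ascent finds 001 full (1 of 1) and then
 * 010 full (2 of 2), and stops at 100 (0 of 4); the descent discovers
 * that 100 itself is the smallest free fd; and the final ascent
 * increments the count at 100 only, since 100 has no left ancestors.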
 *
 * We don't implement the fd tree using the customary left/right/parent
 * pointers, but instead take advantage of the glorious mathematics of
 * full infix binary trees.  For reference, here's an illustration of the
 * logical structure of such a tree, rooted at 4 (binary 100), covering
 * the range 1-7 (binary 001-111).  Our canonical trees do not include
 * fd 0; we'll deal with that later.
 *
 *	       100
 *	      /   \
 *	     /     \
 *	   010     110
 *	   / \     / \
 *	 001 011 101 111
 *
 * We make the following observations, all of which are easily proven by
 * induction on the depth of the tree:
 *
 * (T1) The least-significant bit (LSB) of any node is equal to its level
 *	in the tree.  In our example, nodes 001, 011, 101 and 111 are at
 *	level 0; nodes 010 and 110 are at level 1; and node 100 is at
 *	level 2.
 *
 * (T2) The child size (CSIZE) of node N -- that is, the total number of
 *	right-branch descendants in a child of node N, including itself -- is
 *	given by clearing all but the least significant bit of N.  This
 *	follows immediately from (T1).  Applying this rule to our example, we
 *	see that CSIZE(100) = 100, CSIZE(x10) = 10, and CSIZE(xx1) = 1.
 *
 * (T3) The nearest left ancestor (LPARENT) of node N -- that is, the nearest
 *	ancestor containing node N in its right child -- is given by clearing
 *	the LSB of N.  For example, LPARENT(111) = 110 and LPARENT(110) = 100.
 *	Clearing the LSB of nodes 001, 010 or 100 yields zero, reflecting
 *	the fact that these are leftmost nodes.  Note that this algorithm
 *	automatically skips generations as necessary.  For example, the parent
 *	of node 101 is 110, which is a *right* ancestor (not what we want);
 *	but its grandparent is 100, which is a left ancestor.  Clearing the
 *	LSB of 101 gets us to 100 directly, skipping right past the
 *	uninteresting generation (110).
 *
 *	Note that since LPARENT clears the LSB, whereas CSIZE clears all *but*
 *	the LSB, we can express LPARENT() nicely in terms of CSIZE():
 *
 *	LPARENT(N) = N - CSIZE(N)
 *
 * (T4) The nearest right ancestor (RPARENT) of node N is given by:
 *
 *	RPARENT(N) = N + CSIZE(N)
 *
 * (T5) For every interior node, the children differ from their parent by
 *	CSIZE(parent) / 2.  In our example, CSIZE(100) / 2 = 2 = 10 binary,
 *	and indeed, the children of 100 are 100 +/- 10 = 010 and 110.
 *
 * Next, we'll need a few two's-complement math tricks.  Suppose a number,
 * N, has the following form:
 *
 *	N = xxxx10...0
 *
 * That is, the binary representation of N consists of some string of bits,
 * then a 1, then all zeroes.  This amounts to nothing more than saying that
 * N has a least-significant bit, which is true for any N != 0.  If we look
 * at N and N - 1 together, we see that we can combine them in useful ways:
 *
 *	          N = xxxx10...0
 *	      N - 1 = xxxx01...1
 *	------------------------
 *	N & (N - 1) = xxxx000000
 *	N | (N - 1) = xxxx111111
 *	N ^ (N - 1) =     111111
 *
 * In particular, this suggests several easy ways to clear all but the LSB,
 * which by (T2) is exactly what we need to determine CSIZE(N) = 10...0.
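 *
 * For example, taking the (illustrative) value N = 12 = 1100:
 *
 *	          N = 1100
 *	      N - 1 = 1011
 *	N & (N - 1) = 1000	(LSB cleared)
 *	N | (N - 1) = 1111
 *	N ^ (N - 1) = 0111
 *
 * and (N - 1) ^ (N | (N - 1)) = 1011 ^ 1111 = 0100, which is exactly
 * the LSB of N by itself -- that is, CSIZE(12) = 4.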
 *
 * We'll opt for this formulation:
 *
 * (C1)	CSIZE(N) = (N - 1) ^ (N | (N - 1))
 *
 * Similarly, we have an easy way to determine LPARENT(N), which requires
 * that we clear the LSB of N:
 *
 * (L1)	LPARENT(N) = N & (N - 1)
 *
 * We note in the above relations that (N | (N - 1)) - N = CSIZE(N) - 1.
 * When combined with (T4), this yields an easy way to compute RPARENT(N):
 *
 * (R1)	RPARENT(N) = (N | (N - 1)) + 1
 *
 * Finally, to accommodate fd 0 we must adjust all of our results by +/-1 to
 * move the fd range from [1, 2^n) to [0, 2^n - 1).  This is straightforward,
 * so there's no need to belabor the algebra; the revised relations become:
 *
 * (C1a)	CSIZE(N) = N ^ (N | (N + 1))
 *
 * (L1a)	LPARENT(N) = (N & (N + 1)) - 1
 *
 * (R1a)	RPARENT(N) = N | (N + 1)
 *
 * This completes the mathematical framework.  We now have all the tools
 * we need to implement fd_find() and fd_reserve().
 *
 * fd_find(fip, minfd) finds the smallest available file descriptor >= minfd.
 * It does not actually allocate the descriptor; that's done by fd_reserve().
 * fd_find() proceeds in two steps:
 *
 * (1) Find the leftmost subtree that contains a descriptor >= minfd.
 *     We start at the right subtree rooted at minfd.  If this subtree is
 *     not full -- if fip->fi_list[minfd].uf_alloc != CSIZE(minfd) -- then
 *     step 1 is done.  Otherwise, we know that all fds in this subtree
 *     are taken, so we ascend to RPARENT(minfd) using (R1a).  We repeat
 *     this process until we either find a candidate subtree or exceed
 *     fip->fi_nfiles.  We use (C1a) to compute CSIZE().
 *
 * (2) Find the smallest fd in the subtree discovered by step 1.
 *     Starting at the root of this subtree, we descend to find the
 *     smallest available fd.  Since the left children have the smaller
 *     fds, we will descend rightward only when the left child is full.
 *
 *     We begin by comparing the number of allocated fds in the root
 *     to the number of allocated fds in its right child; if they differ
 *     by exactly CSIZE(child), we know the left subtree is full, so we
 *     descend right; that is, the right child becomes the search root.
 *     Otherwise we leave the root alone and start following the right
 *     child's left children.  As fortune would have it, this is very
 *     simple computationally: by (T5), the right child of fd is just
 *     fd + size, where size = CSIZE(fd) / 2.  Applying (T5) again,
 *     we find that the right child's left child is fd + size - (size / 2) =
 *     fd + (size / 2); *its* left child is fd + (size / 2) - (size / 4) =
 *     fd + (size / 4), and so on.  In general, fd's right child's
 *     leftmost nth descendant is fd + (size >> n).  Thus, to follow
 *     the right child's left descendants, we just halve the size in
 *     each iteration of the search.
 *
 *     When we descend leftward, we must keep track of the number of fds
 *     that were allocated in all the right subtrees we rejected, so we
 *     know how many of the root fd's allocations are in the remaining
 *     (as yet unexplored) leftmost part of its right subtree.  When we
 *     encounter a fully-allocated left child -- that is, when we find
 *     that fip->fi_list[fd].uf_alloc == ralloc + size -- we descend right
 *     (as described earlier), resetting ralloc to zero.
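 *
 *     To illustrate with a small (hypothetical) case: suppose
 *     fi_nfiles = 7 and fds 0, 1 and 2 are allocated, so uf_alloc is
 *     1 at fd 0, 2 at fd 1, 1 at fd 2 and 0 elsewhere.  fd_find(fip, 0)
 *     ascends: CSIZE(0) = 1 and uf_alloc[0] == 1, so fd 0's subtree is
 *     full; RPARENT(0) = 1, CSIZE(1) = 2 and uf_alloc[1] == 2, full
 *     again; RPARENT(1) = 3, CSIZE(3) = 4 and uf_alloc[3] == 0, so the
 *     subtree rooted at 3 has room.  The descent never moves right
 *     (every right-subtree count is zero), so fd_find() returns 3.
 *     fd_reserve(fip, 3, 1) then increments uf_alloc[3] and stops,
 *     since fd 3 has no left ancestors: (3 & 4) - 1 = -1.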
 *
 * fd_reserve(fip, fd, incr) either allocates or frees fd, depending
 * on whether incr is 1 or -1.  Starting at fd, fd_reserve() ascends
 * the leftmost ancestors (see (T3)) and updates the allocation counts.
 * At each step we use (L1a) to compute LPARENT(), the next left ancestor.
 *
 * flist_minsize() finds the minimal tree that still covers all
 * used fds; as long as the allocation count of a root node is zero, we
 * don't need that node or its right subtree.
 *
 * flist_nalloc() counts the number of allocated fds in the tree, by starting
 * at the top of the tree and summing the right-subtree allocation counts as
 * it descends leftwards.
 *
 * Note: we assume that flist_grow() will keep fip->fi_nfiles of the form
 * 2^n - 1.  This ensures that the fd trees are always full, which saves
 * quite a bit of boundary checking.
 */
static int
fd_find(uf_info_t *fip, int minfd)
{
	int size, ralloc, fd;

	ASSERT(MUTEX_HELD(&fip->fi_lock));
	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);

	for (fd = minfd; (uint_t)fd < fip->fi_nfiles; fd |= fd + 1) {
		size = fd ^ (fd | (fd + 1));
		if (fip->fi_list[fd].uf_alloc == size)
			continue;
		for (ralloc = 0, size >>= 1; size != 0; size >>= 1) {
			ralloc += fip->fi_list[fd + size].uf_alloc;
			if (fip->fi_list[fd].uf_alloc == ralloc + size) {
				fd += size;
				ralloc = 0;
			}
		}
		return (fd);
	}
	return (-1);
}

static void
fd_reserve(uf_info_t *fip, int fd, int incr)
{
	int pfd;
	uf_entry_t *ufp = &fip->fi_list[fd];

	ASSERT((uint_t)fd < fip->fi_nfiles);
	ASSERT((ufp->uf_busy == 0 && incr == 1) ||
	    (ufp->uf_busy == 1 && incr == -1));
	ASSERT(MUTEX_HELD(&ufp->uf_lock));
	ASSERT(MUTEX_HELD(&fip->fi_lock));

	for (pfd = fd; pfd >= 0; pfd = (pfd & (pfd + 1)) - 1)
		fip->fi_list[pfd].uf_alloc += incr;

	ufp->uf_busy += incr;
}

static int
flist_minsize(uf_info_t *fip)
{
	int fd;

	/*
	 * We'd like to ASSERT(MUTEX_HELD(&fip->fi_lock)), but we're called
	 * by flist_fork(), which relies on other mechanisms for mutual
	 * exclusion.
	 */
	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);

	for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
		if (fip->fi_list[fd >> 1].uf_alloc != 0)
			break;

	return (fd);
}

static int
flist_nalloc(uf_info_t *fip)
{
	int fd;
	int nalloc = 0;

	ASSERT(MUTEX_HELD(&fip->fi_lock));
	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);

	for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
		nalloc += fip->fi_list[fd >> 1].uf_alloc;

	return (nalloc);
}

/*
 * Increase size of the fi_list array to accommodate at least maxfd.
 * We keep the size of the form 2^n - 1 for benefit of fd_find().
 */
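/*
 * For example, flist_grow(100) rounds the new size up through
 * 1, 3, 7, 15, 31 and 63 to 127 -- the smallest value of the form
 * 2^n - 1 that exceeds the requested maxfd of 100.
 */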
static void
flist_grow(int maxfd)
{
	uf_info_t *fip = P_FINFO(curproc);
	int newcnt, oldcnt;
	uf_entry_t *src, *dst, *newlist, *oldlist, *newend, *oldend;
	uf_rlist_t *urp;

	for (newcnt = 1; newcnt <= maxfd; newcnt = (newcnt << 1) | 1)
		continue;

	newlist = kmem_zalloc(newcnt * sizeof (uf_entry_t), KM_SLEEP);

	mutex_enter(&fip->fi_lock);
	oldcnt = fip->fi_nfiles;
	if (newcnt <= oldcnt) {
		mutex_exit(&fip->fi_lock);
		kmem_free(newlist, newcnt * sizeof (uf_entry_t));
		return;
	}
	ASSERT((newcnt & (newcnt + 1)) == 0);
	oldlist = fip->fi_list;
	oldend = oldlist + oldcnt;
	newend = newlist + oldcnt;	/* no need to lock beyond old end */

	/*
	 * fi_list and fi_nfiles cannot change while any uf_lock is held,
	 * so we must grab all the old locks *and* the new locks up to oldcnt.
	 * (Locks beyond the end of oldcnt aren't visible until we store
	 * the new fi_nfiles, which is the last thing we do before dropping
	 * all the locks, so there's no need to acquire these locks).
	 * Holding the new locks is necessary because when fi_list changes
	 * to point to the new list, fi_nfiles won't have been stored yet.
	 * If we *didn't* hold the new locks, someone doing a UF_ENTER()
	 * could see the new fi_list, grab the new uf_lock, and then see
	 * fi_nfiles change while the lock is held -- in violation of
	 * UF_ENTER() semantics.
	 */
	for (src = oldlist; src < oldend; src++)
		mutex_enter(&src->uf_lock);

	for (dst = newlist; dst < newend; dst++)
		mutex_enter(&dst->uf_lock);

	for (src = oldlist, dst = newlist; src < oldend; src++, dst++) {
		dst->uf_file = src->uf_file;
		dst->uf_fpollinfo = src->uf_fpollinfo;
		dst->uf_refcnt = src->uf_refcnt;
		dst->uf_alloc = src->uf_alloc;
		dst->uf_flag = src->uf_flag;
		dst->uf_busy = src->uf_busy;
		dst->uf_portfd = src->uf_portfd;
	}

	/*
	 * As soon as we store the new flist, future locking operations
	 * will use it.  Therefore, we must ensure that all the state
	 * we've just established reaches global visibility before the
	 * new flist does.
	 */
	membar_producer();
	fip->fi_list = newlist;

	/*
	 * Routines like getf() make an optimistic check on the validity
	 * of the supplied file descriptor: if it's less than the current
	 * value of fi_nfiles -- examined without any locks -- then it's
	 * safe to attempt a UF_ENTER() on that fd (which is a valid
	 * assumption because fi_nfiles only increases).  Therefore, it
	 * is critical that the new value of fi_nfiles not reach global
	 * visibility until after the new fi_list: if it happened the
	 * other way around, getf() could see the new fi_nfiles and attempt
	 * a UF_ENTER() on the old fi_list, which would write beyond its
	 * end if the fd exceeded the old fi_nfiles.
	 */
	membar_producer();
	fip->fi_nfiles = newcnt;

	/*
	 * The new state is consistent now, so we can drop all the locks.
	 */
	for (dst = newlist; dst < newend; dst++)
		mutex_exit(&dst->uf_lock);

	for (src = oldlist; src < oldend; src++) {
		/*
		 * If any threads are blocked on the old cvs, wake them.
		 * This will force them to wake up, discover that fi_list
		 * has changed, and go back to sleep on the new cvs.
		 */
		cv_broadcast(&src->uf_wanted_cv);
		cv_broadcast(&src->uf_closing_cv);
		mutex_exit(&src->uf_lock);
	}

	mutex_exit(&fip->fi_lock);

	/*
	 * Retire the old flist.  We can't actually kmem_free() it now
	 * because someone may still have a pointer to it.  Instead,
	 * we link it onto a list of retired flists.  The new flist
	 * is at least double the size of the previous flist, so the
	 * total size of all retired flists will be less than the size
	 * of the current one (to prove, consider the sum of a geometric
	 * series in powers of 2).  exit() frees the retired flists.
	 */
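	/*
	 * For instance, a process that grew its flist through sizes
	 * 3, 7, 15 and 31 to the current 63 has retired lists totalling
	 * 3 + 7 + 15 + 31 = 56 entries, which is indeed less than 63.
	 */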
	urp = kmem_zalloc(sizeof (uf_rlist_t), KM_SLEEP);
	urp->ur_list = oldlist;
	urp->ur_nfiles = oldcnt;

	mutex_enter(&fip->fi_lock);
	urp->ur_next = fip->fi_rlist;
	fip->fi_rlist = urp;
	mutex_exit(&fip->fi_lock);
}

/*
 * Utility functions for keeping track of the active file descriptors.
 */
void
clear_stale_fd()		/* called from post_syscall() */
{
	afd_t *afd = &curthread->t_activefd;
	int i;

	/* uninitialized is ok here, a_nfd is then zero */
	for (i = 0; i < afd->a_nfd; i++) {
		/* the entries should already be -1; assert it, then enforce it */
		ASSERT(afd->a_fd[i] == -1);
		afd->a_fd[i] = -1;
	}
	afd->a_stale = 0;
}

void
free_afd(afd_t *afd)		/* called below and from thread_free() */
{
	int i;

	/* free the buffer if it was kmem_alloc()ed */
	if (afd->a_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
		COUNT(afd_free);
		kmem_free(afd->a_fd, afd->a_nfd * sizeof (afd->a_fd[0]));
	}

	/* (re)initialize the structure */
	afd->a_fd = &afd->a_buf[0];
	afd->a_nfd = sizeof (afd->a_buf) / sizeof (afd->a_buf[0]);
	afd->a_stale = 0;
	for (i = 0; i < afd->a_nfd; i++)
		afd->a_fd[i] = -1;
}

static void
set_active_fd(int fd)
{
	afd_t *afd = &curthread->t_activefd;
	int i;
	int *old_fd;
	int old_nfd;
	int *new_fd;
	int new_nfd;

	if (afd->a_nfd == 0) {	/* first time initialization */
		ASSERT(fd == -1);
		mutex_enter(&afd->a_fdlock);
		free_afd(afd);
		mutex_exit(&afd->a_fdlock);
	}

	/* insert fd into vacant slot, if any */
	for (i = 0; i < afd->a_nfd; i++) {
		if (afd->a_fd[i] == -1) {
			afd->a_fd[i] = fd;
			return;
		}
	}

	/*
	 * Reallocate the a_fd[] array to add one more slot.
	 */
	ASSERT(fd == -1);
	old_nfd = afd->a_nfd;
	old_fd = afd->a_fd;
	new_nfd = old_nfd + 1;
	new_fd = kmem_alloc(new_nfd * sizeof (afd->a_fd[0]), KM_SLEEP);
	MAXFD(new_nfd);
	COUNT(afd_alloc);

	mutex_enter(&afd->a_fdlock);
	afd->a_fd = new_fd;
	afd->a_nfd = new_nfd;
	for (i = 0; i < old_nfd; i++)
		afd->a_fd[i] = old_fd[i];
	afd->a_fd[i] = fd;
	mutex_exit(&afd->a_fdlock);

	if (old_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
		COUNT(afd_free);
		kmem_free(old_fd, old_nfd * sizeof (afd->a_fd[0]));
	}
}

void
clear_active_fd(int fd)		/* called below and from aio.c */
{
	afd_t *afd = &curthread->t_activefd;
	int i;

	for (i = 0; i < afd->a_nfd; i++) {
		if (afd->a_fd[i] == fd) {
			afd->a_fd[i] = -1;
			break;
		}
	}
	ASSERT(i < afd->a_nfd);		/* not found is not ok */
}
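/*
 * A note on the set_active_fd() pattern above, as used by getf() below:
 * set_active_fd(-1) first guarantees that a vacant (-1) slot exists,
 * growing a_fd[] with kmem_alloc(KM_SLEEP) if necessary, so that the
 * subsequent set_active_fd(fd), made while holding the fd's uf_lock
 * inside UF_ENTER(), can simply fill that slot without allocating.
 */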
/*
 * Does this thread have this fd active?
 */
static int
is_active_fd(kthread_t *t, int fd)
{
	afd_t *afd = &t->t_activefd;
	int i;

	ASSERT(t != curthread);
	mutex_enter(&afd->a_fdlock);
	/* uninitialized is ok here, a_nfd is then zero */
	for (i = 0; i < afd->a_nfd; i++) {
		if (afd->a_fd[i] == fd) {
			mutex_exit(&afd->a_fdlock);
			return (1);
		}
	}
	mutex_exit(&afd->a_fdlock);
	return (0);
}

/*
 * Convert a user supplied file descriptor into a pointer to a file
 * structure.  Only task is to check range of the descriptor (soft
 * resource limit was enforced at open time and shouldn't be checked
 * here).
 */
file_t *
getf(int fd)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	file_t *fp;

	if ((uint_t)fd >= fip->fi_nfiles)
		return (NULL);

	/*
	 * Reserve a slot in the active fd array now so we can call
	 * set_active_fd(fd) for real below, while still inside UF_ENTER().
	 */
	set_active_fd(-1);

	UF_ENTER(ufp, fip, fd);

	if ((fp = ufp->uf_file) == NULL) {
		UF_EXIT(ufp);

		if (fd == fip->fi_badfd && fip->fi_action > 0)
			tsignal(curthread, fip->fi_action);

		return (NULL);
	}
	ufp->uf_refcnt++;

	/*
	 * archive per file audit data
	 */
	if (AU_AUDITING())
		(void) audit_getf(fd);

	set_active_fd(fd);	/* record the active file descriptor */

	UF_EXIT(ufp);

	return (fp);
}
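/*
 * A minimal sketch of the getf()/releasef() protocol as used by a
 * typical system call (fdsetattr() below follows this exact shape):
 *
 *	file_t *fp;
 *
 *	if ((fp = getf(fd)) == NULL)
 *		return (set_errno(EBADF));
 *	...operate on fp->f_vnode...
 *	releasef(fd);
 *
 * The descriptor's uf_refcnt stays elevated between the two calls,
 * which is exactly what closeandsetf() below waits on.
 */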
/*
 * Close whatever file currently occupies the file descriptor slot
 * and install the new file, usually NULL, in the file descriptor slot.
 * The close must complete before we release the file descriptor slot.
 * If newfp != NULL we only return an error if we can't allocate the
 * slot so the caller knows that it needs to free the filep;
 * in the other cases we return the error number from closef().
 */
int
closeandsetf(int fd, file_t *newfp)
{
	proc_t *p = curproc;
	uf_info_t *fip = P_FINFO(p);
	uf_entry_t *ufp;
	file_t *fp;
	fpollinfo_t *fpip;
	portfd_t *pfd;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles) {
		if (newfp == NULL)
			return (EBADF);
		flist_grow(fd);
	}

	if (newfp != NULL) {
		/*
		 * If ufp is reserved but has no file pointer, it's in the
		 * transition between ufalloc() and setf().  We must wait
		 * for this transition to complete before assigning the
		 * new non-NULL file pointer.
		 */
		mutex_enter(&fip->fi_lock);
		if (fd == fip->fi_badfd) {
			mutex_exit(&fip->fi_lock);
			if (fip->fi_action > 0)
				tsignal(curthread, fip->fi_action);
			return (EBADF);
		}
		UF_ENTER(ufp, fip, fd);
		while (ufp->uf_busy && ufp->uf_file == NULL) {
			mutex_exit(&fip->fi_lock);
			cv_wait_stop(&ufp->uf_wanted_cv, &ufp->uf_lock, 250);
			UF_EXIT(ufp);
			mutex_enter(&fip->fi_lock);
			UF_ENTER(ufp, fip, fd);
		}
		if ((fp = ufp->uf_file) == NULL) {
			ASSERT(ufp->uf_fpollinfo == NULL);
			ASSERT(ufp->uf_flag == 0);
			fd_reserve(fip, fd, 1);
			ufp->uf_file = newfp;
			UF_EXIT(ufp);
			mutex_exit(&fip->fi_lock);
			return (0);
		}
		mutex_exit(&fip->fi_lock);
	} else {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) == NULL) {
			UF_EXIT(ufp);
			return (EBADF);
		}
	}

	/*
	 * archive per file audit data
	 */
	if (AU_AUDITING())
		(void) audit_getf(fd);
	ASSERT(ufp->uf_busy);
	ufp->uf_file = NULL;
	ufp->uf_flag = 0;

	/*
	 * If the file descriptor reference count is non-zero, then
	 * some other lwp in the process is performing system call
	 * activity on the file.  To avoid blocking here for a long
	 * time (the other lwp might be in a long term sleep in its
	 * system call), we scan all other lwps in the process to
	 * find the ones with this fd as one of their active fds,
	 * set their a_stale flag, and set them running if they
	 * are in an interruptible sleep so they will emerge from
	 * their system calls immediately.  post_syscall() will
	 * test the a_stale flag and set errno to EBADF.
	 */
	ASSERT(ufp->uf_refcnt == 0 || p->p_lwpcnt > 1);
	if (ufp->uf_refcnt > 0) {
		kthread_t *t;

		/*
		 * We call sprlock_proc(p) to ensure that the thread
		 * list will not change while we are scanning it.
		 * To do this, we must drop ufp->uf_lock and then
		 * reacquire it (so we are not holding both p->p_lock
		 * and ufp->uf_lock at the same time).  ufp->uf_lock
		 * must be held for is_active_fd() to be correct
		 * (set_active_fd() is called while holding ufp->uf_lock).
		 *
		 * This is a convoluted dance, but it is better than
		 * the old brute-force method of stopping every thread
		 * in the process by calling holdlwps(SHOLDFORK1).
		 */

		UF_EXIT(ufp);
		COUNT(afd_wait);

		mutex_enter(&p->p_lock);
		sprlock_proc(p);
		mutex_exit(&p->p_lock);

		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_file == NULL);

		if (ufp->uf_refcnt > 0) {
			for (t = curthread->t_forw;
			    t != curthread;
			    t = t->t_forw) {
				if (is_active_fd(t, fd)) {
					thread_lock(t);
					t->t_activefd.a_stale = 1;
					t->t_post_sys = 1;
					if (ISWAKEABLE(t))
						setrun_locked(t);
					thread_unlock(t);
				}
			}
		}

		UF_EXIT(ufp);

		mutex_enter(&p->p_lock);
		sprunlock(p);

		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_file == NULL);
	}

	/*
	 * Wait for other lwps to stop using this file descriptor.
	 */
	while (ufp->uf_refcnt > 0) {
		cv_wait_stop(&ufp->uf_closing_cv, &ufp->uf_lock, 250);
		/*
		 * cv_wait_stop() drops ufp->uf_lock, so the file list
		 * can change.  Drop the lock on our (possibly) stale
		 * ufp and let UF_ENTER() find and lock the current ufp.
		 */
		UF_EXIT(ufp);
		UF_ENTER(ufp, fip, fd);
	}

#ifdef DEBUG
	/*
	 * catch a watchfd on device's pollhead list but not on fpollinfo list
	 */
	if (ufp->uf_fpollinfo != NULL)
		checkwfdlist(fp->f_vnode, ufp->uf_fpollinfo);
#endif	/* DEBUG */

	/*
	 * We may need to cleanup some cached poll states in t_pollstate
	 * before the fd can be reused.  It is important that we don't
	 * access a stale thread structure.  We will do the cleanup in two
	 * phases to avoid deadlock and holding uf_lock for too long.
	 * In phase 1, hold the uf_lock and call pollblockexit() to set
	 * state in t_pollstate struct so that a thread does not exit on
	 * us.  In phase 2, we drop the uf_lock and call pollcacheclean().
	 */
	pfd = ufp->uf_portfd;
	ufp->uf_portfd = NULL;
	fpip = ufp->uf_fpollinfo;
	ufp->uf_fpollinfo = NULL;
	if (fpip != NULL)
		pollblockexit(fpip);
	UF_EXIT(ufp);
	if (fpip != NULL)
		pollcacheclean(fpip, fd);
	if (pfd)
		port_close_fd(pfd);

	/*
	 * Keep the file descriptor entry reserved across the closef().
	 */
	error = closef(fp);

	setf(fd, newfp);

	/* Only return closef() error when closing is all we do */
	return (newfp == NULL ? error : 0);
}

/*
 * Decrement uf_refcnt; wakeup anyone waiting to close the file.
 */
void
releasef(int fd)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	UF_ENTER(ufp, fip, fd);
	ASSERT(ufp->uf_refcnt > 0);
	clear_active_fd(fd);	/* clear the active file descriptor */
	if (--ufp->uf_refcnt == 0)
		cv_broadcast(&ufp->uf_closing_cv);
	UF_EXIT(ufp);
}

/*
 * Identical to releasef() but can be called from another process.
 */
void
areleasef(int fd, uf_info_t *fip)
{
	uf_entry_t *ufp;

	UF_ENTER(ufp, fip, fd);
	ASSERT(ufp->uf_refcnt > 0);
	if (--ufp->uf_refcnt == 0)
		cv_broadcast(&ufp->uf_closing_cv);
	UF_EXIT(ufp);
}

/*
 * Duplicate all file descriptors across a fork.
 */
void
flist_fork(uf_info_t *pfip, uf_info_t *cfip)
{
	int fd, nfiles;
	uf_entry_t *pufp, *cufp;

	mutex_init(&cfip->fi_lock, NULL, MUTEX_DEFAULT, NULL);
	cfip->fi_rlist = NULL;

	/*
	 * We don't need to hold fi_lock because all other lwp's in the
	 * parent have been held.
	 */
	cfip->fi_nfiles = nfiles = flist_minsize(pfip);

	cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);

	for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
	    fd++, pufp++, cufp++) {
		cufp->uf_file = pufp->uf_file;
		cufp->uf_alloc = pufp->uf_alloc;
		cufp->uf_flag = pufp->uf_flag;
		cufp->uf_busy = pufp->uf_busy;
		if (pufp->uf_file == NULL) {
			ASSERT(pufp->uf_flag == 0);
			if (pufp->uf_busy) {
				/*
				 * Grab locks to appease ASSERTs in fd_reserve
				 */
				mutex_enter(&cfip->fi_lock);
				mutex_enter(&cufp->uf_lock);
				fd_reserve(cfip, fd, -1);
				mutex_exit(&cufp->uf_lock);
				mutex_exit(&cfip->fi_lock);
			}
		}
	}
}
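/*
 * Note that the child's table is sized by flist_minsize(), not by the
 * parent's fi_nfiles: a parent that once grew its flist to 127 slots
 * but whose highest allocated fd is now 5 hands the child a 7-entry
 * table (the smallest 2^n - 1 tree whose counts cover fd 5).
 */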
/*
 * Close all open file descriptors for the current process.
 * This is only called from exit(), which is single-threaded,
 * so we don't need any locking.
 */
void
closeall(uf_info_t *fip)
{
	int fd;
	file_t *fp;
	uf_entry_t *ufp;

	ufp = fip->fi_list;
	for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
		if ((fp = ufp->uf_file) != NULL) {
			ufp->uf_file = NULL;
			if (ufp->uf_portfd != NULL) {
				portfd_t *pfd;
				/* remove event port association */
				pfd = ufp->uf_portfd;
				ufp->uf_portfd = NULL;
				port_close_fd(pfd);
			}
			ASSERT(ufp->uf_fpollinfo == NULL);
			(void) closef(fp);
		}
	}

	kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
	fip->fi_list = NULL;
	fip->fi_nfiles = 0;
	while (fip->fi_rlist != NULL) {
		uf_rlist_t *urp = fip->fi_rlist;
		fip->fi_rlist = urp->ur_next;
		kmem_free(urp->ur_list, urp->ur_nfiles * sizeof (uf_entry_t));
		kmem_free(urp, sizeof (uf_rlist_t));
	}
}

/*
 * Internal form of close.  Decrement reference count on file
 * structure.  Decrement reference count on the vnode following
 * removal of the referencing file structure.
 */
int
closef(file_t *fp)
{
	vnode_t *vp;
	int error;
	int count;
	int flag;
	offset_t offset;

	/*
	 * audit close of file (may be exit)
	 */
	if (AU_AUDITING())
		audit_closef(fp);
	ASSERT(MUTEX_NOT_HELD(&P_FINFO(curproc)->fi_lock));

	mutex_enter(&fp->f_tlock);

	ASSERT(fp->f_count > 0);

	count = fp->f_count--;
	flag = fp->f_flag;
	offset = fp->f_offset;

	vp = fp->f_vnode;

	error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL);

	if (count > 1) {
		mutex_exit(&fp->f_tlock);
		return (error);
	}
	ASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_tlock);

	VN_RELE(vp);
	/*
	 * deallocate resources to audit_data
	 */
	if (audit_active)
		audit_unfalloc(fp);
	crfree(fp->f_cred);
	kmem_cache_free(file_cache, fp);
	return (error);
}

/*
 * This is a combination of ufalloc() and setf().
 */
int
ufalloc_file(int start, file_t *fp)
{
	proc_t *p = curproc;
	uf_info_t *fip = P_FINFO(p);
	int filelimit;
	uf_entry_t *ufp;
	int nfiles;
	int fd;

	/*
	 * The assertion guarantees that the assignment to filelimit
	 * below remains correct after p_fno_ctl is cast to int.
	 */
	ASSERT(p->p_fno_ctl <= INT_MAX);
	filelimit = (int)p->p_fno_ctl;

	for (;;) {
		mutex_enter(&fip->fi_lock);
		fd = fd_find(fip, start);
		if (fd >= 0 && fd == fip->fi_badfd) {
			start = fd + 1;
			mutex_exit(&fip->fi_lock);
			continue;
		}
		if ((uint_t)fd < filelimit)
			break;
		if (fd >= filelimit) {
			mutex_exit(&fip->fi_lock);
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
			    p->p_rctls, p, RCA_SAFE);
			mutex_exit(&p->p_lock);
			return (-1);
		}
		/* fd_find() returned -1 */
		nfiles = fip->fi_nfiles;
		mutex_exit(&fip->fi_lock);
		flist_grow(MAX(start, nfiles));
	}

	UF_ENTER(ufp, fip, fd);
	fd_reserve(fip, fd, 1);
	ASSERT(ufp->uf_file == NULL);
	ufp->uf_file = fp;
	UF_EXIT(ufp);
	mutex_exit(&fip->fi_lock);
	return (fd);
}

/*
 * Allocate a user file descriptor greater than or equal to "start".
 */
int
ufalloc(int start)
{
	return (ufalloc_file(start, NULL));
}
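/*
 * A worked pass through the retry loop above (hypothetical numbers):
 * with a fully-allocated 7-entry table and filelimit = 256, fd_find()
 * returns -1, so we call flist_grow(MAX(start, 7)) and retry against
 * the new 15-entry table.
 */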
/*
 * Check that a future allocation of count fds on proc p has a good
 * chance of succeeding.  If not, do rctl processing as if we'd failed
 * the allocation.
 *
 * Our caller must guarantee that p cannot disappear underneath us.
 */
int
ufcanalloc(proc_t *p, uint_t count)
{
	uf_info_t *fip = P_FINFO(p);
	int filelimit;
	int current;

	if (count == 0)
		return (1);

	ASSERT(p->p_fno_ctl <= INT_MAX);
	filelimit = (int)p->p_fno_ctl;

	mutex_enter(&fip->fi_lock);
	current = flist_nalloc(fip);		/* # of in-use descriptors */
	mutex_exit(&fip->fi_lock);

	/*
	 * If count is a positive integer, the worst that can happen is
	 * an overflow to a negative value, which is caught by the >= 0 check.
	 */
	current += count;
	if (count <= INT_MAX && current >= 0 && current <= filelimit)
		return (1);

	mutex_enter(&p->p_lock);
	(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
	    p->p_rctls, p, RCA_SAFE);
	mutex_exit(&p->p_lock);
	return (0);
}

/*
 * Allocate a user file descriptor and a file structure.
 * Initialize the descriptor to point at the file structure.
 * If fdp is NULL, the user file descriptor will not be allocated.
 */
int
falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp)
{
	file_t *fp;
	int fd;

	if (fdp) {
		if ((fd = ufalloc(0)) == -1)
			return (EMFILE);
	}
	fp = kmem_cache_alloc(file_cache, KM_SLEEP);
	/*
	 * Note: falloc returns the fp locked
	 */
	mutex_enter(&fp->f_tlock);
	fp->f_count = 1;
	fp->f_flag = (ushort_t)flag;
	fp->f_vnode = vp;
	fp->f_offset = 0;
	fp->f_audit_data = 0;
	crhold(fp->f_cred = CRED());
	/*
	 * allocate resources to audit_data
	 */
	if (audit_active)
		audit_falloc(fp);
	*fpp = fp;
	if (fdp)
		*fdp = fd;
	return (0);
}

/*ARGSUSED*/
static int
file_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	file_t *fp = buf;

	mutex_init(&fp->f_tlock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED*/
static void
file_cache_destructor(void *buf, void *cdrarg)
{
	file_t *fp = buf;

	mutex_destroy(&fp->f_tlock);
}

void
finit()
{
	file_cache = kmem_cache_create("file_cache", sizeof (file_t), 0,
	    file_cache_constructor, file_cache_destructor, NULL, NULL,
	    NULL, 0);
}

void
unfalloc(file_t *fp)
{
	ASSERT(MUTEX_HELD(&fp->f_tlock));
	if (--fp->f_count <= 0) {
		/*
		 * deallocate resources to audit_data
		 */
		if (audit_active)
			audit_unfalloc(fp);
		crfree(fp->f_cred);
		mutex_exit(&fp->f_tlock);
		kmem_cache_free(file_cache, fp);
	} else
		mutex_exit(&fp->f_tlock);
}

/*
 * Given a file descriptor, set the user's
 * file pointer to the given parameter.
 */
void
setf(int fd, file_t *fp)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	if (AU_AUDITING())
		audit_setf(fp, fd);

	if (fp == NULL) {
		mutex_enter(&fip->fi_lock);
		UF_ENTER(ufp, fip, fd);
		fd_reserve(fip, fd, -1);
		mutex_exit(&fip->fi_lock);
	} else {
		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_busy);
	}
	ASSERT(ufp->uf_fpollinfo == NULL);
	ASSERT(ufp->uf_flag == 0);
	ufp->uf_file = fp;
	cv_broadcast(&ufp->uf_wanted_cv);
	UF_EXIT(ufp);
}
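/*
 * A minimal sketch of the falloc()/setf() pairing (fassign() below is
 * the canonical in-tree example):
 *
 *	file_t *fp;
 *	int fd, error;
 *
 *	if ((error = falloc(vp, FREAD, &fp, &fd)) != 0)
 *		return (error);			/* usually EMFILE */
 *	mutex_exit(&fp->f_tlock);		/* falloc returns fp locked */
 *	setf(fd, fp);				/* publish the descriptor */
 *
 * On failure after falloc(), the caller undoes the reservation with
 * setf(fd, NULL) and frees the file structure with unfalloc().
 */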
/*
 * Given a file descriptor, return the file table flags, plus,
 * if this is a socket in asynchronous mode, the FASYNC flag.
 * getf() may or may not have been called before calling f_getfl().
 */
int
f_getfl(int fd, int *flagp)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	file_t *fp;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles)
		error = EBADF;
	else {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) == NULL)
			error = EBADF;
		else {
			vnode_t *vp = fp->f_vnode;
			int flag = fp->f_flag;

			/*
			 * BSD fcntl() FASYNC compatibility.
			 */
			if (vp->v_type == VSOCK)
				flag |= sock_getfasync(vp);
			*flagp = flag;
			error = 0;
		}
		UF_EXIT(ufp);
	}

	return (error);
}

/*
 * Given a file descriptor, return the user's file flags.
 * Force the FD_CLOEXEC flag for writable self-open /proc files.
 * getf() may or may not have been called before calling f_getfd_error().
 */
int
f_getfd_error(int fd, int *flagp)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	file_t *fp;
	int flag;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles)
		error = EBADF;
	else {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) == NULL)
			error = EBADF;
		else {
			flag = ufp->uf_flag;
			if ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode))
				flag |= FD_CLOEXEC;
			*flagp = flag;
			error = 0;
		}
		UF_EXIT(ufp);
	}

	return (error);
}

/*
 * getf() must have been called before calling f_getfd().
 */
char
f_getfd(int fd)
{
	int flag = 0;

	(void) f_getfd_error(fd, &flag);
	return ((char)flag);
}

/*
 * Given a file descriptor and file flags, set the user's file flags.
 * At present, the only valid flag is FD_CLOEXEC.
 * getf() may or may not have been called before calling f_setfd_error().
 */
int
f_setfd_error(int fd, int flags)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles)
		error = EBADF;
	else {
		UF_ENTER(ufp, fip, fd);
		if (ufp->uf_file == NULL)
			error = EBADF;
		else {
			ufp->uf_flag = flags & FD_CLOEXEC;
			error = 0;
		}
		UF_EXIT(ufp);
	}
	return (error);
}

void
f_setfd(int fd, char flags)
{
	(void) f_setfd_error(fd, flags);
}
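/*
 * These routines sit under the fcntl(2) F_GETFL/F_GETFD/F_SETFD paths;
 * a kernel caller mimicking fcntl(fd, F_SETFD, FD_CLOEXEC) would simply
 * do (sketch; error handling elided elsewhere):
 *
 *	if (f_setfd_error(fd, FD_CLOEXEC) != 0)
 *		return (set_errno(EBADF));
 */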
#define	BADFD_MIN	3
#define	BADFD_MAX	255

/*
 * Attempt to allocate a file descriptor which is bad and which
 * is "poison" to the application.  It cannot be closed (except
 * on exec), allocated for a different use, etc.
 */
int
f_badfd(int start, int *fdp, int action)
{
	int fdr;
	int badfd;
	uf_info_t *fip = P_FINFO(curproc);

#ifdef _LP64
	/* No restrictions on 64 bit _file */
	if (get_udatamodel() != DATAMODEL_ILP32)
		return (EINVAL);
#endif

	if (start > BADFD_MAX || start < BADFD_MIN)
		return (EINVAL);

	if (action >= NSIG || action < 0)
		return (EINVAL);

	mutex_enter(&fip->fi_lock);
	badfd = fip->fi_badfd;
	mutex_exit(&fip->fi_lock);

	if (badfd != -1)
		return (EAGAIN);

	fdr = ufalloc(start);

	if (fdr > BADFD_MAX) {
		setf(fdr, NULL);
		return (EMFILE);
	}
	if (fdr < 0)
		return (EMFILE);

	mutex_enter(&fip->fi_lock);
	if (fip->fi_badfd != -1) {
		/* Lost race */
		mutex_exit(&fip->fi_lock);
		setf(fdr, NULL);
		return (EAGAIN);
	}
	fip->fi_action = action;
	fip->fi_badfd = fdr;
	mutex_exit(&fip->fi_lock);
	setf(fdr, NULL);

	*fdp = fdr;

	return (0);
}

/*
 * Allocate a file descriptor and assign it to the vnode "*vpp",
 * performing the usual open protocol upon it and returning the
 * file descriptor allocated.  It is the responsibility of the
 * caller to dispose of "*vpp" if any error occurs.
 */
int
fassign(vnode_t **vpp, int mode, int *fdp)
{
	file_t *fp;
	int error;
	int fd;

	if (error = falloc((vnode_t *)NULL, mode, &fp, &fd))
		return (error);
	if (error = VOP_OPEN(vpp, mode, fp->f_cred, NULL)) {
		setf(fd, NULL);
		unfalloc(fp);
		return (error);
	}
	fp->f_vnode = *vpp;
	mutex_exit(&fp->f_tlock);
	/*
	 * Fill in the slot falloc reserved.
	 */
	setf(fd, fp);
	*fdp = fd;
	return (0);
}

/*
 * When a process forks it must increment the f_count of all file pointers
 * since there is a new process pointing at them.  fcnt_add(fip, 1) does this.
 * Since we are called when there is only 1 active lwp we don't need to
 * hold fi_lock or any uf_lock.  If the fork fails, fork_fail() calls
 * fcnt_add(fip, -1) to restore the counts.
 */
void
fcnt_add(uf_info_t *fip, int incr)
{
	int i;
	uf_entry_t *ufp;
	file_t *fp;

	ufp = fip->fi_list;
	for (i = 0; i < fip->fi_nfiles; i++, ufp++) {
		if ((fp = ufp->uf_file) != NULL) {
			mutex_enter(&fp->f_tlock);
			ASSERT((incr == 1 && fp->f_count >= 1) ||
			    (incr == -1 && fp->f_count >= 2));
			fp->f_count += incr;
			mutex_exit(&fp->f_tlock);
		}
	}
}
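/*
 * For example, a file_t with f_count == 1 before fork has f_count == 2
 * afterwards (parent and child each hold one reference), and drops back
 * to 1 if fork_fail() runs fcnt_add(fip, -1).
 */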
/*
 * This is called from exec to close all fd's that have the FD_CLOEXEC flag
 * set and also to close all self-open for write /proc file descriptors.
 */
void
close_exec(uf_info_t *fip)
{
	int fd;
	file_t *fp;
	fpollinfo_t *fpip;
	uf_entry_t *ufp;
	portfd_t *pfd;

	ufp = fip->fi_list;
	for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
		if ((fp = ufp->uf_file) != NULL &&
		    ((ufp->uf_flag & FD_CLOEXEC) ||
		    ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode)))) {
			fpip = ufp->uf_fpollinfo;
			mutex_enter(&fip->fi_lock);
			mutex_enter(&ufp->uf_lock);
			fd_reserve(fip, fd, -1);
			mutex_exit(&fip->fi_lock);
			ufp->uf_file = NULL;
			ufp->uf_fpollinfo = NULL;
			ufp->uf_flag = 0;
			/*
			 * We may need to cleanup some cached poll states
			 * in t_pollstate before the fd can be reused.  It
			 * is important that we don't access a stale thread
			 * structure.  We will do the cleanup in two
			 * phases to avoid deadlock and holding uf_lock for
			 * too long.  In phase 1, hold the uf_lock and call
			 * pollblockexit() to set state in t_pollstate struct
			 * so that a thread does not exit on us.  In phase 2,
			 * we drop the uf_lock and call pollcacheclean().
			 */
			pfd = ufp->uf_portfd;
			ufp->uf_portfd = NULL;
			if (fpip != NULL)
				pollblockexit(fpip);
			mutex_exit(&ufp->uf_lock);
			if (fpip != NULL)
				pollcacheclean(fpip, fd);
			if (pfd)
				port_close_fd(pfd);
			(void) closef(fp);
		}
	}

	/* Reset bad fd */
	fip->fi_badfd = -1;
	fip->fi_action = -1;
}

/*
 * Common routine for modifying attributes of named files.
 */
int
namesetattr(char *fnamep, enum symfollow followlink, vattr_t *vap, int flags)
{
	vnode_t *vp;
	int error = 0;

	if (error = lookupname(fnamep, UIO_USERSPACE, followlink,
	    NULLVPP, &vp))
		return (set_errno(error));
	if (error = vpsetattr(vp, vap, flags))
		(void) set_errno(error);
	VN_RELE(vp);
	return (error);
}

/*
 * Common routine for modifying attributes of files referenced
 * by descriptor.
 */
int
fdsetattr(int fd, vattr_t *vap)
{
	file_t *fp;
	vnode_t *vp;
	int error = 0;

	if ((fp = getf(fd)) != NULL) {
		vp = fp->f_vnode;
		if (error = vpsetattr(vp, vap, 0)) {
			(void) set_errno(error);
		}
		releasef(fd);
	} else
		error = set_errno(EBADF);
	return (error);
}

/*
 * Common routine to set the attributes for the given vnode.
 * If the vnode is a file and the filesize is being manipulated,
 * this makes sure that there are no conflicting non-blocking
 * mandatory locks in that region.
 */
static int
vpsetattr(vnode_t *vp, vattr_t *vap, int flags)
{
	int error = 0;
	int in_crit = 0;
	u_offset_t begin;
	vattr_t vattr;
	ssize_t length;

	if (vn_is_readonly(vp)) {
		error = EROFS;
	}
	if (!error && (vap->va_mask & AT_SIZE) &&
	    nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;

		vattr.va_mask = AT_SIZE;
		if (!(error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
			begin = vap->va_size > vattr.va_size ?
			    vattr.va_size : vap->va_size;
			length = vattr.va_size > vap->va_size ?
			    vattr.va_size - vap->va_size :
			    vap->va_size - vattr.va_size;

			if (nbl_conflict(vp, NBL_WRITE, begin, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}
	}
	if (!error)
		error = VOP_SETATTR(vp, vap, flags, CRED(), NULL);

	if (in_crit)
		nbl_end_crit(vp);

	return (error);
}
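/*
 * The begin/length arithmetic in vpsetattr() checks exactly the byte
 * range the size change affects: truncating a 100-byte file to 40
 * bytes, for instance, yields begin = 40 and length = 60, so
 * nbl_conflict() is asked about [40, 100) -- the region being
 * destroyed (or, for an extension, the region being created).
 */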
/*
 * Return true if the given vnode is referenced by any
 * entry in the current process's file descriptor table.
 */
int
fisopen(vnode_t *vp)
{
	int fd;
	file_t *fp;
	vnode_t *ovp;
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	mutex_enter(&fip->fi_lock);
	for (fd = 0; fd < fip->fi_nfiles; fd++) {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) != NULL &&
		    (ovp = fp->f_vnode) != NULL && VN_CMP(vp, ovp)) {
			UF_EXIT(ufp);
			mutex_exit(&fip->fi_lock);
			return (1);
		}
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
	return (0);
}

/*
 * Return zero if at least one file currently open (by curproc) shouldn't be
 * allowed to change zones.
 */
int
files_can_change_zones(void)
{
	int fd;
	file_t *fp;
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	mutex_enter(&fip->fi_lock);
	for (fd = 0; fd < fip->fi_nfiles; fd++) {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) != NULL &&
		    !vn_can_change_zones(fp->f_vnode)) {
			UF_EXIT(ufp);
			mutex_exit(&fip->fi_lock);
			return (0);
		}
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
	return (1);
}

#ifdef DEBUG

/*
 * The following functions are only used in ASSERT()s elsewhere.
 * They do not modify the state of the system.
 */

/*
 * Return true (1) if the current thread is in the fpollinfo
 * list for this file descriptor, else false (0).
 */
static int
curthread_in_plist(uf_entry_t *ufp)
{
	fpollinfo_t *fpip;

	ASSERT(MUTEX_HELD(&ufp->uf_lock));
	for (fpip = ufp->uf_fpollinfo; fpip; fpip = fpip->fp_next)
		if (fpip->fp_thread == curthread)
			return (1);
	return (0);
}

/*
 * Sanity check to make sure that after lwp_exit(),
 * curthread does not appear on any fd's fpollinfo list.
 */
void
checkfpollinfo(void)
{
	int fd;
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	mutex_enter(&fip->fi_lock);
	for (fd = 0; fd < fip->fi_nfiles; fd++) {
		UF_ENTER(ufp, fip, fd);
		ASSERT(!curthread_in_plist(ufp));
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
}

/*
 * Return true (1) if the current thread is in the fpollinfo
 * list for this file descriptor, else false (0).
 * This is the same as curthread_in_plist(),
 * but is called w/o holding uf_lock.
 */
int
infpollinfo(int fd)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	int rc;

	UF_ENTER(ufp, fip, fd);
	rc = curthread_in_plist(ufp);
	UF_EXIT(ufp);
	return (rc);
}

#endif	/* DEBUG */
/*
 * Add the curthread to fpollinfo list, meaning this fd is currently in the
 * thread's poll cache.  Each lwp polling this file descriptor should call
 * this routine once.
 */
void
addfpollinfo(int fd)
{
	struct uf_entry *ufp;
	fpollinfo_t *fpip;
	uf_info_t *fip = P_FINFO(curproc);

	fpip = kmem_zalloc(sizeof (fpollinfo_t), KM_SLEEP);
	fpip->fp_thread = curthread;
	UF_ENTER(ufp, fip, fd);
	/*
	 * Assert we are not already on the list, that is, that
	 * this lwp did not call addfpollinfo twice for the same fd.
	 */
	ASSERT(!curthread_in_plist(ufp));
	/*
	 * addfpollinfo is always done inside the getf/releasef pair.
	 */
	ASSERT(ufp->uf_refcnt >= 1);
	fpip->fp_next = ufp->uf_fpollinfo;
	ufp->uf_fpollinfo = fpip;
	UF_EXIT(ufp);
}

/*
 * Delete curthread from fpollinfo list if it is there.
 */
void
delfpollinfo(int fd)
{
	struct uf_entry *ufp;
	struct fpollinfo *fpip;
	struct fpollinfo **fpipp;
	uf_info_t *fip = P_FINFO(curproc);

	UF_ENTER(ufp, fip, fd);
	for (fpipp = &ufp->uf_fpollinfo;
	    (fpip = *fpipp) != NULL;
	    fpipp = &fpip->fp_next) {
		if (fpip->fp_thread == curthread) {
			*fpipp = fpip->fp_next;
			kmem_free(fpip, sizeof (fpollinfo_t));
			break;
		}
	}
	/*
	 * Assert that we are not still on the list, that is, that
	 * this lwp did not call addfpollinfo twice for the same fd.
	 */
	ASSERT(!curthread_in_plist(ufp));
	UF_EXIT(ufp);
}

/*
 * fd is associated with a port.  pfd is a pointer to the fd entry in the
 * cache of the port.
 */
void
addfd_port(int fd, portfd_t *pfd)
{
	struct uf_entry *ufp;
	uf_info_t *fip = P_FINFO(curproc);

	UF_ENTER(ufp, fip, fd);
	/*
	 * addfd_port is always done inside the getf/releasef pair.
	 */
	ASSERT(ufp->uf_refcnt >= 1);
	if (ufp->uf_portfd == NULL) {
		/* first entry */
		ufp->uf_portfd = pfd;
		pfd->pfd_next = NULL;
	} else {
		pfd->pfd_next = ufp->uf_portfd;
		ufp->uf_portfd = pfd;
		pfd->pfd_next->pfd_prev = pfd;
	}
	UF_EXIT(ufp);
}

void
delfd_port(int fd, portfd_t *pfd)
{
	struct uf_entry *ufp;
	uf_info_t *fip = P_FINFO(curproc);

	UF_ENTER(ufp, fip, fd);
	/*
	 * delfd_port is always done inside the getf/releasef pair.
	 */
	ASSERT(ufp->uf_refcnt >= 1);
	if (ufp->uf_portfd == pfd) {
		/* remove first entry */
		ufp->uf_portfd = pfd->pfd_next;
	} else {
		pfd->pfd_prev->pfd_next = pfd->pfd_next;
		if (pfd->pfd_next != NULL)
			pfd->pfd_next->pfd_prev = pfd->pfd_prev;
	}
	UF_EXIT(ufp);
}

static void
port_close_fd(portfd_t *pfd)
{
	portfd_t *pfdn;

	/*
	 * At this point, no other thread should access
	 * the portfd_t list for this fd.  The uf_file, uf_portfd
	 * pointers in the uf_entry_t struct for this fd would
	 * be set to NULL.
	 */
	for (; pfd != NULL; pfd = pfdn) {
		pfdn = pfd->pfd_next;
		port_close_pfd(pfd);
	}
}