/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/conf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/var.h>
#include <sys/cpuvar.h>
#include <sys/open.h>
#include <sys/cmn_err.h>
#include <sys/priocntl.h>
#include <sys/procset.h>
#include <sys/prsystm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <sys/poll.h>
#include <sys/rctl.h>
#include <sys/port_impl.h>

#include <c2/audit.h>
#include <sys/nbmlock.h>

#ifdef DEBUG

static uint32_t afd_maxfd;	/* # of entries in maximum allocated array */
static uint32_t afd_alloc;	/* count of kmem_alloc()s */
static uint32_t afd_free;	/* count of kmem_free()s */
static uint32_t afd_wait;	/* count of waits on non-zero ref count */
#define	MAXFD(x)	(afd_maxfd = ((afd_maxfd >= (x))? afd_maxfd : (x)))
#define	COUNT(x)	atomic_add_32(&x, 1)

#else	/* DEBUG */

#define	MAXFD(x)
#define	COUNT(x)

#endif	/* DEBUG */

kmem_cache_t *file_cache;

static int vpsetattr(vnode_t *, vattr_t *, int);

static void port_close_fd(portfd_t *);

/*
 * File descriptor allocation.
 *
 * fd_find(fip, minfd) finds the first available descriptor >= minfd.
 * The most common case is open(2), in which minfd = 0, but we must also
 * support fcntl(fd, F_DUPFD, minfd).
 *
 * The algorithm is as follows: we keep all file descriptors in an infix
 * binary tree in which each node records the number of descriptors
 * allocated in its right subtree, including itself.  Starting at minfd,
 * we ascend the tree until we find a non-fully allocated right subtree.
 * We then descend that subtree in a binary search for the smallest fd.
 * Finally, we ascend the tree again to increment the allocation count
 * of every subtree containing the newly-allocated fd.  Freeing an fd
 * requires only the last step: we ascend the tree to decrement allocation
 * counts.  Each of these three steps (ascent to find non-full subtree,
 * descent to find lowest fd, ascent to update allocation counts) is
 * O(log n), thus the algorithm as a whole is O(log n).
 *
 * We don't implement the fd tree using the customary left/right/parent
 * pointers, but instead take advantage of the glorious mathematics of
 * full infix binary trees.  For reference, here's an illustration of the
 * logical structure of such a tree, rooted at 4 (binary 100), covering
 * the range 1-7 (binary 001-111).  Our canonical trees do not include
 * fd 0; we'll deal with that later.
 *
 *	      100
 *	     /	 \
 *	    /	  \
 *	  010	  110
 *	  / \	  / \
 *	001 011 101 111
 *
 * We make the following observations, all of which are easily proven by
 * induction on the depth of the tree:
 *
 * (T1) The least-significant bit (LSB) of any node is equal to its level
 *	in the tree.  In our example, nodes 001, 011, 101 and 111 are at
 *	level 0; nodes 010 and 110 are at level 1; and node 100 is at level 2.
 *
 * (T2) The child size (CSIZE) of node N -- that is, the total number of
 *	right-branch descendants in a child of node N, including itself -- is
 *	given by clearing all but the least significant bit of N.  This
 *	follows immediately from (T1).  Applying this rule to our example, we
 *	see that CSIZE(100) = 100, CSIZE(x10) = 10, and CSIZE(xx1) = 1.
 *
 * (T3) The nearest left ancestor (LPARENT) of node N -- that is, the nearest
 *	ancestor containing node N in its right child -- is given by clearing
 *	the LSB of N.  For example, LPARENT(111) = 110 and LPARENT(110) = 100.
 *	Clearing the LSB of nodes 001, 010 or 100 yields zero, reflecting
 *	the fact that these are leftmost nodes.  Note that this algorithm
 *	automatically skips generations as necessary.  For example, the parent
 *	of node 101 is 110, which is a *right* ancestor (not what we want);
 *	but its grandparent is 100, which is a left ancestor.  Clearing the LSB
 *	of 101 gets us to 100 directly, skipping right past the uninteresting
 *	generation (110).
 *
 *	Note that since LPARENT clears the LSB, whereas CSIZE clears all *but*
 *	the LSB, we can express LPARENT() nicely in terms of CSIZE():
 *
 *	LPARENT(N) = N - CSIZE(N)
 *
 * (T4) The nearest right ancestor (RPARENT) of node N is given by:
 *
 *	RPARENT(N) = N + CSIZE(N)
 *
 * (T5) For every interior node, the children differ from their parent by
 *	CSIZE(parent) / 2.  In our example, CSIZE(100) / 2 = 2 = 10 binary,
 *	and indeed, the children of 100 are 100 +/- 10 = 010 and 110.
 *
 * Next, we'll need a few two's-complement math tricks.  Suppose a number,
 * N, has the following form:
 *
 *		N = xxxx10...0
 *
 * That is, the binary representation of N consists of some string of bits,
 * then a 1, then all zeroes.  This amounts to nothing more than saying that
 * N has a least-significant bit, which is true for any N != 0.  If we look
 * at N and N - 1 together, we see that we can combine them in useful ways:
 *
 *		  N = xxxx10...0
 *	      N - 1 = xxxx01...1
 *	------------------------
 *	N & (N - 1) = xxxx000000
 *	N | (N - 1) = xxxx111111
 *	N ^ (N - 1) =     111111
 *
 * In particular, this suggests several easy ways to clear all but the LSB,
 * which by (T2) is exactly what we need to determine CSIZE(N) = 10...0.
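 * (As a concrete check, take N = 1100: then N - 1 = 1011, so N & (N - 1) =
 * 1000, N | (N - 1) = 1111, and N ^ (N - 1) = 0111.)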
 * We'll opt for this formulation:
 *
 *	(C1) CSIZE(N) = (N - 1) ^ (N | (N - 1))
 *
 * Similarly, we have an easy way to determine LPARENT(N), which requires
 * that we clear the LSB of N:
 *
 *	(L1) LPARENT(N) = N & (N - 1)
 *
 * We note in the above relations that (N | (N - 1)) - N = CSIZE(N) - 1.
 * When combined with (T4), this yields an easy way to compute RPARENT(N):
 *
 *	(R1) RPARENT(N) = (N | (N - 1)) + 1
 *
 * Finally, to accommodate fd 0 we must adjust all of our results by +/-1 to
 * move the fd range from [1, 2^n) to [0, 2^n - 1).  This is straightforward,
 * so there's no need to belabor the algebra; the revised relations become:
 *
 *	(C1a) CSIZE(N) = N ^ (N | (N + 1))
 *
 *	(L1a) LPARENT(N) = (N & (N + 1)) - 1
 *
 *	(R1a) RPARENT(N) = N | (N + 1)
 *
 * This completes the mathematical framework.  We now have all the tools
 * we need to implement fd_find() and fd_reserve().
 *
 * fd_find(fip, minfd) finds the smallest available file descriptor >= minfd.
 * It does not actually allocate the descriptor; that's done by fd_reserve().
 * fd_find() proceeds in two steps:
 *
 * (1) Find the leftmost subtree that contains a descriptor >= minfd.
 *     We start at the right subtree rooted at minfd.  If this subtree is
 *     not full -- if fip->fi_list[minfd].uf_alloc != CSIZE(minfd) -- then
 *     step 1 is done.  Otherwise, we know that all fds in this subtree
 *     are taken, so we ascend to RPARENT(minfd) using (R1a).  We repeat
 *     this process until we either find a candidate subtree or exceed
 *     fip->fi_nfiles.  We use (C1a) to compute CSIZE().
 *
 * (2) Find the smallest fd in the subtree discovered by step 1.
 *     Starting at the root of this subtree, we descend to find the
 *     smallest available fd.  Since the left children have the smaller
 *     fds, we will descend rightward only when the left child is full.
 *
 *     We begin by comparing the number of allocated fds in the root
 *     to the number of allocated fds in its right child; if they differ
 *     by exactly CSIZE(child), we know the left subtree is full, so we
 *     descend right; that is, the right child becomes the search root.
 *     Otherwise we leave the root alone and start following the right
 *     child's left children.  As fortune would have it, this is very
 *     simple computationally: by (T5), the right child of fd is just
 *     fd + size, where size = CSIZE(fd) / 2.  Applying (T5) again,
 *     we find that the right child's left child is fd + size - (size / 2) =
 *     fd + (size / 2); *its* left child is fd + (size / 2) - (size / 4) =
 *     fd + (size / 4), and so on.  In general, fd's right child's
 *     leftmost nth descendant is fd + (size >> n).  Thus, to follow
 *     the right child's left descendants, we just halve the size in
 *     each iteration of the search.
 *
 *     When we descend leftward, we must keep track of the number of fds
 *     that were allocated in all the right subtrees we rejected, so we
 *     know how many of the root fd's allocations are in the remaining
 *     (as yet unexplored) leftmost part of its right subtree.  When we
 *     encounter a fully-allocated left child -- that is, when we find
 *     that fip->fi_list[fd].uf_alloc == ralloc + size -- we descend right
 *     (as described earlier), resetting ralloc to zero.
 *
 * fd_reserve(fip, fd, incr) either allocates or frees fd, depending
 * on whether incr is 1 or -1.
 * Starting at fd, fd_reserve() ascends the leftmost ancestors (see (T3))
 * and updates the allocation counts.  At each step we use (L1a) to compute
 * LPARENT(), the next left ancestor.
 *
 * flist_minsize() finds the minimal tree that still covers all
 * used fds; as long as the allocation count of a root node is zero, we
 * don't need that node or its right subtree.
 *
 * flist_nalloc() counts the number of allocated fds in the tree, by starting
 * at the top of the tree and summing the right-subtree allocation counts as
 * it descends leftwards.
 *
 * Note: we assume that flist_grow() will keep fip->fi_nfiles of the form
 * 2^n - 1.  This ensures that the fd trees are always full, which saves
 * quite a bit of boundary checking.
 */
static int
fd_find(uf_info_t *fip, int minfd)
{
	int size, ralloc, fd;

	ASSERT(MUTEX_HELD(&fip->fi_lock));
	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);

	for (fd = minfd; (uint_t)fd < fip->fi_nfiles; fd |= fd + 1) {
		size = fd ^ (fd | (fd + 1));
		if (fip->fi_list[fd].uf_alloc == size)
			continue;
		for (ralloc = 0, size >>= 1; size != 0; size >>= 1) {
			ralloc += fip->fi_list[fd + size].uf_alloc;
			if (fip->fi_list[fd].uf_alloc == ralloc + size) {
				fd += size;
				ralloc = 0;
			}
		}
		return (fd);
	}
	return (-1);
}

static void
fd_reserve(uf_info_t *fip, int fd, int incr)
{
	int pfd;
	uf_entry_t *ufp = &fip->fi_list[fd];

	ASSERT((uint_t)fd < fip->fi_nfiles);
	ASSERT((ufp->uf_busy == 0 && incr == 1) ||
	    (ufp->uf_busy == 1 && incr == -1));
	ASSERT(MUTEX_HELD(&ufp->uf_lock));
	ASSERT(MUTEX_HELD(&fip->fi_lock));

	for (pfd = fd; pfd >= 0; pfd = (pfd & (pfd + 1)) - 1)
		fip->fi_list[pfd].uf_alloc += incr;

	ufp->uf_busy += incr;
}

static int
flist_minsize(uf_info_t *fip)
{
	int fd;

	/*
	 * We'd like to ASSERT(MUTEX_HELD(&fip->fi_lock)), but we're called
	 * by flist_fork(), which relies on other mechanisms for mutual
	 * exclusion.
	 */
	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);

	for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
		if (fip->fi_list[fd >> 1].uf_alloc != 0)
			break;

	return (fd);
}

static int
flist_nalloc(uf_info_t *fip)
{
	int fd;
	int nalloc = 0;

	ASSERT(MUTEX_HELD(&fip->fi_lock));
	ASSERT((fip->fi_nfiles & (fip->fi_nfiles + 1)) == 0);

	for (fd = fip->fi_nfiles; fd != 0; fd >>= 1)
		nalloc += fip->fi_list[fd >> 1].uf_alloc;

	return (nalloc);
}

/*
 * Increase size of the fi_list array to accommodate at least maxfd.
 * We keep the size of the form 2^n - 1 for benefit of fd_find().
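 *
 * For example, flist_grow(100) settles on newcnt = 127 (the smallest value
 * of the form 2^n - 1 that exceeds 100), so the grown fi_list covers fds
 * 0 through 126.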
 */
static void
flist_grow(int maxfd)
{
	uf_info_t *fip = P_FINFO(curproc);
	int newcnt, oldcnt;
	uf_entry_t *src, *dst, *newlist, *oldlist, *newend, *oldend;
	uf_rlist_t *urp;

	for (newcnt = 1; newcnt <= maxfd; newcnt = (newcnt << 1) | 1)
		continue;

	newlist = kmem_zalloc(newcnt * sizeof (uf_entry_t), KM_SLEEP);

	mutex_enter(&fip->fi_lock);
	oldcnt = fip->fi_nfiles;
	if (newcnt <= oldcnt) {
		mutex_exit(&fip->fi_lock);
		kmem_free(newlist, newcnt * sizeof (uf_entry_t));
		return;
	}
	ASSERT((newcnt & (newcnt + 1)) == 0);
	oldlist = fip->fi_list;
	oldend = oldlist + oldcnt;
	newend = newlist + oldcnt;	/* no need to lock beyond old end */

	/*
	 * fi_list and fi_nfiles cannot change while any uf_lock is held,
	 * so we must grab all the old locks *and* the new locks up to oldcnt.
	 * (Locks beyond the end of oldcnt aren't visible until we store
	 * the new fi_nfiles, which is the last thing we do before dropping
	 * all the locks, so there's no need to acquire these locks).
	 * Holding the new locks is necessary because when fi_list changes
	 * to point to the new list, fi_nfiles won't have been stored yet.
	 * If we *didn't* hold the new locks, someone doing a UF_ENTER()
	 * could see the new fi_list, grab the new uf_lock, and then see
	 * fi_nfiles change while the lock is held -- in violation of
	 * UF_ENTER() semantics.
	 */
	for (src = oldlist; src < oldend; src++)
		mutex_enter(&src->uf_lock);

	for (dst = newlist; dst < newend; dst++)
		mutex_enter(&dst->uf_lock);

	for (src = oldlist, dst = newlist; src < oldend; src++, dst++) {
		dst->uf_file = src->uf_file;
		dst->uf_fpollinfo = src->uf_fpollinfo;
		dst->uf_refcnt = src->uf_refcnt;
		dst->uf_alloc = src->uf_alloc;
		dst->uf_flag = src->uf_flag;
		dst->uf_busy = src->uf_busy;
		dst->uf_portfd = src->uf_portfd;
	}

	/*
	 * As soon as we store the new flist, future locking operations
	 * will use it.  Therefore, we must ensure that all the state
	 * we've just established reaches global visibility before the
	 * new flist does.
	 */
	membar_producer();
	fip->fi_list = newlist;

	/*
	 * Routines like getf() make an optimistic check on the validity
	 * of the supplied file descriptor: if it's less than the current
	 * value of fi_nfiles -- examined without any locks -- then it's
	 * safe to attempt a UF_ENTER() on that fd (which is a valid
	 * assumption because fi_nfiles only increases).  Therefore, it
	 * is critical that the new value of fi_nfiles not reach global
	 * visibility until after the new fi_list: if it happened the
	 * other way around, getf() could see the new fi_nfiles and attempt
	 * a UF_ENTER() on the old fi_list, which would write beyond its
	 * end if the fd exceeded the old fi_nfiles.
	 */
	membar_producer();
	fip->fi_nfiles = newcnt;

	/*
	 * The new state is consistent now, so we can drop all the locks.
	 */
	for (dst = newlist; dst < newend; dst++)
		mutex_exit(&dst->uf_lock);

	for (src = oldlist; src < oldend; src++) {
		/*
		 * If any threads are blocked on the old cvs, wake them.
		 * This will force them to wake up, discover that fi_list
		 * has changed, and go back to sleep on the new cvs.
		 */
		cv_broadcast(&src->uf_wanted_cv);
		cv_broadcast(&src->uf_closing_cv);
		mutex_exit(&src->uf_lock);
	}

	mutex_exit(&fip->fi_lock);

	/*
	 * Retire the old flist.
	 * We can't actually kmem_free() it now
	 * because someone may still have a pointer to it.  Instead,
	 * we link it onto a list of retired flists.  The new flist
	 * is at least double the size of the previous flist, so the
	 * total size of all retired flists will be less than the size
	 * of the current one (to prove, consider the sum of a geometric
	 * series in powers of 2).  exit() frees the retired flists.
	 */
	urp = kmem_zalloc(sizeof (uf_rlist_t), KM_SLEEP);
	urp->ur_list = oldlist;
	urp->ur_nfiles = oldcnt;

	mutex_enter(&fip->fi_lock);
	urp->ur_next = fip->fi_rlist;
	fip->fi_rlist = urp;
	mutex_exit(&fip->fi_lock);
}

/*
 * Utility functions for keeping track of the active file descriptors.
 */
void
clear_stale_fd()		/* called from post_syscall() */
{
	afd_t *afd = &curthread->t_activefd;
	int i;

	/* uninitialized is ok here, a_nfd is then zero */
	for (i = 0; i < afd->a_nfd; i++) {
		/* assert that this should not be necessary */
		ASSERT(afd->a_fd[i] == -1);
		afd->a_fd[i] = -1;
	}
	afd->a_stale = 0;
}

void
free_afd(afd_t *afd)		/* called below and from thread_free() */
{
	int i;

	/* free the buffer if it was kmem_alloc()ed */
	if (afd->a_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
		COUNT(afd_free);
		kmem_free(afd->a_fd, afd->a_nfd * sizeof (afd->a_fd[0]));
	}

	/* (re)initialize the structure */
	afd->a_fd = &afd->a_buf[0];
	afd->a_nfd = sizeof (afd->a_buf) / sizeof (afd->a_buf[0]);
	afd->a_stale = 0;
	for (i = 0; i < afd->a_nfd; i++)
		afd->a_fd[i] = -1;
}

static void
set_active_fd(int fd)
{
	afd_t *afd = &curthread->t_activefd;
	int i;
	int *old_fd;
	int old_nfd;
	int *new_fd;
	int new_nfd;

	if (afd->a_nfd == 0) {	/* first time initialization */
		ASSERT(fd == -1);
		mutex_enter(&afd->a_fdlock);
		free_afd(afd);
		mutex_exit(&afd->a_fdlock);
	}

	/* insert fd into vacant slot, if any */
	for (i = 0; i < afd->a_nfd; i++) {
		if (afd->a_fd[i] == -1) {
			afd->a_fd[i] = fd;
			return;
		}
	}

	/*
	 * Reallocate the a_fd[] array to add one more slot.
	 */
	ASSERT(fd == -1);
	old_nfd = afd->a_nfd;
	old_fd = afd->a_fd;
	new_nfd = old_nfd + 1;
	new_fd = kmem_alloc(new_nfd * sizeof (afd->a_fd[0]), KM_SLEEP);
	MAXFD(new_nfd);
	COUNT(afd_alloc);

	mutex_enter(&afd->a_fdlock);
	afd->a_fd = new_fd;
	afd->a_nfd = new_nfd;
	for (i = 0; i < old_nfd; i++)
		afd->a_fd[i] = old_fd[i];
	afd->a_fd[i] = fd;
	mutex_exit(&afd->a_fdlock);

	if (old_nfd > sizeof (afd->a_buf) / sizeof (afd->a_buf[0])) {
		COUNT(afd_free);
		kmem_free(old_fd, old_nfd * sizeof (afd->a_fd[0]));
	}
}

void
clear_active_fd(int fd)		/* called below and from aio.c */
{
	afd_t *afd = &curthread->t_activefd;
	int i;

	for (i = 0; i < afd->a_nfd; i++) {
		if (afd->a_fd[i] == fd) {
			afd->a_fd[i] = -1;
			break;
		}
	}
	ASSERT(i < afd->a_nfd);		/* not found is not ok */
}

/*
 * Does this thread have this fd active?
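 * (Used by closeandsetf() below when it scans the other lwps in the
 * process for stale references to a descriptor being closed.)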
 */
static int
is_active_fd(kthread_t *t, int fd)
{
	afd_t *afd = &t->t_activefd;
	int i;

	ASSERT(t != curthread);
	mutex_enter(&afd->a_fdlock);
	/* uninitialized is ok here, a_nfd is then zero */
	for (i = 0; i < afd->a_nfd; i++) {
		if (afd->a_fd[i] == fd) {
			mutex_exit(&afd->a_fdlock);
			return (1);
		}
	}
	mutex_exit(&afd->a_fdlock);
	return (0);
}

/*
 * Convert a user supplied file descriptor into a pointer to a file
 * structure.  Only task is to check range of the descriptor (soft
 * resource limit was enforced at open time and shouldn't be checked
 * here).
 */
file_t *
getf(int fd)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	file_t *fp;

	if ((uint_t)fd >= fip->fi_nfiles)
		return (NULL);

	/*
	 * Reserve a slot in the active fd array now so we can call
	 * set_active_fd(fd) for real below, while still inside UF_ENTER().
	 */
	set_active_fd(-1);

	UF_ENTER(ufp, fip, fd);

	if ((fp = ufp->uf_file) == NULL) {
		UF_EXIT(ufp);

		if (fd == fip->fi_badfd && fip->fi_action > 0)
			tsignal(curthread, fip->fi_action);

		return (NULL);
	}
	ufp->uf_refcnt++;

	set_active_fd(fd);	/* record the active file descriptor */

	UF_EXIT(ufp);

	return (fp);
}

/*
 * Close whatever file currently occupies the file descriptor slot
 * and install the new file, usually NULL, in the file descriptor slot.
 * The close must complete before we release the file descriptor slot.
 * If newfp != NULL we only return an error if we can't allocate the
 * slot so the caller knows that it needs to free the filep;
 * in the other cases we return the error number from closef().
 */
int
closeandsetf(int fd, file_t *newfp)
{
	proc_t *p = curproc;
	uf_info_t *fip = P_FINFO(p);
	uf_entry_t *ufp;
	file_t *fp;
	fpollinfo_t *fpip;
	portfd_t *pfd;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles) {
		if (newfp == NULL)
			return (EBADF);
		flist_grow(fd);
	}

	if (newfp != NULL) {
		/*
		 * If ufp is reserved but has no file pointer, it's in the
		 * transition between ufalloc() and setf().  We must wait
		 * for this transition to complete before assigning the
		 * new non-NULL file pointer.
		 */
		mutex_enter(&fip->fi_lock);
		if (fd == fip->fi_badfd) {
			mutex_exit(&fip->fi_lock);
			if (fip->fi_action > 0)
				tsignal(curthread, fip->fi_action);
			return (EBADF);
		}
		UF_ENTER(ufp, fip, fd);
		while (ufp->uf_busy && ufp->uf_file == NULL) {
			mutex_exit(&fip->fi_lock);
			cv_wait_stop(&ufp->uf_wanted_cv, &ufp->uf_lock, 250);
			UF_EXIT(ufp);
			mutex_enter(&fip->fi_lock);
			UF_ENTER(ufp, fip, fd);
		}
		if ((fp = ufp->uf_file) == NULL) {
			ASSERT(ufp->uf_fpollinfo == NULL);
			ASSERT(ufp->uf_flag == 0);
			fd_reserve(fip, fd, 1);
			ufp->uf_file = newfp;
			UF_EXIT(ufp);
			mutex_exit(&fip->fi_lock);
			return (0);
		}
		mutex_exit(&fip->fi_lock);
	} else {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) == NULL) {
			UF_EXIT(ufp);
			return (EBADF);
		}
	}

	ASSERT(ufp->uf_busy);
	ufp->uf_file = NULL;
	ufp->uf_flag = 0;

	/*
	 * If the file descriptor reference count is non-zero, then
	 * some other lwp in the process is performing system call
	 * activity on the file.
	 * To avoid blocking here for a long
	 * time (the other lwp might be in a long term sleep in its
	 * system call), we scan all other lwps in the process to
	 * find the ones with this fd as one of their active fds,
	 * set their a_stale flag, and set them running if they
	 * are in an interruptible sleep so they will emerge from
	 * their system calls immediately.  post_syscall() will
	 * test the a_stale flag and set errno to EBADF.
	 */
	ASSERT(ufp->uf_refcnt == 0 || p->p_lwpcnt > 1);
	if (ufp->uf_refcnt > 0) {
		kthread_t *t;

		/*
		 * We call sprlock_proc(p) to ensure that the thread
		 * list will not change while we are scanning it.
		 * To do this, we must drop ufp->uf_lock and then
		 * reacquire it (so we are not holding both p->p_lock
		 * and ufp->uf_lock at the same time).  ufp->uf_lock
		 * must be held for is_active_fd() to be correct
		 * (set_active_fd() is called while holding ufp->uf_lock).
		 *
		 * This is a convoluted dance, but it is better than
		 * the old brute-force method of stopping every thread
		 * in the process by calling holdlwps(SHOLDFORK1).
		 */

		UF_EXIT(ufp);
		COUNT(afd_wait);

		mutex_enter(&p->p_lock);
		sprlock_proc(p);
		mutex_exit(&p->p_lock);

		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_file == NULL);

		if (ufp->uf_refcnt > 0) {
			for (t = curthread->t_forw;
			    t != curthread;
			    t = t->t_forw) {
				if (is_active_fd(t, fd)) {
					thread_lock(t);
					t->t_activefd.a_stale = 1;
					t->t_post_sys = 1;
					if (ISWAKEABLE(t))
						setrun_locked(t);
					thread_unlock(t);
				}
			}
		}

		UF_EXIT(ufp);

		mutex_enter(&p->p_lock);
		sprunlock(p);

		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_file == NULL);
	}

	/*
	 * Wait for other lwps to stop using this file descriptor.
	 */
	while (ufp->uf_refcnt > 0) {
		cv_wait_stop(&ufp->uf_closing_cv, &ufp->uf_lock, 250);
		/*
		 * cv_wait_stop() drops ufp->uf_lock, so the file list
		 * can change.  Drop the lock on our (possibly) stale
		 * ufp and let UF_ENTER() find and lock the current ufp.
		 */
		UF_EXIT(ufp);
		UF_ENTER(ufp, fip, fd);
	}

#ifdef DEBUG
	/*
	 * catch a watchfd on device's pollhead list but not on fpollinfo list
	 */
	if (ufp->uf_fpollinfo != NULL)
		checkwfdlist(fp->f_vnode, ufp->uf_fpollinfo);
#endif	/* DEBUG */

	/*
	 * We may need to cleanup some cached poll states in t_pollstate
	 * before the fd can be reused.  It is important that we don't
	 * access a stale thread structure.  We will do the cleanup in two
	 * phases to avoid deadlock and holding uf_lock for too long.
	 * In phase 1, hold the uf_lock and call pollblockexit() to set
	 * state in t_pollstate struct so that a thread does not exit on
	 * us.  In phase 2, we drop the uf_lock and call pollcacheclean().
	 */
	pfd = ufp->uf_portfd;
	ufp->uf_portfd = NULL;
	fpip = ufp->uf_fpollinfo;
	ufp->uf_fpollinfo = NULL;
	if (fpip != NULL)
		pollblockexit(fpip);
	UF_EXIT(ufp);
	if (fpip != NULL)
		pollcacheclean(fpip, fd);
	if (pfd)
		port_close_fd(pfd);

	/*
	 * Keep the file descriptor entry reserved across the closef().
	 */
	error = closef(fp);

	setf(fd, newfp);

	/* Only return closef() error when closing is all we do */
	return (newfp == NULL ? error : 0);
}

/*
 * Decrement uf_refcnt; wakeup anyone waiting to close the file.
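 * This undoes the hold taken by getf() and removes the fd from the
 * calling thread's set of active file descriptors.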
 */
void
releasef(int fd)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	UF_ENTER(ufp, fip, fd);
	ASSERT(ufp->uf_refcnt > 0);
	clear_active_fd(fd);	/* clear the active file descriptor */
	if (--ufp->uf_refcnt == 0)
		cv_broadcast(&ufp->uf_closing_cv);
	UF_EXIT(ufp);
}

/*
 * Identical to releasef() but can be called from another process.
 */
void
areleasef(int fd, uf_info_t *fip)
{
	uf_entry_t *ufp;

	UF_ENTER(ufp, fip, fd);
	ASSERT(ufp->uf_refcnt > 0);
	if (--ufp->uf_refcnt == 0)
		cv_broadcast(&ufp->uf_closing_cv);
	UF_EXIT(ufp);
}

/*
 * Duplicate all file descriptors across a fork.
 */
void
flist_fork(uf_info_t *pfip, uf_info_t *cfip)
{
	int fd, nfiles;
	uf_entry_t *pufp, *cufp;

	mutex_init(&cfip->fi_lock, NULL, MUTEX_DEFAULT, NULL);
	cfip->fi_rlist = NULL;

	/*
	 * We don't need to hold fi_lock because all other lwp's in the
	 * parent have been held.
	 */
	cfip->fi_nfiles = nfiles = flist_minsize(pfip);

	cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP);

	for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles;
	    fd++, pufp++, cufp++) {
		cufp->uf_file = pufp->uf_file;
		cufp->uf_alloc = pufp->uf_alloc;
		cufp->uf_flag = pufp->uf_flag;
		cufp->uf_busy = pufp->uf_busy;
		if (pufp->uf_file == NULL) {
			ASSERT(pufp->uf_flag == 0);
			if (pufp->uf_busy) {
				/*
				 * Grab locks to appease ASSERTs in fd_reserve
				 */
				mutex_enter(&cfip->fi_lock);
				mutex_enter(&cufp->uf_lock);
				fd_reserve(cfip, fd, -1);
				mutex_exit(&cufp->uf_lock);
				mutex_exit(&cfip->fi_lock);
			}
		}
	}
}

/*
 * Close all open file descriptors for the current process.
 * This is only called from exit(), which is single-threaded,
 * so we don't need any locking.
 */
void
closeall(uf_info_t *fip)
{
	int fd;
	file_t *fp;
	uf_entry_t *ufp;

	ufp = fip->fi_list;
	for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
		if ((fp = ufp->uf_file) != NULL) {
			ufp->uf_file = NULL;
			if (ufp->uf_portfd != NULL) {
				portfd_t *pfd;
				/* remove event port association */
				pfd = ufp->uf_portfd;
				ufp->uf_portfd = NULL;
				port_close_fd(pfd);
			}
			ASSERT(ufp->uf_fpollinfo == NULL);
			(void) closef(fp);
		}
	}

	kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
	fip->fi_list = NULL;
	fip->fi_nfiles = 0;
	while (fip->fi_rlist != NULL) {
		uf_rlist_t *urp = fip->fi_rlist;
		fip->fi_rlist = urp->ur_next;
		kmem_free(urp->ur_list, urp->ur_nfiles * sizeof (uf_entry_t));
		kmem_free(urp, sizeof (uf_rlist_t));
	}
}

/*
 * Internal form of close.  Decrement reference count on file
 * structure.  Decrement reference count on the vnode following
 * removal of the referencing file structure.
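 * VOP_CLOSE() is called on every closef(); the vnode is released and
 * the file structure freed only when the last f_count reference is gone.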
 */
int
closef(file_t *fp)
{
	vnode_t *vp;
	int error;
	int count;
	int flag;
	offset_t offset;

	/*
	 * audit close of file (may be exit)
	 */
	if (AU_AUDITING())
		audit_closef(fp);
	ASSERT(MUTEX_NOT_HELD(&P_FINFO(curproc)->fi_lock));

	mutex_enter(&fp->f_tlock);

	ASSERT(fp->f_count > 0);

	count = fp->f_count--;
	flag = fp->f_flag;
	offset = fp->f_offset;

	vp = fp->f_vnode;

	error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL);

	if (count > 1) {
		mutex_exit(&fp->f_tlock);
		return (error);
	}
	ASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_tlock);

	VN_RELE(vp);
	/*
	 * deallocate resources to audit_data
	 */
	if (audit_active)
		audit_unfalloc(fp);
	crfree(fp->f_cred);
	kmem_cache_free(file_cache, fp);
	return (error);
}

/*
 * This is a combination of ufalloc() and setf().
 */
int
ufalloc_file(int start, file_t *fp)
{
	proc_t *p = curproc;
	uf_info_t *fip = P_FINFO(p);
	int filelimit;
	uf_entry_t *ufp;
	int nfiles;
	int fd;

	/*
	 * The assertion ensures that the assignment to filelimit below
	 * does not truncate when p_fno_ctl is cast to int.
	 */
	ASSERT(p->p_fno_ctl <= INT_MAX);
	filelimit = (int)p->p_fno_ctl;

	for (;;) {
		mutex_enter(&fip->fi_lock);
		fd = fd_find(fip, start);
		if (fd >= 0 && fd == fip->fi_badfd) {
			start = fd + 1;
			mutex_exit(&fip->fi_lock);
			continue;
		}
		if ((uint_t)fd < filelimit)
			break;
		if (fd >= filelimit) {
			mutex_exit(&fip->fi_lock);
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
			    p->p_rctls, p, RCA_SAFE);
			mutex_exit(&p->p_lock);
			return (-1);
		}
		/* fd_find() returned -1 */
		nfiles = fip->fi_nfiles;
		mutex_exit(&fip->fi_lock);
		flist_grow(MAX(start, nfiles));
	}

	UF_ENTER(ufp, fip, fd);
	fd_reserve(fip, fd, 1);
	ASSERT(ufp->uf_file == NULL);
	ufp->uf_file = fp;
	UF_EXIT(ufp);
	mutex_exit(&fip->fi_lock);
	return (fd);
}

/*
 * Allocate a user file descriptor greater than or equal to "start".
 */
int
ufalloc(int start)
{
	return (ufalloc_file(start, NULL));
}

/*
 * Check that a future allocation of count fds on proc p has a good
 * chance of succeeding.  If not, do rctl processing as if we'd failed
 * the allocation.
 *
 * Our caller must guarantee that p cannot disappear underneath us.
 */
int
ufcanalloc(proc_t *p, uint_t count)
{
	uf_info_t *fip = P_FINFO(p);
	int filelimit;
	int current;

	if (count == 0)
		return (1);

	ASSERT(p->p_fno_ctl <= INT_MAX);
	filelimit = (int)p->p_fno_ctl;

	mutex_enter(&fip->fi_lock);
	current = flist_nalloc(fip);		/* # of in-use descriptors */
	mutex_exit(&fip->fi_lock);

	/*
	 * If count is a positive integer, the worst that can happen is
	 * an overflow to a negative value, which is caught by the >= 0 check.
	 */
	current += count;
	if (count <= INT_MAX && current >= 0 && current <= filelimit)
		return (1);

	mutex_enter(&p->p_lock);
	(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
	    p->p_rctls, p, RCA_SAFE);
	mutex_exit(&p->p_lock);
	return (0);
}

/*
 * Allocate a user file descriptor and a file structure.
 * Initialize the descriptor to point at the file structure.
 * If fdp is NULL, the user file descriptor will not be allocated.
 */
int
falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp)
{
	file_t *fp;
	int fd;

	if (fdp) {
		if ((fd = ufalloc(0)) == -1)
			return (EMFILE);
	}
	fp = kmem_cache_alloc(file_cache, KM_SLEEP);
	/*
	 * Note: falloc returns the fp locked
	 */
	mutex_enter(&fp->f_tlock);
	fp->f_count = 1;
	fp->f_flag = (ushort_t)flag;
	fp->f_vnode = vp;
	fp->f_offset = 0;
	fp->f_audit_data = 0;
	crhold(fp->f_cred = CRED());
	/*
	 * allocate resources to audit_data
	 */
	if (audit_active)
		audit_falloc(fp);
	*fpp = fp;
	if (fdp)
		*fdp = fd;
	return (0);
}

/*ARGSUSED*/
static int
file_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	file_t *fp = buf;

	mutex_init(&fp->f_tlock, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED*/
static void
file_cache_destructor(void *buf, void *cdrarg)
{
	file_t *fp = buf;

	mutex_destroy(&fp->f_tlock);
}

void
finit()
{
	file_cache = kmem_cache_create("file_cache", sizeof (file_t), 0,
	    file_cache_constructor, file_cache_destructor, NULL, NULL, NULL, 0);
}

void
unfalloc(file_t *fp)
{
	ASSERT(MUTEX_HELD(&fp->f_tlock));
	if (--fp->f_count <= 0) {
		/*
		 * deallocate resources to audit_data
		 */
		if (audit_active)
			audit_unfalloc(fp);
		crfree(fp->f_cred);
		mutex_exit(&fp->f_tlock);
		kmem_cache_free(file_cache, fp);
	} else
		mutex_exit(&fp->f_tlock);
}

/*
 * Given a file descriptor, set the user's
 * file pointer to the given parameter.
 */
void
setf(int fd, file_t *fp)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	if (AU_AUDITING())
		audit_setf(fp, fd);

	if (fp == NULL) {
		mutex_enter(&fip->fi_lock);
		UF_ENTER(ufp, fip, fd);
		fd_reserve(fip, fd, -1);
		mutex_exit(&fip->fi_lock);
	} else {
		UF_ENTER(ufp, fip, fd);
		ASSERT(ufp->uf_busy);
	}
	ASSERT(ufp->uf_fpollinfo == NULL);
	ASSERT(ufp->uf_flag == 0);
	ufp->uf_file = fp;
	cv_broadcast(&ufp->uf_wanted_cv);
	UF_EXIT(ufp);
}

/*
 * Given a file descriptor, return the file table flags, plus,
 * if this is a socket in asynchronous mode, the FASYNC flag.
 * getf() may or may not have been called before calling f_getfl().
 */
int
f_getfl(int fd, int *flagp)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	file_t *fp;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles)
		error = EBADF;
	else {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) == NULL)
			error = EBADF;
		else {
			vnode_t *vp = fp->f_vnode;
			int flag = fp->f_flag;

			/*
			 * BSD fcntl() FASYNC compatibility.
			 */
			if (vp->v_type == VSOCK)
				flag |= sock_getfasync(vp);
			*flagp = flag;
			error = 0;
		}
		UF_EXIT(ufp);
	}

	return (error);
}

/*
 * Given a file descriptor, return the user's file flags.
 * Force the FD_CLOEXEC flag for writable self-open /proc files.
 * getf() may or may not have been called before calling f_getfd_error().
 */
int
f_getfd_error(int fd, int *flagp)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	file_t *fp;
	int flag;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles)
		error = EBADF;
	else {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) == NULL)
			error = EBADF;
		else {
			flag = ufp->uf_flag;
			if ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode))
				flag |= FD_CLOEXEC;
			*flagp = flag;
			error = 0;
		}
		UF_EXIT(ufp);
	}

	return (error);
}

/*
 * getf() must have been called before calling f_getfd().
 */
char
f_getfd(int fd)
{
	int flag = 0;
	(void) f_getfd_error(fd, &flag);
	return ((char)flag);
}

/*
 * Given a file descriptor and file flags, set the user's file flags.
 * At present, the only valid flag is FD_CLOEXEC.
 * getf() may or may not have been called before calling f_setfd_error().
 */
int
f_setfd_error(int fd, int flags)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	int error;

	if ((uint_t)fd >= fip->fi_nfiles)
		error = EBADF;
	else {
		UF_ENTER(ufp, fip, fd);
		if (ufp->uf_file == NULL)
			error = EBADF;
		else {
			ufp->uf_flag = flags & FD_CLOEXEC;
			error = 0;
		}
		UF_EXIT(ufp);
	}
	return (error);
}

void
f_setfd(int fd, char flags)
{
	(void) f_setfd_error(fd, flags);
}

#define	BADFD_MIN	3
#define	BADFD_MAX	255

/*
 * Attempt to allocate a file descriptor which is bad and which
 * is "poison" to the application.  It cannot be closed (except
 * on exec), allocated for a different use, etc.
 */
int
f_badfd(int start, int *fdp, int action)
{
	int fdr;
	int badfd;
	uf_info_t *fip = P_FINFO(curproc);

#ifdef _LP64
	/* No restrictions on 64 bit _file */
	if (get_udatamodel() != DATAMODEL_ILP32)
		return (EINVAL);
#endif

	if (start > BADFD_MAX || start < BADFD_MIN)
		return (EINVAL);

	if (action >= NSIG || action < 0)
		return (EINVAL);

	mutex_enter(&fip->fi_lock);
	badfd = fip->fi_badfd;
	mutex_exit(&fip->fi_lock);

	if (badfd != -1)
		return (EAGAIN);

	fdr = ufalloc(start);

	if (fdr > BADFD_MAX) {
		setf(fdr, NULL);
		return (EMFILE);
	}
	if (fdr < 0)
		return (EMFILE);

	mutex_enter(&fip->fi_lock);
	if (fip->fi_badfd != -1) {
		/* Lost race */
		mutex_exit(&fip->fi_lock);
		setf(fdr, NULL);
		return (EAGAIN);
	}
	fip->fi_action = action;
	fip->fi_badfd = fdr;
	mutex_exit(&fip->fi_lock);
	setf(fdr, NULL);

	*fdp = fdr;

	return (0);
}

/*
 * Allocate a file descriptor and assign it to the vnode "*vpp",
 * performing the usual open protocol upon it and returning the
 * file descriptor allocated.  It is the responsibility of the
 * caller to dispose of "*vpp" if any error occurs.
 */
int
fassign(vnode_t **vpp, int mode, int *fdp)
{
	file_t *fp;
	int error;
	int fd;

	if (error = falloc((vnode_t *)NULL, mode, &fp, &fd))
		return (error);
	if (error = VOP_OPEN(vpp, mode, fp->f_cred, NULL)) {
		setf(fd, NULL);
		unfalloc(fp);
		return (error);
	}
	fp->f_vnode = *vpp;
	mutex_exit(&fp->f_tlock);
	/*
	 * Fill in the slot falloc reserved.
	 */
	setf(fd, fp);
	*fdp = fd;
	return (0);
}

/*
 * When a process forks it must increment the f_count of all file pointers
 * since there is a new process pointing at them.  fcnt_add(fip, 1) does this.
 * Since we are called when there is only 1 active lwp we don't need to
 * hold fi_lock or any uf_lock.  If the fork fails, fork_fail() calls
 * fcnt_add(fip, -1) to restore the counts.
 */
void
fcnt_add(uf_info_t *fip, int incr)
{
	int i;
	uf_entry_t *ufp;
	file_t *fp;

	ufp = fip->fi_list;
	for (i = 0; i < fip->fi_nfiles; i++, ufp++) {
		if ((fp = ufp->uf_file) != NULL) {
			mutex_enter(&fp->f_tlock);
			ASSERT((incr == 1 && fp->f_count >= 1) ||
			    (incr == -1 && fp->f_count >= 2));
			fp->f_count += incr;
			mutex_exit(&fp->f_tlock);
		}
	}
}

/*
 * This is called from exec to close all fd's that have the FD_CLOEXEC flag
 * set and also to close all self-open for write /proc file descriptors.
 */
void
close_exec(uf_info_t *fip)
{
	int fd;
	file_t *fp;
	fpollinfo_t *fpip;
	uf_entry_t *ufp;
	portfd_t *pfd;

	ufp = fip->fi_list;
	for (fd = 0; fd < fip->fi_nfiles; fd++, ufp++) {
		if ((fp = ufp->uf_file) != NULL &&
		    ((ufp->uf_flag & FD_CLOEXEC) ||
		    ((fp->f_flag & FWRITE) && pr_isself(fp->f_vnode)))) {
			fpip = ufp->uf_fpollinfo;
			mutex_enter(&fip->fi_lock);
			mutex_enter(&ufp->uf_lock);
			fd_reserve(fip, fd, -1);
			mutex_exit(&fip->fi_lock);
			ufp->uf_file = NULL;
			ufp->uf_fpollinfo = NULL;
			ufp->uf_flag = 0;
			/*
			 * We may need to cleanup some cached poll states
			 * in t_pollstate before the fd can be reused.  It
			 * is important that we don't access a stale thread
			 * structure.  We will do the cleanup in two
			 * phases to avoid deadlock and holding uf_lock for
			 * too long.  In phase 1, hold the uf_lock and call
			 * pollblockexit() to set state in t_pollstate struct
			 * so that a thread does not exit on us.  In phase 2,
			 * we drop the uf_lock and call pollcacheclean().
			 */
			pfd = ufp->uf_portfd;
			ufp->uf_portfd = NULL;
			if (fpip != NULL)
				pollblockexit(fpip);
			mutex_exit(&ufp->uf_lock);
			if (fpip != NULL)
				pollcacheclean(fpip, fd);
			if (pfd)
				port_close_fd(pfd);
			(void) closef(fp);
		}
	}

	/* Reset bad fd */
	fip->fi_badfd = -1;
	fip->fi_action = -1;
}

/*
 * Common routine for modifying attributes of named files.
 */
int
namesetattr(char *fnamep, enum symfollow followlink, vattr_t *vap, int flags)
{
	vnode_t *vp;
	int error = 0;

	if (error = lookupname(fnamep, UIO_USERSPACE, followlink, NULLVPP, &vp))
		return (set_errno(error));
	if (error = vpsetattr(vp, vap, flags))
		(void) set_errno(error);
	VN_RELE(vp);
	return (error);
}

/*
 * Common routine for modifying attributes of files referenced
 * by descriptor.
 */
int
fdsetattr(int fd, vattr_t *vap)
{
	file_t *fp;
	vnode_t *vp;
	int error = 0;

	if ((fp = getf(fd)) != NULL) {
		vp = fp->f_vnode;
		if (error = vpsetattr(vp, vap, 0)) {
			(void) set_errno(error);
		}
		releasef(fd);
	} else
		error = set_errno(EBADF);
	return (error);
}

/*
 * Common routine to set the attributes for the given vnode.
 * If the vnode is a file and the filesize is being manipulated,
 * this makes sure that there are no conflicting non-blocking
 * mandatory locks in that region.
 */
static int
vpsetattr(vnode_t *vp, vattr_t *vap, int flags)
{
	int error = 0;
	int in_crit = 0;
	u_offset_t begin;
	vattr_t vattr;
	ssize_t length;

	if (vn_is_readonly(vp)) {
		error = EROFS;
	}
	if (!error && (vap->va_mask & AT_SIZE) &&
	    nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		vattr.va_mask = AT_SIZE;
		if (!(error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
			begin = vap->va_size > vattr.va_size ?
			    vattr.va_size : vap->va_size;
			length = vattr.va_size > vap->va_size ?
			    vattr.va_size - vap->va_size :
			    vap->va_size - vattr.va_size;

			if (nbl_conflict(vp, NBL_WRITE, begin, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}
	}
	if (!error)
		error = VOP_SETATTR(vp, vap, flags, CRED(), NULL);

	if (in_crit)
		nbl_end_crit(vp);

	return (error);
}

/*
 * Return true if the given vnode is referenced by any
 * entry in the current process's file descriptor table.
 */
int
fisopen(vnode_t *vp)
{
	int fd;
	file_t *fp;
	vnode_t *ovp;
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	mutex_enter(&fip->fi_lock);
	for (fd = 0; fd < fip->fi_nfiles; fd++) {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) != NULL &&
		    (ovp = fp->f_vnode) != NULL && VN_CMP(vp, ovp)) {
			UF_EXIT(ufp);
			mutex_exit(&fip->fi_lock);
			return (1);
		}
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
	return (0);
}

/*
 * Return zero if at least one file currently open (by curproc) shouldn't be
 * allowed to change zones.
 */
int
files_can_change_zones(void)
{
	int fd;
	file_t *fp;
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	mutex_enter(&fip->fi_lock);
	for (fd = 0; fd < fip->fi_nfiles; fd++) {
		UF_ENTER(ufp, fip, fd);
		if ((fp = ufp->uf_file) != NULL &&
		    !vn_can_change_zones(fp->f_vnode)) {
			UF_EXIT(ufp);
			mutex_exit(&fip->fi_lock);
			return (0);
		}
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
	return (1);
}

#ifdef DEBUG

/*
 * The following functions are only used in ASSERT()s elsewhere.
 * They do not modify the state of the system.
 */

/*
 * Return true (1) if the current thread is in the fpollinfo
 * list for this file descriptor, else false (0).
 */
static int
curthread_in_plist(uf_entry_t *ufp)
{
	fpollinfo_t *fpip;

	ASSERT(MUTEX_HELD(&ufp->uf_lock));
	for (fpip = ufp->uf_fpollinfo; fpip; fpip = fpip->fp_next)
		if (fpip->fp_thread == curthread)
			return (1);
	return (0);
}

/*
 * Sanity check to make sure that after lwp_exit(),
 * curthread does not appear on any fd's fpollinfo list.
 */
void
checkfpollinfo(void)
{
	int fd;
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;

	mutex_enter(&fip->fi_lock);
	for (fd = 0; fd < fip->fi_nfiles; fd++) {
		UF_ENTER(ufp, fip, fd);
		ASSERT(!curthread_in_plist(ufp));
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
}

/*
 * Return true (1) if the current thread is in the fpollinfo
 * list for this file descriptor, else false (0).
 * This is the same as curthread_in_plist(),
 * but is called w/o holding uf_lock.
 */
int
infpollinfo(int fd)
{
	uf_info_t *fip = P_FINFO(curproc);
	uf_entry_t *ufp;
	int rc;

	UF_ENTER(ufp, fip, fd);
	rc = curthread_in_plist(ufp);
	UF_EXIT(ufp);
	return (rc);
}

#endif	/* DEBUG */

/*
 * Add the curthread to fpollinfo list, meaning this fd is currently in the
 * thread's poll cache.  Each lwp polling this file descriptor should call
 * this routine once.
 */
void
addfpollinfo(int fd)
{
	struct uf_entry *ufp;
	fpollinfo_t *fpip;
	uf_info_t *fip = P_FINFO(curproc);

	fpip = kmem_zalloc(sizeof (fpollinfo_t), KM_SLEEP);
	fpip->fp_thread = curthread;
	UF_ENTER(ufp, fip, fd);
	/*
	 * Assert we are not already on the list, that is, that
	 * this lwp did not call addfpollinfo twice for the same fd.
	 */
	ASSERT(!curthread_in_plist(ufp));
	/*
	 * addfpollinfo is always done inside the getf/releasef pair.
	 */
	ASSERT(ufp->uf_refcnt >= 1);
	fpip->fp_next = ufp->uf_fpollinfo;
	ufp->uf_fpollinfo = fpip;
	UF_EXIT(ufp);
}

/*
 * Delete curthread from fpollinfo list if it is there.
 */
void
delfpollinfo(int fd)
{
	struct uf_entry *ufp;
	struct fpollinfo *fpip;
	struct fpollinfo **fpipp;
	uf_info_t *fip = P_FINFO(curproc);

	UF_ENTER(ufp, fip, fd);
	for (fpipp = &ufp->uf_fpollinfo;
	    (fpip = *fpipp) != NULL;
	    fpipp = &fpip->fp_next) {
		if (fpip->fp_thread == curthread) {
			*fpipp = fpip->fp_next;
			kmem_free(fpip, sizeof (fpollinfo_t));
			break;
		}
	}
	/*
	 * Assert that we are not still on the list, that is, that
	 * this lwp did not call addfpollinfo twice for the same fd.
	 */
	ASSERT(!curthread_in_plist(ufp));
	UF_EXIT(ufp);
}

/*
 * fd is associated with a port. pfd is a pointer to the fd entry in the
 * cache of the port.
 */
void
addfd_port(int fd, portfd_t *pfd)
{
	struct uf_entry *ufp;
	uf_info_t *fip = P_FINFO(curproc);

	UF_ENTER(ufp, fip, fd);
	/*
	 * addfd_port is always done inside the getf/releasef pair.
	 */
	ASSERT(ufp->uf_refcnt >= 1);
	if (ufp->uf_portfd == NULL) {
		/* first entry */
		ufp->uf_portfd = pfd;
		pfd->pfd_next = NULL;
	} else {
		pfd->pfd_next = ufp->uf_portfd;
		ufp->uf_portfd = pfd;
		pfd->pfd_next->pfd_prev = pfd;
	}
	UF_EXIT(ufp);
}

void
delfd_port(int fd, portfd_t *pfd)
{
	struct uf_entry *ufp;
	uf_info_t *fip = P_FINFO(curproc);

	UF_ENTER(ufp, fip, fd);
	/*
	 * delfd_port is always done inside the getf/releasef pair.
	 */
	ASSERT(ufp->uf_refcnt >= 1);
	if (ufp->uf_portfd == pfd) {
		/* remove first entry */
		ufp->uf_portfd = pfd->pfd_next;
	} else {
		pfd->pfd_prev->pfd_next = pfd->pfd_next;
		if (pfd->pfd_next != NULL)
			pfd->pfd_next->pfd_prev = pfd->pfd_prev;
	}
	UF_EXIT(ufp);
}

static void
port_close_fd(portfd_t *pfd)
{
	portfd_t *pfdn;

	/*
	 * At this point, no other thread should access
	 * the portfd_t list for this fd.  The uf_file, uf_portfd
	 * pointers in the uf_entry_t struct for this fd would
	 * be set to NULL.
	 */
	for (; pfd != NULL; pfd = pfdn) {
		pfdn = pfd->pfd_next;
		port_close_pfd(pfd);
	}
}
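
/*
 * Illustrative sketch of the getf()/releasef() pattern that consumers of
 * this interface follow (compare fdsetattr() above); the particular VOP
 * invoked here is only an example, not part of this file's interface:
 *
 *	file_t *fp;
 *	int error;
 *
 *	if ((fp = getf(fd)) == NULL)
 *		return (set_errno(EBADF));
 *	error = VOP_FSYNC(fp->f_vnode, FSYNC, fp->f_cred, NULL);
 *	releasef(fd);
 */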