1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 
*/

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/pathname.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/rwstlock.h>
#include <sys/fem.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/conf.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/fcntl.h>
#include <fs/fs_subr.h>
#include <sys/taskq.h>
#include <fs/fs_reparse.h>

/*
 * Determine if this vnode is a file that is read-only.
 * Character devices, block devices and FIFOs are never reported as
 * read-only by this macro, whatever vn_is_readonly() says about them.
 * Note that vp is evaluated more than once.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))

/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;

/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;

/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * forward declarations for reparse point functions
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);

/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_vsd_lock
 * for calls to vsd_set() and vsd_get().
 */

/*
 * vsd_lock protects:
 *	vsd_nkeys - creation and deletion of vsd keys
 *	vsd_list - insertion and deletion of vsd_node in the vsd_list
 *	vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	 /* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs */
static void 		(**vsd_destructor)(void *);

/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.
Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 *	vp - Pointer to the vnode
 *	counter - Partial name structure member to update in vopstats for counts
 *	bytecounter - Partial name structure member to update in vopstats for bytes
 *	bytesval - Value to update in vopstats for bytes
 *	fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 *	vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 *
 * NOTE(review): neither macro references an "fstype" variable or a
 * "vsanchor_fstype[]" array; the per-fstype statistics are reached through
 * vfsp->vfs_fstypevsp.  The two names in the rundown above look stale.
 *
 * Both macros expand to a brace-enclosed block and only touch the
 * statistics when the vnode has a vfs whose vfs_implp is set, the vfs has
 * VFS_STATS set, and the vnode's type is not VBAD.  The fsinfo DTrace
 * probe for the operation is called before the per-vfs counter is
 * incremented; the per-fstype counters are bumped only when
 * vfs_fstypevsp is non-NULL.
 *
 * "vp" is evaluated several times, and "bytesval" up to three times in
 * VOPSTATS_UPDATE_IO(), so arguments must be free of side effects.
 */

#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}

/*
 * If the filesystem does not support XIDs map credential
 * If the vfsp is NULL, perhaps we should also map?
*/
/*
 * NOTE: expands to a brace-enclosed block that reassigns the caller's
 * "cr" variable to crgetmapped(cr) when the vnode's vfs is non-NULL and
 * does not have VFS_XID set.  "cr" must therefore be a modifiable lvalue.
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
}

/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 *
 * iftovt_tab has 16 entries -- presumably one per value of the S_IFMT
 * bits once shifted down to 0..15 (TODO confirm against the IFTOVT()
 * macro).  vttoif_tab has 12 entries -- presumably indexed by enum vtype
 * (TODO confirm against the VTTOIF() macro).
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};

/*
 * The system vnode cache.
 */

kmem_cache_t *vn_cache;


/*
 * Vnode operations vector.
 *
 * Each entry is four values: the operation name, the offset of the
 * matching member within struct vnodeops, and two function pointers.
 * NOTE(review): the two functions are presumably the handler installed
 * when a file system does not supply the operation and the handler used
 * when the operation must fail -- confirm against the definition of
 * fs_operation_trans_def_t.  The table is terminated by an all-NULL entry.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p) fs_rwunlock,
	    (fs_generic_func_p) fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p) fs_dispose,
	    (fs_generic_func_p) fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	NULL, 0, NULL, NULL
};

/* Extensible attribute (xva) routines. */

/*
 * Zero out the structure, set the size of the requested/returned bitmaps,
 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 * to the returned attributes array.
 *
 * The bzero() discards anything previously stored in *xvap; the magic
 * number is set to XVA_MAGIC and xva_rtnattrmapp is pointed at the first
 * element of the structure's own xva_rtnattrmap array.
 */
void
xva_init(xvattr_t *xvap)
{
	bzero(xvap, sizeof (xvattr_t));
	xvap->xva_mapsize = XVA_MAPSIZE;
	xvap->xva_magic = XVA_MAGIC;
	xvap->xva_vattr.va_mask = AT_XVATTR;
	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
}

/*
 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 * structure.  Otherwise, returns NULL.
395 */ 396 xoptattr_t * 397 xva_getxoptattr(xvattr_t *xvap) 398 { 399 xoptattr_t *xoap = NULL; 400 if (xvap->xva_vattr.va_mask & AT_XVATTR) 401 xoap = &xvap->xva_xoptattrs; 402 return (xoap); 403 } 404 405 /* 406 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree. 407 * We use the f_fsid reported by VFS_STATVFS() since we use that for the 408 * kstat name. 409 */ 410 static int 411 vska_compar(const void *n1, const void *n2) 412 { 413 int ret; 414 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid; 415 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid; 416 417 if (p1 < p2) { 418 ret = -1; 419 } else if (p1 > p2) { 420 ret = 1; 421 } else { 422 ret = 0; 423 } 424 425 return (ret); 426 } 427 428 /* 429 * Used to create a single template which will be bcopy()ed to a newly 430 * allocated vsanchor_combo_t structure in new_vsanchor(), below. 431 */ 432 static vopstats_t * 433 create_vopstats_template() 434 { 435 vopstats_t *vsp; 436 437 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP); 438 bzero(vsp, sizeof (*vsp)); /* Start fresh */ 439 440 /* VOP_OPEN */ 441 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64); 442 /* VOP_CLOSE */ 443 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64); 444 /* VOP_READ I/O */ 445 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64); 446 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64); 447 /* VOP_WRITE I/O */ 448 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64); 449 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64); 450 /* VOP_IOCTL */ 451 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64); 452 /* VOP_SETFL */ 453 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64); 454 /* VOP_GETATTR */ 455 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64); 456 /* VOP_SETATTR */ 457 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64); 458 /* VOP_ACCESS */ 459 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64); 460 /* 
VOP_LOOKUP */ 461 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64); 462 /* VOP_CREATE */ 463 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64); 464 /* VOP_REMOVE */ 465 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64); 466 /* VOP_LINK */ 467 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64); 468 /* VOP_RENAME */ 469 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64); 470 /* VOP_MKDIR */ 471 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64); 472 /* VOP_RMDIR */ 473 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64); 474 /* VOP_READDIR I/O */ 475 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64); 476 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes", 477 KSTAT_DATA_UINT64); 478 /* VOP_SYMLINK */ 479 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64); 480 /* VOP_READLINK */ 481 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64); 482 /* VOP_FSYNC */ 483 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64); 484 /* VOP_INACTIVE */ 485 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64); 486 /* VOP_FID */ 487 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64); 488 /* VOP_RWLOCK */ 489 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64); 490 /* VOP_RWUNLOCK */ 491 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64); 492 /* VOP_SEEK */ 493 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64); 494 /* VOP_CMP */ 495 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64); 496 /* VOP_FRLOCK */ 497 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64); 498 /* VOP_SPACE */ 499 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64); 500 /* VOP_REALVP */ 501 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64); 502 /* VOP_GETPAGE */ 503 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64); 504 /* VOP_PUTPAGE */ 505 kstat_named_init(&vsp->nputpage, "nputpage", 
KSTAT_DATA_UINT64); 506 /* VOP_MAP */ 507 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64); 508 /* VOP_ADDMAP */ 509 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64); 510 /* VOP_DELMAP */ 511 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64); 512 /* VOP_POLL */ 513 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64); 514 /* VOP_DUMP */ 515 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64); 516 /* VOP_PATHCONF */ 517 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64); 518 /* VOP_PAGEIO */ 519 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64); 520 /* VOP_DUMPCTL */ 521 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64); 522 /* VOP_DISPOSE */ 523 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64); 524 /* VOP_SETSECATTR */ 525 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64); 526 /* VOP_GETSECATTR */ 527 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64); 528 /* VOP_SHRLOCK */ 529 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64); 530 /* VOP_VNEVENT */ 531 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); 532 /* VOP_REQZCBUF */ 533 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64); 534 /* VOP_RETZCBUF */ 535 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64); 536 537 return (vsp); 538 } 539 540 /* 541 * Creates a kstat structure associated with a vopstats structure. 
542 */ 543 kstat_t * 544 new_vskstat(char *ksname, vopstats_t *vsp) 545 { 546 kstat_t *ksp; 547 548 if (!vopstats_enabled) { 549 return (NULL); 550 } 551 552 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED, 553 sizeof (vopstats_t)/sizeof (kstat_named_t), 554 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); 555 if (ksp) { 556 ksp->ks_data = vsp; 557 kstat_install(ksp); 558 } 559 560 return (ksp); 561 } 562 563 /* 564 * Called from vfsinit() to initialize the support mechanisms for vopstats 565 */ 566 void 567 vopstats_startup() 568 { 569 if (!vopstats_enabled) 570 return; 571 572 /* 573 * Creates the AVL tree which holds per-vfs vopstat anchors. This 574 * is necessary since we need to check if a kstat exists before we 575 * attempt to create it. Also, initialize its lock. 576 */ 577 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t), 578 offsetof(vsk_anchor_t, vsk_node)); 579 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL); 580 581 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache", 582 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL, 583 NULL, NULL, 0); 584 585 /* 586 * Set up the array of pointers for the vopstats-by-FS-type. 587 * The entries will be allocated/initialized as each file system 588 * goes through modload/mod_installfs. 589 */ 590 vopstats_fstype = (vopstats_t **)kmem_zalloc( 591 (sizeof (vopstats_t *) * nfstype), KM_SLEEP); 592 593 /* Set up the global vopstats initialization template */ 594 vs_templatep = create_vopstats_template(); 595 } 596 597 /* 598 * We need to have the all of the counters zeroed. 599 * The initialization of the vopstats_t includes on the order of 600 * 50 calls to kstat_named_init(). Rather that do that on every call, 601 * we do it once in a template (vs_templatep) then bcopy it over. 
602 */ 603 void 604 initialize_vopstats(vopstats_t *vsp) 605 { 606 if (vsp == NULL) 607 return; 608 609 bcopy(vs_templatep, vsp, sizeof (vopstats_t)); 610 } 611 612 /* 613 * If possible, determine which vopstats by fstype to use and 614 * return a pointer to the caller. 615 */ 616 vopstats_t * 617 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp) 618 { 619 int fstype = 0; /* Index into vfssw[] */ 620 vopstats_t *vsp = NULL; 621 622 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || 623 !vopstats_enabled) 624 return (NULL); 625 /* 626 * Set up the fstype. We go to so much trouble because all versions 627 * of NFS use the same fstype in their vfs even though they have 628 * distinct entries in the vfssw[] table. 629 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry. 630 */ 631 if (vswp) { 632 fstype = vswp - vfssw; /* Gets us the index */ 633 } else { 634 fstype = vfsp->vfs_fstype; 635 } 636 637 /* 638 * Point to the per-fstype vopstats. The only valid values are 639 * non-zero positive values less than the number of vfssw[] table 640 * entries. 641 */ 642 if (fstype > 0 && fstype < nfstype) { 643 vsp = vopstats_fstype[fstype]; 644 } 645 646 return (vsp); 647 } 648 649 /* 650 * Generate a kstat name, create the kstat structure, and allocate a 651 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t 652 * to the caller. This must only be called from a mount. 
653 */ 654 vsk_anchor_t * 655 get_vskstat_anchor(vfs_t *vfsp) 656 { 657 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */ 658 statvfs64_t statvfsbuf; /* Needed to find f_fsid */ 659 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */ 660 kstat_t *ksp; /* Ptr to new kstat */ 661 avl_index_t where; /* Location in the AVL tree */ 662 663 if (vfsp == NULL || vfsp->vfs_implp == NULL || 664 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) 665 return (NULL); 666 667 /* Need to get the fsid to build a kstat name */ 668 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) { 669 /* Create a name for our kstats based on fsid */ 670 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx", 671 VOPSTATS_STR, statvfsbuf.f_fsid); 672 673 /* Allocate and initialize the vsk_anchor_t */ 674 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP); 675 bzero(vskp, sizeof (*vskp)); 676 vskp->vsk_fsid = statvfsbuf.f_fsid; 677 678 mutex_enter(&vskstat_tree_lock); 679 if (avl_find(&vskstat_tree, vskp, &where) == NULL) { 680 avl_insert(&vskstat_tree, vskp, where); 681 mutex_exit(&vskstat_tree_lock); 682 683 /* 684 * Now that we've got the anchor in the AVL 685 * tree, we can create the kstat. 686 */ 687 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats); 688 if (ksp) { 689 vskp->vsk_ksp = ksp; 690 } 691 } else { 692 /* Oops, found one! Release memory and lock. */ 693 mutex_exit(&vskstat_tree_lock); 694 kmem_cache_free(vsk_anchor_cache, vskp); 695 vskp = NULL; 696 } 697 } 698 return (vskp); 699 } 700 701 /* 702 * We're in the process of tearing down the vfs and need to cleanup 703 * the data structures associated with the vopstats. Must only be called 704 * from dounmount(). 
705 */ 706 void 707 teardown_vopstats(vfs_t *vfsp) 708 { 709 vsk_anchor_t *vskap; 710 avl_index_t where; 711 712 if (vfsp == NULL || vfsp->vfs_implp == NULL || 713 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) 714 return; 715 716 /* This is a safe check since VFS_STATS must be set (see above) */ 717 if ((vskap = vfsp->vfs_vskap) == NULL) 718 return; 719 720 /* Whack the pointer right away */ 721 vfsp->vfs_vskap = NULL; 722 723 /* Lock the tree, remove the node, and delete the kstat */ 724 mutex_enter(&vskstat_tree_lock); 725 if (avl_find(&vskstat_tree, vskap, &where)) { 726 avl_remove(&vskstat_tree, vskap); 727 } 728 729 if (vskap->vsk_ksp) { 730 kstat_delete(vskap->vsk_ksp); 731 } 732 mutex_exit(&vskstat_tree_lock); 733 734 kmem_cache_free(vsk_anchor_cache, vskap); 735 } 736 737 /* 738 * Read or write a vnode. Called from kernel code. 739 */ 740 int 741 vn_rdwr( 742 enum uio_rw rw, 743 struct vnode *vp, 744 caddr_t base, 745 ssize_t len, 746 offset_t offset, 747 enum uio_seg seg, 748 int ioflag, 749 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ 750 cred_t *cr, 751 ssize_t *residp) 752 { 753 struct uio uio; 754 struct iovec iov; 755 int error; 756 int in_crit = 0; 757 758 if (rw == UIO_WRITE && ISROFILE(vp)) 759 return (EROFS); 760 761 if (len < 0) 762 return (EIO); 763 764 VOPXID_MAP_CR(vp, cr); 765 766 iov.iov_base = base; 767 iov.iov_len = len; 768 uio.uio_iov = &iov; 769 uio.uio_iovcnt = 1; 770 uio.uio_loffset = offset; 771 uio.uio_segflg = (short)seg; 772 uio.uio_resid = len; 773 uio.uio_llimit = ulimit; 774 775 /* 776 * We have to enter the critical region before calling VOP_RWLOCK 777 * to avoid a deadlock with ufs. 778 */ 779 if (nbl_need_check(vp)) { 780 int svmand; 781 782 nbl_start_crit(vp, RW_READER); 783 in_crit = 1; 784 error = nbl_svmand(vp, cr, &svmand); 785 if (error != 0) 786 goto done; 787 if (nbl_conflict(vp, rw == UIO_WRITE ? 
NBL_WRITE : NBL_READ, 788 uio.uio_offset, uio.uio_resid, svmand, NULL)) { 789 error = EACCES; 790 goto done; 791 } 792 } 793 794 (void) VOP_RWLOCK(vp, 795 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); 796 if (rw == UIO_WRITE) { 797 uio.uio_fmode = FWRITE; 798 uio.uio_extflg = UIO_COPY_DEFAULT; 799 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL); 800 } else { 801 uio.uio_fmode = FREAD; 802 uio.uio_extflg = UIO_COPY_CACHED; 803 error = VOP_READ(vp, &uio, ioflag, cr, NULL); 804 } 805 VOP_RWUNLOCK(vp, 806 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); 807 if (residp) 808 *residp = uio.uio_resid; 809 else if (uio.uio_resid) 810 error = EIO; 811 812 done: 813 if (in_crit) 814 nbl_end_crit(vp); 815 return (error); 816 } 817 818 /* 819 * Release a vnode. Call VOP_INACTIVE on last reference or 820 * decrement reference count. 821 * 822 * To avoid race conditions, the v_count is left at 1 for 823 * the call to VOP_INACTIVE. This prevents another thread 824 * from reclaiming and releasing the vnode *before* the 825 * VOP_INACTIVE routine has a chance to destroy the vnode. 826 * We can't have more than 1 thread calling VOP_INACTIVE 827 * on a vnode. 828 */ 829 void 830 vn_rele(vnode_t *vp) 831 { 832 VERIFY(vp->v_count > 0); 833 mutex_enter(&vp->v_lock); 834 if (vp->v_count == 1) { 835 mutex_exit(&vp->v_lock); 836 VOP_INACTIVE(vp, CRED(), NULL); 837 return; 838 } 839 vp->v_count--; 840 mutex_exit(&vp->v_lock); 841 } 842 843 /* 844 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated 845 * as a single reference, so v_count is not decremented until the last DNLC hold 846 * is released. This makes it possible to distinguish vnodes that are referenced 847 * only by the DNLC. 
848 */ 849 void 850 vn_rele_dnlc(vnode_t *vp) 851 { 852 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); 853 mutex_enter(&vp->v_lock); 854 if (--vp->v_count_dnlc == 0) { 855 if (vp->v_count == 1) { 856 mutex_exit(&vp->v_lock); 857 VOP_INACTIVE(vp, CRED(), NULL); 858 return; 859 } 860 vp->v_count--; 861 } 862 mutex_exit(&vp->v_lock); 863 } 864 865 /* 866 * Like vn_rele() except that it clears v_stream under v_lock. 867 * This is used by sockfs when it dismantels the association between 868 * the sockfs node and the vnode in the underlaying file system. 869 * v_lock has to be held to prevent a thread coming through the lookupname 870 * path from accessing a stream head that is going away. 871 */ 872 void 873 vn_rele_stream(vnode_t *vp) 874 { 875 VERIFY(vp->v_count > 0); 876 mutex_enter(&vp->v_lock); 877 vp->v_stream = NULL; 878 if (vp->v_count == 1) { 879 mutex_exit(&vp->v_lock); 880 VOP_INACTIVE(vp, CRED(), NULL); 881 return; 882 } 883 vp->v_count--; 884 mutex_exit(&vp->v_lock); 885 } 886 887 static void 888 vn_rele_inactive(vnode_t *vp) 889 { 890 VOP_INACTIVE(vp, CRED(), NULL); 891 } 892 893 /* 894 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it 895 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering 896 * the file system as a result of releasing the vnode. Note, file systems 897 * already have to handle the race where the vnode is incremented before the 898 * inactive routine is called and does its locking. 899 * 900 * Warning: Excessive use of this routine can lead to performance problems. 901 * This is because taskqs throttle back allocation if too many are created. 
*/
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		/*
		 * Last hold: v_count stays at 1 and the VOP_INACTIVE call is
		 * handed to the taskq.  The dispatch uses TQ_SLEEP and is
		 * VERIFY'd, so a failed dispatch panics rather than leaking
		 * the hold.
		 */
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != NULL);
		return;
	}
	/* Not the last hold: just drop ours. */
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}

/*
 * Open/create a vnode by path name.  Thin wrapper around vn_openat()
 * that passes a NULL startvp and -1 for fd; every other argument is
 * forwarded unchanged, and vn_openat()'s return value is returned as is.
 */
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}


/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;
	int open_done = 0;
	int shrlock_done = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shrlock shr;
	struct shr_locowner shr_own;
	boolean_t create;

	/* Translate the open flags into the V* access bits to check. */
	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

	/*
	 * We need to handle the case of FCREAT | FDIRECTORY and the case of
	 * FEXCL.
If all three are specified, then we always fail because we 986 * cannot create a directory through this interface and FEXCL says we 987 * need to fail the request if we can't create it. If, however, only 988 * FCREAT | FDIRECTORY are specified, then we can treat this as the case 989 * of opening a file that already exists. If it exists, we can do 990 * something and if not, we fail. Effectively FCREAT | FDIRECTORY is 991 * treated as FDIRECTORY. 992 */ 993 if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) == 994 (FCREAT | FDIRECTORY | FEXCL)) { 995 return (EINVAL); 996 } 997 998 if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) { 999 create = B_FALSE; 1000 } else if ((filemode & FCREAT) != 0) { 1001 create = B_TRUE; 1002 } else { 1003 create = B_FALSE; 1004 } 1005 1006 top: 1007 if (create) { 1008 enum vcexcl excl; 1009 1010 /* 1011 * Wish to create a file. 1012 */ 1013 vattr.va_type = VREG; 1014 vattr.va_mode = createmode; 1015 vattr.va_mask = AT_TYPE|AT_MODE; 1016 if (filemode & FTRUNC) { 1017 vattr.va_size = 0; 1018 vattr.va_mask |= AT_SIZE; 1019 } 1020 if (filemode & FEXCL) 1021 excl = EXCL; 1022 else 1023 excl = NONEXCL; 1024 1025 if (error = 1026 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy, 1027 (filemode & ~(FTRUNC|FEXCL)), umask, startvp)) 1028 return (error); 1029 } else { 1030 /* 1031 * Wish to open a file. Just look it up. 1032 */ 1033 if (error = lookupnameat(pnamep, seg, follow, 1034 NULLVPP, &vp, startvp)) { 1035 if ((error == ESTALE) && 1036 fs_need_estale_retry(estale_retry++)) 1037 goto top; 1038 return (error); 1039 } 1040 1041 /* 1042 * Get the attributes to check whether file is large. 1043 * We do this only if the FOFFMAX flag is not set and 1044 * only for regular files. 
1045 */ 1046 1047 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) { 1048 vattr.va_mask = AT_SIZE; 1049 if ((error = VOP_GETATTR(vp, &vattr, 0, 1050 CRED(), NULL))) { 1051 goto out; 1052 } 1053 if (vattr.va_size > (u_offset_t)MAXOFF32_T) { 1054 /* 1055 * Large File API - regular open fails 1056 * if FOFFMAX flag is set in file mode 1057 */ 1058 error = EOVERFLOW; 1059 goto out; 1060 } 1061 } 1062 /* 1063 * Can't write directories, active texts, or 1064 * read-only filesystems. Can't truncate files 1065 * on which mandatory locking is in effect. 1066 */ 1067 if (filemode & (FWRITE|FTRUNC)) { 1068 /* 1069 * Allow writable directory if VDIROPEN flag is set. 1070 */ 1071 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) { 1072 error = EISDIR; 1073 goto out; 1074 } 1075 if (ISROFILE(vp)) { 1076 error = EROFS; 1077 goto out; 1078 } 1079 /* 1080 * Can't truncate files on which 1081 * sysv mandatory locking is in effect. 1082 */ 1083 if (filemode & FTRUNC) { 1084 vnode_t *rvp; 1085 1086 if (VOP_REALVP(vp, &rvp, NULL) != 0) 1087 rvp = vp; 1088 if (rvp->v_filocks != NULL) { 1089 vattr.va_mask = AT_MODE; 1090 if ((error = VOP_GETATTR(vp, 1091 &vattr, 0, CRED(), NULL)) == 0 && 1092 MANDLOCK(vp, vattr.va_mode)) 1093 error = EAGAIN; 1094 } 1095 } 1096 if (error) 1097 goto out; 1098 } 1099 /* 1100 * Check permissions. 1101 */ 1102 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL)) 1103 goto out; 1104 1105 /* 1106 * Require FSEARCH and FDIRECTORY to return a directory. Require 1107 * FEXEC to return a regular file. 1108 */ 1109 if ((filemode & (FSEARCH|FDIRECTORY)) != 0 && 1110 vp->v_type != VDIR) { 1111 error = ENOTDIR; 1112 goto out; 1113 } 1114 if ((filemode & FEXEC) && vp->v_type != VREG) { 1115 error = ENOEXEC; /* XXX: error code? */ 1116 goto out; 1117 } 1118 } 1119 1120 /* 1121 * Do remaining checks for FNOFOLLOW and FNOLINKS. 
1122 */ 1123 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { 1124 error = ELOOP; 1125 goto out; 1126 } 1127 if (filemode & FNOLINKS) { 1128 vattr.va_mask = AT_NLINK; 1129 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) { 1130 goto out; 1131 } 1132 if (vattr.va_nlink != 1) { 1133 error = EMLINK; 1134 goto out; 1135 } 1136 } 1137 1138 /* 1139 * Opening a socket corresponding to the AF_UNIX pathname 1140 * in the filesystem name space is not supported. 1141 * However, VSOCK nodes in namefs are supported in order 1142 * to make fattach work for sockets. 1143 * 1144 * XXX This uses VOP_REALVP to distinguish between 1145 * an unopened namefs node (where VOP_REALVP returns a 1146 * different VSOCK vnode) and a VSOCK created by vn_create 1147 * in some file system (where VOP_REALVP would never return 1148 * a different vnode). 1149 */ 1150 if (vp->v_type == VSOCK) { 1151 struct vnode *nvp; 1152 1153 error = VOP_REALVP(vp, &nvp, NULL); 1154 if (error != 0 || nvp == NULL || nvp == vp || 1155 nvp->v_type != VSOCK) { 1156 error = EOPNOTSUPP; 1157 goto out; 1158 } 1159 } 1160 1161 if ((vp->v_type == VREG) && nbl_need_check(vp)) { 1162 /* get share reservation */ 1163 shr.s_access = 0; 1164 if (filemode & FWRITE) 1165 shr.s_access |= F_WRACC; 1166 if (filemode & FREAD) 1167 shr.s_access |= F_RDACC; 1168 shr.s_deny = 0; 1169 shr.s_sysid = 0; 1170 shr.s_pid = ttoproc(curthread)->p_pid; 1171 shr_own.sl_pid = shr.s_pid; 1172 shr_own.sl_id = fd; 1173 shr.s_own_len = sizeof (shr_own); 1174 shr.s_owner = (caddr_t)&shr_own; 1175 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(), 1176 NULL); 1177 if (error) 1178 goto out; 1179 shrlock_done = 1; 1180 1181 /* nbmand conflict check if truncating file */ 1182 if ((filemode & FTRUNC) && !(filemode & FCREAT)) { 1183 nbl_start_crit(vp, RW_READER); 1184 in_crit = 1; 1185 1186 vattr.va_mask = AT_SIZE; 1187 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) 1188 goto out; 1189 if (nbl_conflict(vp, NBL_WRITE, 0, 
vattr.va_size, 0, 1190 NULL)) { 1191 error = EACCES; 1192 goto out; 1193 } 1194 } 1195 } 1196 1197 /* 1198 * Do opening protocol. 1199 */ 1200 error = VOP_OPEN(&vp, filemode, CRED(), NULL); 1201 if (error) 1202 goto out; 1203 open_done = 1; 1204 1205 /* 1206 * Truncate if required. 1207 */ 1208 if ((filemode & FTRUNC) && !(filemode & FCREAT)) { 1209 vattr.va_size = 0; 1210 vattr.va_mask = AT_SIZE; 1211 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) 1212 goto out; 1213 } 1214 out: 1215 ASSERT(vp->v_count > 0); 1216 1217 if (in_crit) { 1218 nbl_end_crit(vp); 1219 in_crit = 0; 1220 } 1221 if (error) { 1222 if (open_done) { 1223 (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(), 1224 NULL); 1225 open_done = 0; 1226 shrlock_done = 0; 1227 } 1228 if (shrlock_done) { 1229 (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(), 1230 NULL); 1231 shrlock_done = 0; 1232 } 1233 1234 /* 1235 * The following clause was added to handle a problem 1236 * with NFS consistency. It is possible that a lookup 1237 * of the file to be opened succeeded, but the file 1238 * itself doesn't actually exist on the server. This 1239 * is chiefly due to the DNLC containing an entry for 1240 * the file which has been removed on the server. In 1241 * this case, we just start over. If there was some 1242 * other cause for the ESTALE error, then the lookup 1243 * of the file will fail and the error will be returned 1244 * above instead of looping around from here. 1245 */ 1246 VN_RELE(vp); 1247 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) 1248 goto top; 1249 } else 1250 *vpp = vp; 1251 return (error); 1252 } 1253 1254 /* 1255 * The following two accessor functions are for the NFSv4 server. Since there 1256 * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the 1257 * vnode open counts correct when a client "upgrades" an open or does an 1258 * open_downgrade. 
In NFS, an upgrade or downgrade can not only change the 1259 * open mode (add or subtract read or write), but also change the share/deny 1260 * modes. However, share reservations are not integrated with OPEN, yet, so 1261 * we need to handle each separately. These functions are cleaner than having 1262 * the NFS server manipulate the counts directly, however, nobody else should 1263 * use these functions. 1264 */ 1265 void 1266 vn_open_upgrade( 1267 vnode_t *vp, 1268 int filemode) 1269 { 1270 ASSERT(vp->v_type == VREG); 1271 1272 if (filemode & FREAD) 1273 atomic_inc_32(&vp->v_rdcnt); 1274 if (filemode & FWRITE) 1275 atomic_inc_32(&vp->v_wrcnt); 1276 1277 } 1278 1279 void 1280 vn_open_downgrade( 1281 vnode_t *vp, 1282 int filemode) 1283 { 1284 ASSERT(vp->v_type == VREG); 1285 1286 if (filemode & FREAD) { 1287 ASSERT(vp->v_rdcnt > 0); 1288 atomic_dec_32(&vp->v_rdcnt); 1289 } 1290 if (filemode & FWRITE) { 1291 ASSERT(vp->v_wrcnt > 0); 1292 atomic_dec_32(&vp->v_wrcnt); 1293 } 1294 1295 } 1296 1297 int 1298 vn_create( 1299 char *pnamep, 1300 enum uio_seg seg, 1301 struct vattr *vap, 1302 enum vcexcl excl, 1303 int mode, 1304 struct vnode **vpp, 1305 enum create why, 1306 int flag, 1307 mode_t umask) 1308 { 1309 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag, 1310 umask, NULL)); 1311 } 1312 1313 /* 1314 * Create a vnode (makenode). 
1315 */ 1316 int 1317 vn_createat( 1318 char *pnamep, 1319 enum uio_seg seg, 1320 struct vattr *vap, 1321 enum vcexcl excl, 1322 int mode, 1323 struct vnode **vpp, 1324 enum create why, 1325 int flag, 1326 mode_t umask, 1327 struct vnode *startvp) 1328 { 1329 struct vnode *dvp; /* ptr to parent dir vnode */ 1330 struct vnode *vp = NULL; 1331 struct pathname pn; 1332 int error; 1333 int in_crit = 0; 1334 struct vattr vattr; 1335 enum symfollow follow; 1336 int estale_retry = 0; 1337 uint32_t auditing = AU_AUDITING(); 1338 1339 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 1340 1341 /* symlink interpretation */ 1342 if ((flag & FNOFOLLOW) || excl == EXCL) 1343 follow = NO_FOLLOW; 1344 else 1345 follow = FOLLOW; 1346 flag &= ~(FNOFOLLOW|FNOLINKS); 1347 1348 top: 1349 /* 1350 * Lookup directory. 1351 * If new object is a file, call lower level to create it. 1352 * Note that it is up to the lower level to enforce exclusive 1353 * creation, if the file is already there. 1354 * This allows the lower level to do whatever 1355 * locking or protocol that is needed to prevent races. 1356 * If the new object is directory call lower level to make 1357 * the new directory, with "." and "..". 1358 */ 1359 if (error = pn_get(pnamep, seg, &pn)) 1360 return (error); 1361 if (auditing) 1362 audit_vncreate_start(); 1363 dvp = NULL; 1364 *vpp = NULL; 1365 /* 1366 * lookup will find the parent directory for the vnode. 1367 * When it is done the pn holds the name of the entry 1368 * in the directory. 1369 * If this is a non-exclusive create we also find the node itself. 1370 */ 1371 error = lookuppnat(&pn, NULL, follow, &dvp, 1372 (excl == EXCL) ? 
NULLVPP : vpp, startvp); 1373 if (error) { 1374 pn_free(&pn); 1375 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) 1376 goto top; 1377 if (why == CRMKDIR && error == EINVAL) 1378 error = EEXIST; /* SVID */ 1379 return (error); 1380 } 1381 1382 if (why != CRMKNOD) 1383 vap->va_mode &= ~VSVTX; 1384 1385 /* 1386 * If default ACLs are defined for the directory don't apply the 1387 * umask if umask is passed. 1388 */ 1389 1390 if (umask) { 1391 1392 vsecattr_t vsec; 1393 1394 vsec.vsa_aclcnt = 0; 1395 vsec.vsa_aclentp = NULL; 1396 vsec.vsa_dfaclcnt = 0; 1397 vsec.vsa_dfaclentp = NULL; 1398 vsec.vsa_mask = VSA_DFACLCNT; 1399 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL); 1400 /* 1401 * If error is ENOSYS then treat it as no error 1402 * Don't want to force all file systems to support 1403 * aclent_t style of ACL's. 1404 */ 1405 if (error == ENOSYS) 1406 error = 0; 1407 if (error) { 1408 if (*vpp != NULL) 1409 VN_RELE(*vpp); 1410 goto out; 1411 } else { 1412 /* 1413 * Apply the umask if no default ACLs. 1414 */ 1415 if (vsec.vsa_dfaclcnt == 0) 1416 vap->va_mode &= ~umask; 1417 1418 /* 1419 * VOP_GETSECATTR() may have allocated memory for 1420 * ACLs we didn't request, so double-check and 1421 * free it if necessary. 1422 */ 1423 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL) 1424 kmem_free((caddr_t)vsec.vsa_aclentp, 1425 vsec.vsa_aclcnt * sizeof (aclent_t)); 1426 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL) 1427 kmem_free((caddr_t)vsec.vsa_dfaclentp, 1428 vsec.vsa_dfaclcnt * sizeof (aclent_t)); 1429 } 1430 } 1431 1432 /* 1433 * In general we want to generate EROFS if the file system is 1434 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1 1435 * documents the open system call, and it says that O_CREAT has no 1436 * effect if the file already exists. Bug 1119649 states 1437 * that open(path, O_CREAT, ...) fails when attempting to open an 1438 * existing file on a read only file system. 
Thus, the first part 1439 * of the following if statement has 3 checks: 1440 * if the file exists && 1441 * it is being open with write access && 1442 * the file system is read only 1443 * then generate EROFS 1444 */ 1445 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) || 1446 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 1447 if (*vpp) 1448 VN_RELE(*vpp); 1449 error = EROFS; 1450 } else if (excl == NONEXCL && *vpp != NULL) { 1451 vnode_t *rvp; 1452 1453 /* 1454 * File already exists. If a mandatory lock has been 1455 * applied, return error. 1456 */ 1457 vp = *vpp; 1458 if (VOP_REALVP(vp, &rvp, NULL) != 0) 1459 rvp = vp; 1460 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) { 1461 nbl_start_crit(vp, RW_READER); 1462 in_crit = 1; 1463 } 1464 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) { 1465 vattr.va_mask = AT_MODE|AT_SIZE; 1466 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) { 1467 goto out; 1468 } 1469 if (MANDLOCK(vp, vattr.va_mode)) { 1470 error = EAGAIN; 1471 goto out; 1472 } 1473 /* 1474 * File cannot be truncated if non-blocking mandatory 1475 * locks are currently on the file. 1476 */ 1477 if ((vap->va_mask & AT_SIZE) && in_crit) { 1478 u_offset_t offset; 1479 ssize_t length; 1480 1481 offset = vap->va_size > vattr.va_size ? 1482 vattr.va_size : vap->va_size; 1483 length = vap->va_size > vattr.va_size ? 1484 vap->va_size - vattr.va_size : 1485 vattr.va_size - vap->va_size; 1486 if (nbl_conflict(vp, NBL_WRITE, offset, 1487 length, 0, NULL)) { 1488 error = EACCES; 1489 goto out; 1490 } 1491 } 1492 } 1493 1494 /* 1495 * If the file is the root of a VFS, we've crossed a 1496 * mount point and the "containing" directory that we 1497 * acquired above (dvp) is irrelevant because it's in 1498 * a different file system. We apply VOP_CREATE to the 1499 * target itself instead of to the containing directory 1500 * and supply a null path name to indicate (conventionally) 1501 * the node itself as the "component" of interest. 
1502 * 1503 * The intercession of the file system is necessary to 1504 * ensure that the appropriate permission checks are 1505 * done. 1506 */ 1507 if (vp->v_flag & VROOT) { 1508 ASSERT(why != CRMKDIR); 1509 error = VOP_CREATE(vp, "", vap, excl, mode, vpp, 1510 CRED(), flag, NULL, NULL); 1511 /* 1512 * If the create succeeded, it will have created 1513 * a new reference to the vnode. Give up the 1514 * original reference. The assertion should not 1515 * get triggered because NBMAND locks only apply to 1516 * VREG files. And if in_crit is non-zero for some 1517 * reason, detect that here, rather than when we 1518 * deference a null vp. 1519 */ 1520 ASSERT(in_crit == 0); 1521 VN_RELE(vp); 1522 vp = NULL; 1523 goto out; 1524 } 1525 1526 /* 1527 * Large File API - non-large open (FOFFMAX flag not set) 1528 * of regular file fails if the file size exceeds MAXOFF32_T. 1529 */ 1530 if (why != CRMKDIR && 1531 !(flag & FOFFMAX) && 1532 (vp->v_type == VREG)) { 1533 vattr.va_mask = AT_SIZE; 1534 if ((error = VOP_GETATTR(vp, &vattr, 0, 1535 CRED(), NULL))) { 1536 goto out; 1537 } 1538 if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) { 1539 error = EOVERFLOW; 1540 goto out; 1541 } 1542 } 1543 } 1544 1545 if (error == 0) { 1546 /* 1547 * Call mkdir() if specified, otherwise create(). 1548 */ 1549 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */ 1550 1551 if (why == CRMKDIR) 1552 /* 1553 * N.B., if vn_createat() ever requests 1554 * case-insensitive behavior then it will need 1555 * to be passed to VOP_MKDIR(). 
VOP_CREATE() 1556 * will already get it via "flag" 1557 */ 1558 error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(), 1559 NULL, 0, NULL); 1560 else if (!must_be_dir) 1561 error = VOP_CREATE(dvp, pn.pn_path, vap, 1562 excl, mode, vpp, CRED(), flag, NULL, NULL); 1563 else 1564 error = ENOTDIR; 1565 } 1566 1567 out: 1568 1569 if (auditing) 1570 audit_vncreate_finish(*vpp, error); 1571 if (in_crit) { 1572 nbl_end_crit(vp); 1573 in_crit = 0; 1574 } 1575 if (vp != NULL) { 1576 VN_RELE(vp); 1577 vp = NULL; 1578 } 1579 pn_free(&pn); 1580 VN_RELE(dvp); 1581 /* 1582 * The following clause was added to handle a problem 1583 * with NFS consistency. It is possible that a lookup 1584 * of the file to be created succeeded, but the file 1585 * itself doesn't actually exist on the server. This 1586 * is chiefly due to the DNLC containing an entry for 1587 * the file which has been removed on the server. In 1588 * this case, we just start over. If there was some 1589 * other cause for the ESTALE error, then the lookup 1590 * of the file will fail and the error will be returned 1591 * above instead of looping around from here. 
1592 */ 1593 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) 1594 goto top; 1595 return (error); 1596 } 1597 1598 int 1599 vn_link(char *from, char *to, enum uio_seg seg) 1600 { 1601 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg)); 1602 } 1603 1604 int 1605 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow, 1606 vnode_t *tstartvp, char *to, enum uio_seg seg) 1607 { 1608 struct vnode *fvp; /* from vnode ptr */ 1609 struct vnode *tdvp; /* to directory vnode ptr */ 1610 struct pathname pn; 1611 int error; 1612 struct vattr vattr; 1613 dev_t fsid; 1614 int estale_retry = 0; 1615 uint32_t auditing = AU_AUDITING(); 1616 1617 top: 1618 fvp = tdvp = NULL; 1619 if (error = pn_get(to, seg, &pn)) 1620 return (error); 1621 if (auditing && fstartvp != NULL) 1622 audit_setfsat_path(1); 1623 if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp)) 1624 goto out; 1625 if (auditing && tstartvp != NULL) 1626 audit_setfsat_path(3); 1627 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp)) 1628 goto out; 1629 /* 1630 * Make sure both source vnode and target directory vnode are 1631 * in the same vfs and that it is writeable. 1632 */ 1633 vattr.va_mask = AT_FSID; 1634 if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL)) 1635 goto out; 1636 fsid = vattr.va_fsid; 1637 vattr.va_mask = AT_FSID; 1638 if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL)) 1639 goto out; 1640 if (fsid != vattr.va_fsid) { 1641 error = EXDEV; 1642 goto out; 1643 } 1644 if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) { 1645 error = EROFS; 1646 goto out; 1647 } 1648 /* 1649 * Do the link. 
1650 */ 1651 (void) pn_fixslash(&pn); 1652 error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0); 1653 out: 1654 pn_free(&pn); 1655 if (fvp) 1656 VN_RELE(fvp); 1657 if (tdvp) 1658 VN_RELE(tdvp); 1659 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) 1660 goto top; 1661 return (error); 1662 } 1663 1664 int 1665 vn_rename(char *from, char *to, enum uio_seg seg) 1666 { 1667 return (vn_renameat(NULL, from, NULL, to, seg)); 1668 } 1669 1670 int 1671 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp, 1672 char *tname, enum uio_seg seg) 1673 { 1674 int error; 1675 struct vattr vattr; 1676 struct pathname fpn; /* from pathname */ 1677 struct pathname tpn; /* to pathname */ 1678 dev_t fsid; 1679 int in_crit_src, in_crit_targ; 1680 vnode_t *fromvp, *fvp; 1681 vnode_t *tovp, *targvp; 1682 int estale_retry = 0; 1683 uint32_t auditing = AU_AUDITING(); 1684 1685 top: 1686 fvp = fromvp = tovp = targvp = NULL; 1687 in_crit_src = in_crit_targ = 0; 1688 /* 1689 * Get to and from pathnames. 1690 */ 1691 if (error = pn_get(fname, seg, &fpn)) 1692 return (error); 1693 if (error = pn_get(tname, seg, &tpn)) { 1694 pn_free(&fpn); 1695 return (error); 1696 } 1697 1698 /* 1699 * First we need to resolve the correct directories 1700 * The passed in directories may only be a starting point, 1701 * but we need the real directories the file(s) live in. 1702 * For example the fname may be something like usr/lib/sparc 1703 * and we were passed in the / directory, but we need to 1704 * use the lib directory for the rename. 1705 */ 1706 1707 if (auditing && fdvp != NULL) 1708 audit_setfsat_path(1); 1709 /* 1710 * Lookup to and from directories. 1711 */ 1712 if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) { 1713 goto out; 1714 } 1715 1716 /* 1717 * Make sure there is an entry. 
1718 */ 1719 if (fvp == NULL) { 1720 error = ENOENT; 1721 goto out; 1722 } 1723 1724 if (auditing && tdvp != NULL) 1725 audit_setfsat_path(3); 1726 if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) { 1727 goto out; 1728 } 1729 1730 /* 1731 * Make sure both the from vnode directory and the to directory 1732 * are in the same vfs and the to directory is writable. 1733 * We check fsid's, not vfs pointers, so loopback fs works. 1734 */ 1735 if (fromvp != tovp) { 1736 vattr.va_mask = AT_FSID; 1737 if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL)) 1738 goto out; 1739 fsid = vattr.va_fsid; 1740 vattr.va_mask = AT_FSID; 1741 if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL)) 1742 goto out; 1743 if (fsid != vattr.va_fsid) { 1744 error = EXDEV; 1745 goto out; 1746 } 1747 } 1748 1749 if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) { 1750 error = EROFS; 1751 goto out; 1752 } 1753 1754 if (targvp && (fvp != targvp)) { 1755 nbl_start_crit(targvp, RW_READER); 1756 in_crit_targ = 1; 1757 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) { 1758 error = EACCES; 1759 goto out; 1760 } 1761 } 1762 1763 if (nbl_need_check(fvp)) { 1764 nbl_start_crit(fvp, RW_READER); 1765 in_crit_src = 1; 1766 if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) { 1767 error = EACCES; 1768 goto out; 1769 } 1770 } 1771 1772 /* 1773 * Do the rename. 1774 */ 1775 (void) pn_fixslash(&tpn); 1776 error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(), 1777 NULL, 0); 1778 1779 out: 1780 pn_free(&fpn); 1781 pn_free(&tpn); 1782 if (in_crit_src) 1783 nbl_end_crit(fvp); 1784 if (in_crit_targ) 1785 nbl_end_crit(targvp); 1786 if (fromvp) 1787 VN_RELE(fromvp); 1788 if (tovp) 1789 VN_RELE(tovp); 1790 if (targvp) 1791 VN_RELE(targvp); 1792 if (fvp) 1793 VN_RELE(fvp); 1794 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) 1795 goto top; 1796 return (error); 1797 } 1798 1799 /* 1800 * Remove a file or directory. 
1801 */ 1802 int 1803 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) 1804 { 1805 return (vn_removeat(NULL, fnamep, seg, dirflag)); 1806 } 1807 1808 int 1809 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag) 1810 { 1811 struct vnode *vp; /* entry vnode */ 1812 struct vnode *dvp; /* ptr to parent dir vnode */ 1813 struct vnode *coveredvp; 1814 struct pathname pn; /* name of entry */ 1815 enum vtype vtype; 1816 int error; 1817 struct vfs *vfsp; 1818 struct vfs *dvfsp; /* ptr to parent dir vfs */ 1819 int in_crit = 0; 1820 int estale_retry = 0; 1821 1822 top: 1823 if (error = pn_get(fnamep, seg, &pn)) 1824 return (error); 1825 dvp = vp = NULL; 1826 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) { 1827 pn_free(&pn); 1828 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) 1829 goto top; 1830 return (error); 1831 } 1832 1833 /* 1834 * Make sure there is an entry. 1835 */ 1836 if (vp == NULL) { 1837 error = ENOENT; 1838 goto out; 1839 } 1840 1841 vfsp = vp->v_vfsp; 1842 dvfsp = dvp->v_vfsp; 1843 1844 /* 1845 * If the named file is the root of a mounted filesystem, fail, 1846 * unless it's marked unlinkable. In that case, unmount the 1847 * filesystem and proceed to unlink the covered vnode. (If the 1848 * covered vnode is a directory, use rmdir instead of unlink, 1849 * to avoid file system corruption.) 1850 */ 1851 if (vp->v_flag & VROOT) { 1852 if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) { 1853 error = EBUSY; 1854 goto out; 1855 } 1856 1857 /* 1858 * Namefs specific code starts here. 1859 */ 1860 1861 if (dirflag == RMDIRECTORY) { 1862 /* 1863 * User called rmdir(2) on a file that has 1864 * been namefs mounted on top of. Since 1865 * namefs doesn't allow directories to 1866 * be mounted on other files we know 1867 * vp is not of type VDIR so fail to operation. 
1868 */ 1869 error = ENOTDIR; 1870 goto out; 1871 } 1872 1873 /* 1874 * If VROOT is still set after grabbing vp->v_lock, 1875 * noone has finished nm_unmount so far and coveredvp 1876 * is valid. 1877 * If we manage to grab vn_vfswlock(coveredvp) before releasing 1878 * vp->v_lock, any race window is eliminated. 1879 */ 1880 1881 mutex_enter(&vp->v_lock); 1882 if ((vp->v_flag & VROOT) == 0) { 1883 /* Someone beat us to the unmount */ 1884 mutex_exit(&vp->v_lock); 1885 error = EBUSY; 1886 goto out; 1887 } 1888 vfsp = vp->v_vfsp; 1889 coveredvp = vfsp->vfs_vnodecovered; 1890 ASSERT(coveredvp); 1891 /* 1892 * Note: Implementation of vn_vfswlock shows that ordering of 1893 * v_lock / vn_vfswlock is not an issue here. 1894 */ 1895 error = vn_vfswlock(coveredvp); 1896 mutex_exit(&vp->v_lock); 1897 1898 if (error) 1899 goto out; 1900 1901 VN_HOLD(coveredvp); 1902 VN_RELE(vp); 1903 error = dounmount(vfsp, 0, CRED()); 1904 1905 /* 1906 * Unmounted the namefs file system; now get 1907 * the object it was mounted over. 1908 */ 1909 vp = coveredvp; 1910 /* 1911 * If namefs was mounted over a directory, then 1912 * we want to use rmdir() instead of unlink(). 1913 */ 1914 if (vp->v_type == VDIR) 1915 dirflag = RMDIRECTORY; 1916 1917 if (error) 1918 goto out; 1919 } 1920 1921 /* 1922 * Make sure filesystem is writeable. 1923 * We check the parent directory's vfs in case this is an lofs vnode. 1924 */ 1925 if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) { 1926 error = EROFS; 1927 goto out; 1928 } 1929 1930 vtype = vp->v_type; 1931 1932 /* 1933 * If there is the possibility of an nbmand share reservation, make 1934 * sure it's okay to remove the file. Keep a reference to the 1935 * vnode, so that we can exit the nbl critical region after 1936 * calling VOP_REMOVE. 1937 * If there is no possibility of an nbmand share reservation, 1938 * release the vnode reference now. Filesystems like NFS may 1939 * behave differently if there is an extra reference, so get rid of 1940 * this one. 
Fortunately, we can't have nbmand mounts on NFS 1941 * filesystems. 1942 */ 1943 if (nbl_need_check(vp)) { 1944 nbl_start_crit(vp, RW_READER); 1945 in_crit = 1; 1946 if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) { 1947 error = EACCES; 1948 goto out; 1949 } 1950 } else { 1951 VN_RELE(vp); 1952 vp = NULL; 1953 } 1954 1955 if (dirflag == RMDIRECTORY) { 1956 /* 1957 * Caller is using rmdir(2), which can only be applied to 1958 * directories. 1959 */ 1960 if (vtype != VDIR) { 1961 error = ENOTDIR; 1962 } else { 1963 vnode_t *cwd; 1964 proc_t *pp = curproc; 1965 1966 mutex_enter(&pp->p_lock); 1967 cwd = PTOU(pp)->u_cdir; 1968 VN_HOLD(cwd); 1969 mutex_exit(&pp->p_lock); 1970 error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(), 1971 NULL, 0); 1972 VN_RELE(cwd); 1973 } 1974 } else { 1975 /* 1976 * Unlink(2) can be applied to anything. 1977 */ 1978 error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0); 1979 } 1980 1981 out: 1982 pn_free(&pn); 1983 if (in_crit) { 1984 nbl_end_crit(vp); 1985 in_crit = 0; 1986 } 1987 if (vp != NULL) 1988 VN_RELE(vp); 1989 if (dvp != NULL) 1990 VN_RELE(dvp); 1991 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) 1992 goto top; 1993 return (error); 1994 } 1995 1996 /* 1997 * Utility function to compare equality of vnodes. 1998 * Compare the underlying real vnodes, if there are underlying vnodes. 1999 * This is a more thorough comparison than the VN_CMP() macro provides. 2000 */ 2001 int 2002 vn_compare(vnode_t *vp1, vnode_t *vp2) 2003 { 2004 vnode_t *realvp; 2005 2006 if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0) 2007 vp1 = realvp; 2008 if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0) 2009 vp2 = realvp; 2010 return (VN_CMP(vp1, vp2)); 2011 } 2012 2013 /* 2014 * The number of locks to hash into. This value must be a power 2015 * of 2 minus 1 and should probably also be prime. 
2016 */ 2017 #define NUM_BUCKETS 1023 2018 2019 struct vn_vfslocks_bucket { 2020 kmutex_t vb_lock; 2021 vn_vfslocks_entry_t *vb_list; 2022 char pad[64 - sizeof (kmutex_t) - sizeof (void *)]; 2023 }; 2024 2025 /* 2026 * Total number of buckets will be NUM_BUCKETS + 1 . 2027 */ 2028 2029 #pragma align 64(vn_vfslocks_buckets) 2030 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1]; 2031 2032 #define VN_VFSLOCKS_SHIFT 9 2033 2034 #define VN_VFSLOCKS_HASH(vfsvpptr) \ 2035 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS) 2036 2037 /* 2038 * vn_vfslocks_getlock() uses an HASH scheme to generate 2039 * rwstlock using vfs/vnode pointer passed to it. 2040 * 2041 * vn_vfslocks_rele() releases a reference in the 2042 * HASH table which allows the entry allocated by 2043 * vn_vfslocks_getlock() to be freed at a later 2044 * stage when the refcount drops to zero. 2045 */ 2046 2047 vn_vfslocks_entry_t * 2048 vn_vfslocks_getlock(void *vfsvpptr) 2049 { 2050 struct vn_vfslocks_bucket *bp; 2051 vn_vfslocks_entry_t *vep; 2052 vn_vfslocks_entry_t *tvep; 2053 2054 ASSERT(vfsvpptr != NULL); 2055 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)]; 2056 2057 mutex_enter(&bp->vb_lock); 2058 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) { 2059 if (vep->ve_vpvfs == vfsvpptr) { 2060 vep->ve_refcnt++; 2061 mutex_exit(&bp->vb_lock); 2062 return (vep); 2063 } 2064 } 2065 mutex_exit(&bp->vb_lock); 2066 vep = kmem_alloc(sizeof (*vep), KM_SLEEP); 2067 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL); 2068 vep->ve_vpvfs = (char *)vfsvpptr; 2069 vep->ve_refcnt = 1; 2070 mutex_enter(&bp->vb_lock); 2071 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) { 2072 if (tvep->ve_vpvfs == vfsvpptr) { 2073 tvep->ve_refcnt++; 2074 mutex_exit(&bp->vb_lock); 2075 2076 /* 2077 * There is already an entry in the hash 2078 * destroy what we just allocated. 
2079 */ 2080 rwst_destroy(&vep->ve_lock); 2081 kmem_free(vep, sizeof (*vep)); 2082 return (tvep); 2083 } 2084 } 2085 vep->ve_next = bp->vb_list; 2086 bp->vb_list = vep; 2087 mutex_exit(&bp->vb_lock); 2088 return (vep); 2089 } 2090 2091 void 2092 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent) 2093 { 2094 struct vn_vfslocks_bucket *bp; 2095 vn_vfslocks_entry_t *vep; 2096 vn_vfslocks_entry_t *pvep; 2097 2098 ASSERT(vepent != NULL); 2099 ASSERT(vepent->ve_vpvfs != NULL); 2100 2101 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)]; 2102 2103 mutex_enter(&bp->vb_lock); 2104 vepent->ve_refcnt--; 2105 2106 if ((int32_t)vepent->ve_refcnt < 0) 2107 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative"); 2108 2109 if (vepent->ve_refcnt == 0) { 2110 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) { 2111 if (vep->ve_vpvfs == vepent->ve_vpvfs) { 2112 if (bp->vb_list == vep) 2113 bp->vb_list = vep->ve_next; 2114 else { 2115 /* LINTED */ 2116 pvep->ve_next = vep->ve_next; 2117 } 2118 mutex_exit(&bp->vb_lock); 2119 rwst_destroy(&vep->ve_lock); 2120 kmem_free(vep, sizeof (*vep)); 2121 return; 2122 } 2123 pvep = vep; 2124 } 2125 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found"); 2126 } 2127 mutex_exit(&bp->vb_lock); 2128 } 2129 2130 /* 2131 * vn_vfswlock_wait is used to implement a lock which is logically a writers 2132 * lock protecting the v_vfsmountedhere field. 2133 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock, 2134 * except that it blocks to acquire the lock VVFSLOCK. 2135 * 2136 * traverse() and routines re-implementing part of traverse (e.g. autofs) 2137 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on 2138 * need the non-blocking version of the writers lock i.e. 
vn_vfswlock 2139 */ 2140 int 2141 vn_vfswlock_wait(vnode_t *vp) 2142 { 2143 int retval; 2144 vn_vfslocks_entry_t *vpvfsentry; 2145 ASSERT(vp != NULL); 2146 2147 vpvfsentry = vn_vfslocks_getlock(vp); 2148 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER); 2149 2150 if (retval == EINTR) { 2151 vn_vfslocks_rele(vpvfsentry); 2152 return (EINTR); 2153 } 2154 return (retval); 2155 } 2156 2157 int 2158 vn_vfsrlock_wait(vnode_t *vp) 2159 { 2160 int retval; 2161 vn_vfslocks_entry_t *vpvfsentry; 2162 ASSERT(vp != NULL); 2163 2164 vpvfsentry = vn_vfslocks_getlock(vp); 2165 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER); 2166 2167 if (retval == EINTR) { 2168 vn_vfslocks_rele(vpvfsentry); 2169 return (EINTR); 2170 } 2171 2172 return (retval); 2173 } 2174 2175 2176 /* 2177 * vn_vfswlock is used to implement a lock which is logically a writers lock 2178 * protecting the v_vfsmountedhere field. 2179 */ 2180 int 2181 vn_vfswlock(vnode_t *vp) 2182 { 2183 vn_vfslocks_entry_t *vpvfsentry; 2184 2185 /* 2186 * If vp is NULL then somebody is trying to lock the covered vnode 2187 * of /. (vfs_vnodecovered is NULL for /). This situation will 2188 * only happen when unmounting /. Since that operation will fail 2189 * anyway, return EBUSY here instead of in VFS_UNMOUNT. 2190 */ 2191 if (vp == NULL) 2192 return (EBUSY); 2193 2194 vpvfsentry = vn_vfslocks_getlock(vp); 2195 2196 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER)) 2197 return (0); 2198 2199 vn_vfslocks_rele(vpvfsentry); 2200 return (EBUSY); 2201 } 2202 2203 int 2204 vn_vfsrlock(vnode_t *vp) 2205 { 2206 vn_vfslocks_entry_t *vpvfsentry; 2207 2208 /* 2209 * If vp is NULL then somebody is trying to lock the covered vnode 2210 * of /. (vfs_vnodecovered is NULL for /). This situation will 2211 * only happen when unmounting /. Since that operation will fail 2212 * anyway, return EBUSY here instead of in VFS_UNMOUNT. 
2213 */ 2214 if (vp == NULL) 2215 return (EBUSY); 2216 2217 vpvfsentry = vn_vfslocks_getlock(vp); 2218 2219 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER)) 2220 return (0); 2221 2222 vn_vfslocks_rele(vpvfsentry); 2223 return (EBUSY); 2224 } 2225 2226 void 2227 vn_vfsunlock(vnode_t *vp) 2228 { 2229 vn_vfslocks_entry_t *vpvfsentry; 2230 2231 /* 2232 * ve_refcnt needs to be decremented twice. 2233 * 1. To release refernce after a call to vn_vfslocks_getlock() 2234 * 2. To release the reference from the locking routines like 2235 * vn_vfsrlock/vn_vfswlock etc,. 2236 */ 2237 vpvfsentry = vn_vfslocks_getlock(vp); 2238 vn_vfslocks_rele(vpvfsentry); 2239 2240 rwst_exit(&vpvfsentry->ve_lock); 2241 vn_vfslocks_rele(vpvfsentry); 2242 } 2243 2244 int 2245 vn_vfswlock_held(vnode_t *vp) 2246 { 2247 int held; 2248 vn_vfslocks_entry_t *vpvfsentry; 2249 2250 ASSERT(vp != NULL); 2251 2252 vpvfsentry = vn_vfslocks_getlock(vp); 2253 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER); 2254 2255 vn_vfslocks_rele(vpvfsentry); 2256 return (held); 2257 } 2258 2259 2260 int 2261 vn_make_ops( 2262 const char *name, /* Name of file system */ 2263 const fs_operation_def_t *templ, /* Operation specification */ 2264 vnodeops_t **actual) /* Return the vnodeops */ 2265 { 2266 int unused_ops; 2267 int error; 2268 2269 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP); 2270 2271 (*actual)->vnop_name = name; 2272 2273 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ); 2274 if (error) { 2275 kmem_free(*actual, sizeof (vnodeops_t)); 2276 } 2277 2278 #if DEBUG 2279 if (unused_ops != 0) 2280 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied " 2281 "but not used", name, unused_ops); 2282 #endif 2283 2284 return (error); 2285 } 2286 2287 /* 2288 * Free the vnodeops created as a result of vn_make_ops() 2289 */ 2290 void 2291 vn_freevnodeops(vnodeops_t *vnops) 2292 { 2293 kmem_free(vnops, sizeof (vnodeops_t)); 2294 } 2295 2296 /* 2297 * Vnode cache. 
2298 */ 2299 2300 /* ARGSUSED */ 2301 static int 2302 vn_cache_constructor(void *buf, void *cdrarg, int kmflags) 2303 { 2304 struct vnode *vp; 2305 2306 vp = buf; 2307 2308 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); 2309 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL); 2310 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); 2311 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); 2312 vp->v_femhead = NULL; /* Must be done before vn_reinit() */ 2313 vp->v_path = NULL; 2314 vp->v_mpssdata = NULL; 2315 vp->v_vsd = NULL; 2316 vp->v_fopdata = NULL; 2317 2318 return (0); 2319 } 2320 2321 /* ARGSUSED */ 2322 static void 2323 vn_cache_destructor(void *buf, void *cdrarg) 2324 { 2325 struct vnode *vp; 2326 2327 vp = buf; 2328 2329 rw_destroy(&vp->v_nbllock); 2330 cv_destroy(&vp->v_cv); 2331 mutex_destroy(&vp->v_vsd_lock); 2332 mutex_destroy(&vp->v_lock); 2333 } 2334 2335 void 2336 vn_create_cache(void) 2337 { 2338 /* LINTED */ 2339 ASSERT((1 << VNODE_ALIGN_LOG2) == 2340 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN)); 2341 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode), 2342 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL, 2343 NULL, 0); 2344 } 2345 2346 void 2347 vn_destroy_cache(void) 2348 { 2349 kmem_cache_destroy(vn_cache); 2350 } 2351 2352 /* 2353 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are 2354 * cached by the file system and vnodes remain associated. 2355 */ 2356 void 2357 vn_recycle(vnode_t *vp) 2358 { 2359 ASSERT(vp->v_pages == NULL); 2360 2361 /* 2362 * XXX - This really belongs in vn_reinit(), but we have some issues 2363 * with the counts. Best to have it here for clean initialization. 2364 */ 2365 vp->v_rdcnt = 0; 2366 vp->v_wrcnt = 0; 2367 vp->v_mmap_read = 0; 2368 vp->v_mmap_write = 0; 2369 2370 /* 2371 * If FEM was in use, make sure everything gets cleaned up 2372 * NOTE: vp->v_femhead is initialized to NULL in the vnode 2373 * constructor. 
2374 */ 2375 if (vp->v_femhead) { 2376 /* XXX - There should be a free_femhead() that does all this */ 2377 ASSERT(vp->v_femhead->femh_list == NULL); 2378 mutex_destroy(&vp->v_femhead->femh_lock); 2379 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); 2380 vp->v_femhead = NULL; 2381 } 2382 if (vp->v_path) { 2383 kmem_free(vp->v_path, strlen(vp->v_path) + 1); 2384 vp->v_path = NULL; 2385 } 2386 2387 if (vp->v_fopdata != NULL) { 2388 free_fopdata(vp); 2389 } 2390 vp->v_mpssdata = NULL; 2391 vsd_free(vp); 2392 } 2393 2394 /* 2395 * Used to reset the vnode fields including those that are directly accessible 2396 * as well as those which require an accessor function. 2397 * 2398 * Does not initialize: 2399 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv 2400 * v_data (since FS-nodes and vnodes point to each other and should 2401 * be updated simultaneously) 2402 * v_op (in case someone needs to make a VOP call on this object) 2403 */ 2404 void 2405 vn_reinit(vnode_t *vp) 2406 { 2407 vp->v_count = 1; 2408 vp->v_count_dnlc = 0; 2409 vp->v_vfsp = NULL; 2410 vp->v_stream = NULL; 2411 vp->v_vfsmountedhere = NULL; 2412 vp->v_flag = 0; 2413 vp->v_type = VNON; 2414 vp->v_rdev = NODEV; 2415 2416 vp->v_filocks = NULL; 2417 vp->v_shrlocks = NULL; 2418 vp->v_pages = NULL; 2419 2420 vp->v_locality = NULL; 2421 vp->v_xattrdir = NULL; 2422 2423 /* Handles v_femhead, v_path, and the r/w/map counts */ 2424 vn_recycle(vp); 2425 } 2426 2427 vnode_t * 2428 vn_alloc(int kmflag) 2429 { 2430 vnode_t *vp; 2431 2432 vp = kmem_cache_alloc(vn_cache, kmflag); 2433 2434 if (vp != NULL) { 2435 vp->v_femhead = NULL; /* Must be done before vn_reinit() */ 2436 vp->v_fopdata = NULL; 2437 vn_reinit(vp); 2438 } 2439 2440 return (vp); 2441 } 2442 2443 void 2444 vn_free(vnode_t *vp) 2445 { 2446 ASSERT(vp->v_shrlocks == NULL); 2447 ASSERT(vp->v_filocks == NULL); 2448 2449 /* 2450 * Some file systems call vn_free() with v_count of zero, 2451 * some with v_count of 1. 
In any case, the value should 2452 * never be anything else. 2453 */ 2454 ASSERT((vp->v_count == 0) || (vp->v_count == 1)); 2455 ASSERT(vp->v_count_dnlc == 0); 2456 if (vp->v_path != NULL) { 2457 kmem_free(vp->v_path, strlen(vp->v_path) + 1); 2458 vp->v_path = NULL; 2459 } 2460 2461 /* If FEM was in use, make sure everything gets cleaned up */ 2462 if (vp->v_femhead) { 2463 /* XXX - There should be a free_femhead() that does all this */ 2464 ASSERT(vp->v_femhead->femh_list == NULL); 2465 mutex_destroy(&vp->v_femhead->femh_lock); 2466 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); 2467 vp->v_femhead = NULL; 2468 } 2469 2470 if (vp->v_fopdata != NULL) { 2471 free_fopdata(vp); 2472 } 2473 vp->v_mpssdata = NULL; 2474 vsd_free(vp); 2475 kmem_cache_free(vn_cache, vp); 2476 } 2477 2478 /* 2479 * vnode status changes, should define better states than 1, 0. 2480 */ 2481 void 2482 vn_reclaim(vnode_t *vp) 2483 { 2484 vfs_t *vfsp = vp->v_vfsp; 2485 2486 if (vfsp == NULL || 2487 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2488 return; 2489 } 2490 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED); 2491 } 2492 2493 void 2494 vn_idle(vnode_t *vp) 2495 { 2496 vfs_t *vfsp = vp->v_vfsp; 2497 2498 if (vfsp == NULL || 2499 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2500 return; 2501 } 2502 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED); 2503 } 2504 void 2505 vn_exists(vnode_t *vp) 2506 { 2507 vfs_t *vfsp = vp->v_vfsp; 2508 2509 if (vfsp == NULL || 2510 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2511 return; 2512 } 2513 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS); 2514 } 2515 2516 void 2517 vn_invalid(vnode_t *vp) 2518 { 2519 vfs_t *vfsp = vp->v_vfsp; 2520 2521 if (vfsp == NULL || 2522 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) { 2523 return; 2524 } 2525 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED); 2526 } 2527 2528 /* Vnode event notification */ 2529 2530 int 2531 vnevent_support(vnode_t *vp, caller_context_t *ct) 2532 { 2533 if (vp 
== NULL) 2534 return (EINVAL); 2535 2536 return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct)); 2537 } 2538 2539 void 2540 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) 2541 { 2542 if (vp == NULL || vp->v_femhead == NULL) { 2543 return; 2544 } 2545 (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); 2546 } 2547 2548 void 2549 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, 2550 caller_context_t *ct) 2551 { 2552 if (vp == NULL || vp->v_femhead == NULL) { 2553 return; 2554 } 2555 (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct); 2556 } 2557 2558 void 2559 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) 2560 { 2561 if (vp == NULL || vp->v_femhead == NULL) { 2562 return; 2563 } 2564 (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); 2565 } 2566 2567 void 2568 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) 2569 { 2570 if (vp == NULL || vp->v_femhead == NULL) { 2571 return; 2572 } 2573 (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct); 2574 } 2575 2576 void 2577 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) 2578 { 2579 if (vp == NULL || vp->v_femhead == NULL) { 2580 return; 2581 } 2582 (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct); 2583 } 2584 2585 void 2586 vnevent_create(vnode_t *vp, caller_context_t *ct) 2587 { 2588 if (vp == NULL || vp->v_femhead == NULL) { 2589 return; 2590 } 2591 (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct); 2592 } 2593 2594 void 2595 vnevent_link(vnode_t *vp, caller_context_t *ct) 2596 { 2597 if (vp == NULL || vp->v_femhead == NULL) { 2598 return; 2599 } 2600 (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct); 2601 } 2602 2603 void 2604 vnevent_mountedover(vnode_t *vp, caller_context_t *ct) 2605 { 2606 if (vp == NULL || vp->v_femhead == NULL) { 2607 return; 2608 } 2609 (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct); 2610 } 2611 2612 void 2613 vnevent_truncate(vnode_t *vp, caller_context_t *ct) 2614 { 2615 if 
(vp == NULL || vp->v_femhead == NULL) { 2616 return; 2617 } 2618 (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct); 2619 } 2620 2621 /* 2622 * Vnode accessors. 2623 */ 2624 2625 int 2626 vn_is_readonly(vnode_t *vp) 2627 { 2628 return (vp->v_vfsp->vfs_flag & VFS_RDONLY); 2629 } 2630 2631 int 2632 vn_has_flocks(vnode_t *vp) 2633 { 2634 return (vp->v_filocks != NULL); 2635 } 2636 2637 int 2638 vn_has_mandatory_locks(vnode_t *vp, int mode) 2639 { 2640 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode))); 2641 } 2642 2643 int 2644 vn_has_cached_data(vnode_t *vp) 2645 { 2646 return (vp->v_pages != NULL); 2647 } 2648 2649 /* 2650 * Return 0 if the vnode in question shouldn't be permitted into a zone via 2651 * zone_enter(2). 2652 */ 2653 int 2654 vn_can_change_zones(vnode_t *vp) 2655 { 2656 struct vfssw *vswp; 2657 int allow = 1; 2658 vnode_t *rvp; 2659 2660 if (nfs_global_client_only != 0) 2661 return (1); 2662 2663 /* 2664 * We always want to look at the underlying vnode if there is one. 2665 */ 2666 if (VOP_REALVP(vp, &rvp, NULL) != 0) 2667 rvp = vp; 2668 /* 2669 * Some pseudo filesystems (including doorfs) don't actually register 2670 * their vfsops_t, so the following may return NULL; we happily let 2671 * such vnodes switch zones. 2672 */ 2673 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp)); 2674 if (vswp != NULL) { 2675 if (vswp->vsw_flag & VSW_NOTZONESAFE) 2676 allow = 0; 2677 vfs_unrefvfssw(vswp); 2678 } 2679 return (allow); 2680 } 2681 2682 /* 2683 * Return nonzero if the vnode is a mount point, zero if not. 2684 */ 2685 int 2686 vn_ismntpt(vnode_t *vp) 2687 { 2688 return (vp->v_vfsmountedhere != NULL); 2689 } 2690 2691 /* Retrieve the vfs (if any) mounted on this vnode */ 2692 vfs_t * 2693 vn_mountedvfs(vnode_t *vp) 2694 { 2695 return (vp->v_vfsmountedhere); 2696 } 2697 2698 /* 2699 * Return nonzero if the vnode is referenced by the dnlc, zero if not. 
2700 */ 2701 int 2702 vn_in_dnlc(vnode_t *vp) 2703 { 2704 return (vp->v_count_dnlc > 0); 2705 } 2706 2707 /* 2708 * vn_has_other_opens() checks whether a particular file is opened by more than 2709 * just the caller and whether the open is for read and/or write. 2710 * This routine is for calling after the caller has already called VOP_OPEN() 2711 * and the caller wishes to know if they are the only one with it open for 2712 * the mode(s) specified. 2713 * 2714 * Vnode counts are only kept on regular files (v_type=VREG). 2715 */ 2716 int 2717 vn_has_other_opens( 2718 vnode_t *vp, 2719 v_mode_t mode) 2720 { 2721 2722 ASSERT(vp != NULL); 2723 2724 switch (mode) { 2725 case V_WRITE: 2726 if (vp->v_wrcnt > 1) 2727 return (V_TRUE); 2728 break; 2729 case V_RDORWR: 2730 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1)) 2731 return (V_TRUE); 2732 break; 2733 case V_RDANDWR: 2734 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1)) 2735 return (V_TRUE); 2736 break; 2737 case V_READ: 2738 if (vp->v_rdcnt > 1) 2739 return (V_TRUE); 2740 break; 2741 } 2742 2743 return (V_FALSE); 2744 } 2745 2746 /* 2747 * vn_is_opened() checks whether a particular file is opened and 2748 * whether the open is for read and/or write. 2749 * 2750 * Vnode counts are only kept on regular files (v_type=VREG). 2751 */ 2752 int 2753 vn_is_opened( 2754 vnode_t *vp, 2755 v_mode_t mode) 2756 { 2757 2758 ASSERT(vp != NULL); 2759 2760 switch (mode) { 2761 case V_WRITE: 2762 if (vp->v_wrcnt) 2763 return (V_TRUE); 2764 break; 2765 case V_RDANDWR: 2766 if (vp->v_rdcnt && vp->v_wrcnt) 2767 return (V_TRUE); 2768 break; 2769 case V_RDORWR: 2770 if (vp->v_rdcnt || vp->v_wrcnt) 2771 return (V_TRUE); 2772 break; 2773 case V_READ: 2774 if (vp->v_rdcnt) 2775 return (V_TRUE); 2776 break; 2777 } 2778 2779 return (V_FALSE); 2780 } 2781 2782 /* 2783 * vn_is_mapped() checks whether a particular file is mapped and whether 2784 * the file is mapped read and/or write. 
2785 */ 2786 int 2787 vn_is_mapped( 2788 vnode_t *vp, 2789 v_mode_t mode) 2790 { 2791 2792 ASSERT(vp != NULL); 2793 2794 #if !defined(_LP64) 2795 switch (mode) { 2796 /* 2797 * The atomic_add_64_nv functions force atomicity in the 2798 * case of 32 bit architectures. Otherwise the 64 bit values 2799 * require two fetches. The value of the fields may be 2800 * (potentially) changed between the first fetch and the 2801 * second 2802 */ 2803 case V_WRITE: 2804 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0)) 2805 return (V_TRUE); 2806 break; 2807 case V_RDANDWR: 2808 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) && 2809 (atomic_add_64_nv((&(vp->v_mmap_write)), 0))) 2810 return (V_TRUE); 2811 break; 2812 case V_RDORWR: 2813 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) || 2814 (atomic_add_64_nv((&(vp->v_mmap_write)), 0))) 2815 return (V_TRUE); 2816 break; 2817 case V_READ: 2818 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0)) 2819 return (V_TRUE); 2820 break; 2821 } 2822 #else 2823 switch (mode) { 2824 case V_WRITE: 2825 if (vp->v_mmap_write) 2826 return (V_TRUE); 2827 break; 2828 case V_RDANDWR: 2829 if (vp->v_mmap_read && vp->v_mmap_write) 2830 return (V_TRUE); 2831 break; 2832 case V_RDORWR: 2833 if (vp->v_mmap_read || vp->v_mmap_write) 2834 return (V_TRUE); 2835 break; 2836 case V_READ: 2837 if (vp->v_mmap_read) 2838 return (V_TRUE); 2839 break; 2840 } 2841 #endif 2842 2843 return (V_FALSE); 2844 } 2845 2846 /* 2847 * Set the operations vector for a vnode. 2848 * 2849 * FEM ensures that the v_femhead pointer is filled in before the 2850 * v_op pointer is changed. This means that if the v_femhead pointer 2851 * is NULL, and the v_op field hasn't changed since before which checked 2852 * the v_femhead pointer; then our update is ok - we are not racing with 2853 * FEM. 
2854 */ 2855 void 2856 vn_setops(vnode_t *vp, vnodeops_t *vnodeops) 2857 { 2858 vnodeops_t *op; 2859 2860 ASSERT(vp != NULL); 2861 ASSERT(vnodeops != NULL); 2862 2863 op = vp->v_op; 2864 membar_consumer(); 2865 /* 2866 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do 2867 * the compare-and-swap on vp->v_op. If either fails, then FEM is 2868 * in effect on the vnode and we need to have FEM deal with it. 2869 */ 2870 if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) != 2871 op) { 2872 fem_setvnops(vp, vnodeops); 2873 } 2874 } 2875 2876 /* 2877 * Retrieve the operations vector for a vnode 2878 * As with vn_setops(above); make sure we aren't racing with FEM. 2879 * FEM sets the v_op to a special, internal, vnodeops that wouldn't 2880 * make sense to the callers of this routine. 2881 */ 2882 vnodeops_t * 2883 vn_getops(vnode_t *vp) 2884 { 2885 vnodeops_t *op; 2886 2887 ASSERT(vp != NULL); 2888 2889 op = vp->v_op; 2890 membar_consumer(); 2891 if (vp->v_femhead == NULL && op == vp->v_op) { 2892 return (op); 2893 } else { 2894 return (fem_getvnops(vp)); 2895 } 2896 } 2897 2898 /* 2899 * Returns non-zero (1) if the vnodeops matches that of the vnode. 2900 * Returns zero (0) if not. 2901 */ 2902 int 2903 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops) 2904 { 2905 return (vn_getops(vp) == vnodeops); 2906 } 2907 2908 /* 2909 * Returns non-zero (1) if the specified operation matches the 2910 * corresponding operation for that the vnode. 2911 * Returns zero (0) if not. 
2912 */ 2913 2914 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0)) 2915 2916 int 2917 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp) 2918 { 2919 const fs_operation_trans_def_t *otdp; 2920 fs_generic_func_p *loc = NULL; 2921 vnodeops_t *vop = vn_getops(vp); 2922 2923 ASSERT(vopname != NULL); 2924 2925 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) { 2926 if (MATCHNAME(otdp->name, vopname)) { 2927 loc = (fs_generic_func_p *) 2928 ((char *)(vop) + otdp->offset); 2929 break; 2930 } 2931 } 2932 2933 return ((loc != NULL) && (*loc == funcp)); 2934 } 2935 2936 /* 2937 * fs_new_caller_id() needs to return a unique ID on a given local system. 2938 * The IDs do not need to survive across reboots. These are primarily 2939 * used so that (FEM) monitors can detect particular callers (such as 2940 * the NFS server) to a given vnode/vfs operation. 2941 */ 2942 u_longlong_t 2943 fs_new_caller_id() 2944 { 2945 static uint64_t next_caller_id = 0LL; /* First call returns 1 */ 2946 2947 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id)); 2948 } 2949 2950 /* 2951 * Given a starting vnode and a path, updates the path in the target vnode in 2952 * a safe manner. If the vnode already has path information embedded, then the 2953 * cached path is left untouched. 2954 */ 2955 2956 size_t max_vnode_path = 4 * MAXPATHLEN; 2957 2958 void 2959 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, 2960 const char *path, size_t plen) 2961 { 2962 char *rpath; 2963 vnode_t *base; 2964 size_t rpathlen, rpathalloc; 2965 int doslash = 1; 2966 2967 if (*path == '/') { 2968 base = rootvp; 2969 path++; 2970 plen--; 2971 } else { 2972 base = startvp; 2973 } 2974 2975 /* 2976 * We cannot grab base->v_lock while we hold vp->v_lock because of 2977 * the potential for deadlock. 
2978 */ 2979 mutex_enter(&base->v_lock); 2980 if (base->v_path == NULL) { 2981 mutex_exit(&base->v_lock); 2982 return; 2983 } 2984 2985 rpathlen = strlen(base->v_path); 2986 rpathalloc = rpathlen + plen + 1; 2987 /* Avoid adding a slash if there's already one there */ 2988 if (base->v_path[rpathlen-1] == '/') 2989 doslash = 0; 2990 else 2991 rpathalloc++; 2992 2993 /* 2994 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held, 2995 * so we must do this dance. If, by chance, something changes the path, 2996 * just give up since there is no real harm. 2997 */ 2998 mutex_exit(&base->v_lock); 2999 3000 /* Paths should stay within reason */ 3001 if (rpathalloc > max_vnode_path) 3002 return; 3003 3004 rpath = kmem_alloc(rpathalloc, KM_SLEEP); 3005 3006 mutex_enter(&base->v_lock); 3007 if (base->v_path == NULL || strlen(base->v_path) != rpathlen) { 3008 mutex_exit(&base->v_lock); 3009 kmem_free(rpath, rpathalloc); 3010 return; 3011 } 3012 bcopy(base->v_path, rpath, rpathlen); 3013 mutex_exit(&base->v_lock); 3014 3015 if (doslash) 3016 rpath[rpathlen++] = '/'; 3017 bcopy(path, rpath + rpathlen, plen); 3018 rpath[rpathlen + plen] = '\0'; 3019 3020 mutex_enter(&vp->v_lock); 3021 if (vp->v_path != NULL) { 3022 mutex_exit(&vp->v_lock); 3023 kmem_free(rpath, rpathalloc); 3024 } else { 3025 vp->v_path = rpath; 3026 mutex_exit(&vp->v_lock); 3027 } 3028 } 3029 3030 /* 3031 * Sets the path to the vnode to be the given string, regardless of current 3032 * context. The string must be a complete path from rootdir. This is only used 3033 * by fsop_root() for setting the path based on the mountpoint. 
3034 */ 3035 void 3036 vn_setpath_str(struct vnode *vp, const char *str, size_t len) 3037 { 3038 char *buf = kmem_alloc(len + 1, KM_SLEEP); 3039 3040 mutex_enter(&vp->v_lock); 3041 if (vp->v_path != NULL) { 3042 mutex_exit(&vp->v_lock); 3043 kmem_free(buf, len + 1); 3044 return; 3045 } 3046 3047 vp->v_path = buf; 3048 bcopy(str, vp->v_path, len); 3049 vp->v_path[len] = '\0'; 3050 3051 mutex_exit(&vp->v_lock); 3052 } 3053 3054 /* 3055 * Called from within filesystem's vop_rename() to handle renames once the 3056 * target vnode is available. 3057 */ 3058 void 3059 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len) 3060 { 3061 char *tmp; 3062 3063 mutex_enter(&vp->v_lock); 3064 tmp = vp->v_path; 3065 vp->v_path = NULL; 3066 mutex_exit(&vp->v_lock); 3067 vn_setpath(rootdir, dvp, vp, nm, len); 3068 if (tmp != NULL) 3069 kmem_free(tmp, strlen(tmp) + 1); 3070 } 3071 3072 /* 3073 * Similar to vn_setpath_str(), this function sets the path of the destination 3074 * vnode to the be the same as the source vnode. 3075 */ 3076 void 3077 vn_copypath(struct vnode *src, struct vnode *dst) 3078 { 3079 char *buf; 3080 int alloc; 3081 3082 mutex_enter(&src->v_lock); 3083 if (src->v_path == NULL) { 3084 mutex_exit(&src->v_lock); 3085 return; 3086 } 3087 alloc = strlen(src->v_path) + 1; 3088 3089 /* avoid kmem_alloc() with lock held */ 3090 mutex_exit(&src->v_lock); 3091 buf = kmem_alloc(alloc, KM_SLEEP); 3092 mutex_enter(&src->v_lock); 3093 if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) { 3094 mutex_exit(&src->v_lock); 3095 kmem_free(buf, alloc); 3096 return; 3097 } 3098 bcopy(src->v_path, buf, alloc); 3099 mutex_exit(&src->v_lock); 3100 3101 mutex_enter(&dst->v_lock); 3102 if (dst->v_path != NULL) { 3103 mutex_exit(&dst->v_lock); 3104 kmem_free(buf, alloc); 3105 return; 3106 } 3107 dst->v_path = buf; 3108 mutex_exit(&dst->v_lock); 3109 } 3110 3111 /* 3112 * XXX Private interface for segvn routines that handle vnode 3113 * large page segments. 
3114 * 3115 * return 1 if vp's file system VOP_PAGEIO() implementation 3116 * can be safely used instead of VOP_GETPAGE() for handling 3117 * pagefaults against regular non swap files. VOP_PAGEIO() 3118 * interface is considered safe here if its implementation 3119 * is very close to VOP_GETPAGE() implementation. 3120 * e.g. It zero's out the part of the page beyond EOF. Doesn't 3121 * panic if there're file holes but instead returns an error. 3122 * Doesn't assume file won't be changed by user writes, etc. 3123 * 3124 * return 0 otherwise. 3125 * 3126 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs. 3127 */ 3128 int 3129 vn_vmpss_usepageio(vnode_t *vp) 3130 { 3131 vfs_t *vfsp = vp->v_vfsp; 3132 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name; 3133 char *pageio_ok_fss[] = {"ufs", "nfs", NULL}; 3134 char **fsok = pageio_ok_fss; 3135 3136 if (fsname == NULL) { 3137 return (0); 3138 } 3139 3140 for (; *fsok; fsok++) { 3141 if (strcmp(*fsok, fsname) == 0) { 3142 return (1); 3143 } 3144 } 3145 return (0); 3146 } 3147 3148 /* VOP_XXX() macros call the corresponding fop_xxx() function */ 3149 3150 int 3151 fop_open( 3152 vnode_t **vpp, 3153 int mode, 3154 cred_t *cr, 3155 caller_context_t *ct) 3156 { 3157 int ret; 3158 vnode_t *vp = *vpp; 3159 3160 VN_HOLD(vp); 3161 /* 3162 * Adding to the vnode counts before calling open 3163 * avoids the need for a mutex. It circumvents a race 3164 * condition where a query made on the vnode counts results in a 3165 * false negative. The inquirer goes away believing the file is 3166 * not open when there is an open on the file already under way. 3167 * 3168 * The counts are meant to prevent NFS from granting a delegation 3169 * when it would be dangerous to do so. 
3170 * 3171 * The vnode counts are only kept on regular files 3172 */ 3173 if ((*vpp)->v_type == VREG) { 3174 if (mode & FREAD) 3175 atomic_inc_32(&(*vpp)->v_rdcnt); 3176 if (mode & FWRITE) 3177 atomic_inc_32(&(*vpp)->v_wrcnt); 3178 } 3179 3180 VOPXID_MAP_CR(vp, cr); 3181 3182 ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct); 3183 3184 if (ret) { 3185 /* 3186 * Use the saved vp just in case the vnode ptr got trashed 3187 * by the error. 3188 */ 3189 VOPSTATS_UPDATE(vp, open); 3190 if ((vp->v_type == VREG) && (mode & FREAD)) 3191 atomic_dec_32(&vp->v_rdcnt); 3192 if ((vp->v_type == VREG) && (mode & FWRITE)) 3193 atomic_dec_32(&vp->v_wrcnt); 3194 } else { 3195 /* 3196 * Some filesystems will return a different vnode, 3197 * but the same path was still used to open it. 3198 * So if we do change the vnode and need to 3199 * copy over the path, do so here, rather than special 3200 * casing each filesystem. Adjust the vnode counts to 3201 * reflect the vnode switch. 3202 */ 3203 VOPSTATS_UPDATE(*vpp, open); 3204 if (*vpp != vp && *vpp != NULL) { 3205 vn_copypath(vp, *vpp); 3206 if (((*vpp)->v_type == VREG) && (mode & FREAD)) 3207 atomic_inc_32(&(*vpp)->v_rdcnt); 3208 if ((vp->v_type == VREG) && (mode & FREAD)) 3209 atomic_dec_32(&vp->v_rdcnt); 3210 if (((*vpp)->v_type == VREG) && (mode & FWRITE)) 3211 atomic_inc_32(&(*vpp)->v_wrcnt); 3212 if ((vp->v_type == VREG) && (mode & FWRITE)) 3213 atomic_dec_32(&vp->v_wrcnt); 3214 } 3215 } 3216 VN_RELE(vp); 3217 return (ret); 3218 } 3219 3220 int 3221 fop_close( 3222 vnode_t *vp, 3223 int flag, 3224 int count, 3225 offset_t offset, 3226 cred_t *cr, 3227 caller_context_t *ct) 3228 { 3229 int err; 3230 3231 VOPXID_MAP_CR(vp, cr); 3232 3233 err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct); 3234 VOPSTATS_UPDATE(vp, close); 3235 /* 3236 * Check passed in count to handle possible dups. 
Vnode counts are only 3237 * kept on regular files 3238 */ 3239 if ((vp->v_type == VREG) && (count == 1)) { 3240 if (flag & FREAD) { 3241 ASSERT(vp->v_rdcnt > 0); 3242 atomic_dec_32(&vp->v_rdcnt); 3243 } 3244 if (flag & FWRITE) { 3245 ASSERT(vp->v_wrcnt > 0); 3246 atomic_dec_32(&vp->v_wrcnt); 3247 } 3248 } 3249 return (err); 3250 } 3251 3252 int 3253 fop_read( 3254 vnode_t *vp, 3255 uio_t *uiop, 3256 int ioflag, 3257 cred_t *cr, 3258 caller_context_t *ct) 3259 { 3260 int err; 3261 ssize_t resid_start = uiop->uio_resid; 3262 3263 VOPXID_MAP_CR(vp, cr); 3264 3265 err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); 3266 VOPSTATS_UPDATE_IO(vp, read, 3267 read_bytes, (resid_start - uiop->uio_resid)); 3268 return (err); 3269 } 3270 3271 int 3272 fop_write( 3273 vnode_t *vp, 3274 uio_t *uiop, 3275 int ioflag, 3276 cred_t *cr, 3277 caller_context_t *ct) 3278 { 3279 int err; 3280 ssize_t resid_start = uiop->uio_resid; 3281 3282 VOPXID_MAP_CR(vp, cr); 3283 3284 err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); 3285 VOPSTATS_UPDATE_IO(vp, write, 3286 write_bytes, (resid_start - uiop->uio_resid)); 3287 return (err); 3288 } 3289 3290 int 3291 fop_ioctl( 3292 vnode_t *vp, 3293 int cmd, 3294 intptr_t arg, 3295 int flag, 3296 cred_t *cr, 3297 int *rvalp, 3298 caller_context_t *ct) 3299 { 3300 int err; 3301 3302 VOPXID_MAP_CR(vp, cr); 3303 3304 err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct); 3305 VOPSTATS_UPDATE(vp, ioctl); 3306 return (err); 3307 } 3308 3309 int 3310 fop_setfl( 3311 vnode_t *vp, 3312 int oflags, 3313 int nflags, 3314 cred_t *cr, 3315 caller_context_t *ct) 3316 { 3317 int err; 3318 3319 VOPXID_MAP_CR(vp, cr); 3320 3321 err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct); 3322 VOPSTATS_UPDATE(vp, setfl); 3323 return (err); 3324 } 3325 3326 int 3327 fop_getattr( 3328 vnode_t *vp, 3329 vattr_t *vap, 3330 int flags, 3331 cred_t *cr, 3332 caller_context_t *ct) 3333 { 3334 int err; 3335 3336 VOPXID_MAP_CR(vp, cr); 3337 3338 /* 3339 
* If this file system doesn't understand the xvattr extensions 3340 * then turn off the xvattr bit. 3341 */ 3342 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) { 3343 vap->va_mask &= ~AT_XVATTR; 3344 } 3345 3346 /* 3347 * We're only allowed to skip the ACL check iff we used a 32 bit 3348 * ACE mask with VOP_ACCESS() to determine permissions. 3349 */ 3350 if ((flags & ATTR_NOACLCHECK) && 3351 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 3352 return (EINVAL); 3353 } 3354 err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct); 3355 VOPSTATS_UPDATE(vp, getattr); 3356 return (err); 3357 } 3358 3359 int 3360 fop_setattr( 3361 vnode_t *vp, 3362 vattr_t *vap, 3363 int flags, 3364 cred_t *cr, 3365 caller_context_t *ct) 3366 { 3367 int err; 3368 3369 VOPXID_MAP_CR(vp, cr); 3370 3371 /* 3372 * If this file system doesn't understand the xvattr extensions 3373 * then turn off the xvattr bit. 3374 */ 3375 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) { 3376 vap->va_mask &= ~AT_XVATTR; 3377 } 3378 3379 /* 3380 * We're only allowed to skip the ACL check iff we used a 32 bit 3381 * ACE mask with VOP_ACCESS() to determine permissions. 
3382 */ 3383 if ((flags & ATTR_NOACLCHECK) && 3384 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 3385 return (EINVAL); 3386 } 3387 err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct); 3388 VOPSTATS_UPDATE(vp, setattr); 3389 return (err); 3390 } 3391 3392 int 3393 fop_access( 3394 vnode_t *vp, 3395 int mode, 3396 int flags, 3397 cred_t *cr, 3398 caller_context_t *ct) 3399 { 3400 int err; 3401 3402 if ((flags & V_ACE_MASK) && 3403 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 3404 return (EINVAL); 3405 } 3406 3407 VOPXID_MAP_CR(vp, cr); 3408 3409 err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct); 3410 VOPSTATS_UPDATE(vp, access); 3411 return (err); 3412 } 3413 3414 int 3415 fop_lookup( 3416 vnode_t *dvp, 3417 char *nm, 3418 vnode_t **vpp, 3419 pathname_t *pnp, 3420 int flags, 3421 vnode_t *rdir, 3422 cred_t *cr, 3423 caller_context_t *ct, 3424 int *deflags, /* Returned per-dirent flags */ 3425 pathname_t *ppnp) /* Returned case-preserved name in directory */ 3426 { 3427 int ret; 3428 3429 /* 3430 * If this file system doesn't support case-insensitive access 3431 * and said access is requested, fail quickly. It is required 3432 * that if the vfs supports case-insensitive lookup, it also 3433 * supports extended dirent flags. 
3434 */ 3435 if (flags & FIGNORECASE && 3436 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3437 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3438 return (EINVAL); 3439 3440 VOPXID_MAP_CR(dvp, cr); 3441 3442 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) { 3443 ret = xattr_dir_lookup(dvp, vpp, flags, cr); 3444 } else { 3445 ret = (*(dvp)->v_op->vop_lookup) 3446 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp); 3447 } 3448 if (ret == 0 && *vpp) { 3449 VOPSTATS_UPDATE(*vpp, lookup); 3450 if ((*vpp)->v_path == NULL) { 3451 vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm)); 3452 } 3453 } 3454 3455 return (ret); 3456 } 3457 3458 int 3459 fop_create( 3460 vnode_t *dvp, 3461 char *name, 3462 vattr_t *vap, 3463 vcexcl_t excl, 3464 int mode, 3465 vnode_t **vpp, 3466 cred_t *cr, 3467 int flags, 3468 caller_context_t *ct, 3469 vsecattr_t *vsecp) /* ACL to set during create */ 3470 { 3471 int ret; 3472 3473 if (vsecp != NULL && 3474 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) { 3475 return (EINVAL); 3476 } 3477 /* 3478 * If this file system doesn't support case-insensitive access 3479 * and said access is requested, fail quickly. 
3480 */ 3481 if (flags & FIGNORECASE && 3482 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3483 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3484 return (EINVAL); 3485 3486 VOPXID_MAP_CR(dvp, cr); 3487 3488 ret = (*(dvp)->v_op->vop_create) 3489 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); 3490 if (ret == 0 && *vpp) { 3491 VOPSTATS_UPDATE(*vpp, create); 3492 if ((*vpp)->v_path == NULL) { 3493 vn_setpath(rootdir, dvp, *vpp, name, strlen(name)); 3494 } 3495 } 3496 3497 return (ret); 3498 } 3499 3500 int 3501 fop_remove( 3502 vnode_t *dvp, 3503 char *nm, 3504 cred_t *cr, 3505 caller_context_t *ct, 3506 int flags) 3507 { 3508 int err; 3509 3510 /* 3511 * If this file system doesn't support case-insensitive access 3512 * and said access is requested, fail quickly. 3513 */ 3514 if (flags & FIGNORECASE && 3515 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3516 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3517 return (EINVAL); 3518 3519 VOPXID_MAP_CR(dvp, cr); 3520 3521 err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags); 3522 VOPSTATS_UPDATE(dvp, remove); 3523 return (err); 3524 } 3525 3526 int 3527 fop_link( 3528 vnode_t *tdvp, 3529 vnode_t *svp, 3530 char *tnm, 3531 cred_t *cr, 3532 caller_context_t *ct, 3533 int flags) 3534 { 3535 int err; 3536 3537 /* 3538 * If the target file system doesn't support case-insensitive access 3539 * and said access is requested, fail quickly. 
3540 */ 3541 if (flags & FIGNORECASE && 3542 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3543 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3544 return (EINVAL); 3545 3546 VOPXID_MAP_CR(tdvp, cr); 3547 3548 err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags); 3549 VOPSTATS_UPDATE(tdvp, link); 3550 return (err); 3551 } 3552 3553 int 3554 fop_rename( 3555 vnode_t *sdvp, 3556 char *snm, 3557 vnode_t *tdvp, 3558 char *tnm, 3559 cred_t *cr, 3560 caller_context_t *ct, 3561 int flags) 3562 { 3563 int err; 3564 3565 /* 3566 * If the file system involved does not support 3567 * case-insensitive access and said access is requested, fail 3568 * quickly. 3569 */ 3570 if (flags & FIGNORECASE && 3571 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3572 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))) 3573 return (EINVAL); 3574 3575 VOPXID_MAP_CR(tdvp, cr); 3576 3577 err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags); 3578 VOPSTATS_UPDATE(sdvp, rename); 3579 return (err); 3580 } 3581 3582 int 3583 fop_mkdir( 3584 vnode_t *dvp, 3585 char *dirname, 3586 vattr_t *vap, 3587 vnode_t **vpp, 3588 cred_t *cr, 3589 caller_context_t *ct, 3590 int flags, 3591 vsecattr_t *vsecp) /* ACL to set during create */ 3592 { 3593 int ret; 3594 3595 if (vsecp != NULL && 3596 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) { 3597 return (EINVAL); 3598 } 3599 /* 3600 * If this file system doesn't support case-insensitive access 3601 * and said access is requested, fail quickly. 
3602 */ 3603 if (flags & FIGNORECASE && 3604 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3605 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3606 return (EINVAL); 3607 3608 VOPXID_MAP_CR(dvp, cr); 3609 3610 ret = (*(dvp)->v_op->vop_mkdir) 3611 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); 3612 if (ret == 0 && *vpp) { 3613 VOPSTATS_UPDATE(*vpp, mkdir); 3614 if ((*vpp)->v_path == NULL) { 3615 vn_setpath(rootdir, dvp, *vpp, dirname, 3616 strlen(dirname)); 3617 } 3618 } 3619 3620 return (ret); 3621 } 3622 3623 int 3624 fop_rmdir( 3625 vnode_t *dvp, 3626 char *nm, 3627 vnode_t *cdir, 3628 cred_t *cr, 3629 caller_context_t *ct, 3630 int flags) 3631 { 3632 int err; 3633 3634 /* 3635 * If this file system doesn't support case-insensitive access 3636 * and said access is requested, fail quickly. 3637 */ 3638 if (flags & FIGNORECASE && 3639 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3640 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3641 return (EINVAL); 3642 3643 VOPXID_MAP_CR(dvp, cr); 3644 3645 err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags); 3646 VOPSTATS_UPDATE(dvp, rmdir); 3647 return (err); 3648 } 3649 3650 int 3651 fop_readdir( 3652 vnode_t *vp, 3653 uio_t *uiop, 3654 cred_t *cr, 3655 int *eofp, 3656 caller_context_t *ct, 3657 int flags) 3658 { 3659 int err; 3660 ssize_t resid_start = uiop->uio_resid; 3661 3662 /* 3663 * If this file system doesn't support retrieving directory 3664 * entry flags and said access is requested, fail quickly. 
3665 */ 3666 if (flags & V_RDDIR_ENTFLAGS && 3667 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0) 3668 return (EINVAL); 3669 3670 VOPXID_MAP_CR(vp, cr); 3671 3672 err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags); 3673 VOPSTATS_UPDATE_IO(vp, readdir, 3674 readdir_bytes, (resid_start - uiop->uio_resid)); 3675 return (err); 3676 } 3677 3678 int 3679 fop_symlink( 3680 vnode_t *dvp, 3681 char *linkname, 3682 vattr_t *vap, 3683 char *target, 3684 cred_t *cr, 3685 caller_context_t *ct, 3686 int flags) 3687 { 3688 int err; 3689 xvattr_t xvattr; 3690 3691 /* 3692 * If this file system doesn't support case-insensitive access 3693 * and said access is requested, fail quickly. 3694 */ 3695 if (flags & FIGNORECASE && 3696 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 && 3697 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)) 3698 return (EINVAL); 3699 3700 VOPXID_MAP_CR(dvp, cr); 3701 3702 /* check for reparse point */ 3703 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) && 3704 (strncmp(target, FS_REPARSE_TAG_STR, 3705 strlen(FS_REPARSE_TAG_STR)) == 0)) { 3706 if (!fs_reparse_mark(target, vap, &xvattr)) 3707 vap = (vattr_t *)&xvattr; 3708 } 3709 3710 err = (*(dvp)->v_op->vop_symlink) 3711 (dvp, linkname, vap, target, cr, ct, flags); 3712 VOPSTATS_UPDATE(dvp, symlink); 3713 return (err); 3714 } 3715 3716 int 3717 fop_readlink( 3718 vnode_t *vp, 3719 uio_t *uiop, 3720 cred_t *cr, 3721 caller_context_t *ct) 3722 { 3723 int err; 3724 3725 VOPXID_MAP_CR(vp, cr); 3726 3727 err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct); 3728 VOPSTATS_UPDATE(vp, readlink); 3729 return (err); 3730 } 3731 3732 int 3733 fop_fsync( 3734 vnode_t *vp, 3735 int syncflag, 3736 cred_t *cr, 3737 caller_context_t *ct) 3738 { 3739 int err; 3740 3741 VOPXID_MAP_CR(vp, cr); 3742 3743 err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct); 3744 VOPSTATS_UPDATE(vp, fsync); 3745 return (err); 3746 } 3747 3748 void 3749 fop_inactive( 3750 vnode_t *vp, 3751 cred_t *cr, 3752 
caller_context_t *ct) 3753 { 3754 /* Need to update stats before vop call since we may lose the vnode */ 3755 VOPSTATS_UPDATE(vp, inactive); 3756 3757 VOPXID_MAP_CR(vp, cr); 3758 3759 (*(vp)->v_op->vop_inactive)(vp, cr, ct); 3760 } 3761 3762 int 3763 fop_fid( 3764 vnode_t *vp, 3765 fid_t *fidp, 3766 caller_context_t *ct) 3767 { 3768 int err; 3769 3770 err = (*(vp)->v_op->vop_fid)(vp, fidp, ct); 3771 VOPSTATS_UPDATE(vp, fid); 3772 return (err); 3773 } 3774 3775 int 3776 fop_rwlock( 3777 vnode_t *vp, 3778 int write_lock, 3779 caller_context_t *ct) 3780 { 3781 int ret; 3782 3783 ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct)); 3784 VOPSTATS_UPDATE(vp, rwlock); 3785 return (ret); 3786 } 3787 3788 void 3789 fop_rwunlock( 3790 vnode_t *vp, 3791 int write_lock, 3792 caller_context_t *ct) 3793 { 3794 (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct); 3795 VOPSTATS_UPDATE(vp, rwunlock); 3796 } 3797 3798 int 3799 fop_seek( 3800 vnode_t *vp, 3801 offset_t ooff, 3802 offset_t *noffp, 3803 caller_context_t *ct) 3804 { 3805 int err; 3806 3807 err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct); 3808 VOPSTATS_UPDATE(vp, seek); 3809 return (err); 3810 } 3811 3812 int 3813 fop_cmp( 3814 vnode_t *vp1, 3815 vnode_t *vp2, 3816 caller_context_t *ct) 3817 { 3818 int err; 3819 3820 err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct); 3821 VOPSTATS_UPDATE(vp1, cmp); 3822 return (err); 3823 } 3824 3825 int 3826 fop_frlock( 3827 vnode_t *vp, 3828 int cmd, 3829 flock64_t *bfp, 3830 int flag, 3831 offset_t offset, 3832 struct flk_callback *flk_cbp, 3833 cred_t *cr, 3834 caller_context_t *ct) 3835 { 3836 int err; 3837 3838 VOPXID_MAP_CR(vp, cr); 3839 3840 err = (*(vp)->v_op->vop_frlock) 3841 (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); 3842 VOPSTATS_UPDATE(vp, frlock); 3843 return (err); 3844 } 3845 3846 int 3847 fop_space( 3848 vnode_t *vp, 3849 int cmd, 3850 flock64_t *bfp, 3851 int flag, 3852 offset_t offset, 3853 cred_t *cr, 3854 caller_context_t *ct) 3855 { 3856 int err; 3857 3858 
VOPXID_MAP_CR(vp, cr); 3859 3860 err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct); 3861 VOPSTATS_UPDATE(vp, space); 3862 return (err); 3863 } 3864 3865 int 3866 fop_realvp( 3867 vnode_t *vp, 3868 vnode_t **vpp, 3869 caller_context_t *ct) 3870 { 3871 int err; 3872 3873 err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct); 3874 VOPSTATS_UPDATE(vp, realvp); 3875 return (err); 3876 } 3877 3878 int 3879 fop_getpage( 3880 vnode_t *vp, 3881 offset_t off, 3882 size_t len, 3883 uint_t *protp, 3884 page_t **plarr, 3885 size_t plsz, 3886 struct seg *seg, 3887 caddr_t addr, 3888 enum seg_rw rw, 3889 cred_t *cr, 3890 caller_context_t *ct) 3891 { 3892 int err; 3893 3894 VOPXID_MAP_CR(vp, cr); 3895 3896 err = (*(vp)->v_op->vop_getpage) 3897 (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct); 3898 VOPSTATS_UPDATE(vp, getpage); 3899 return (err); 3900 } 3901 3902 int 3903 fop_putpage( 3904 vnode_t *vp, 3905 offset_t off, 3906 size_t len, 3907 int flags, 3908 cred_t *cr, 3909 caller_context_t *ct) 3910 { 3911 int err; 3912 3913 VOPXID_MAP_CR(vp, cr); 3914 3915 err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct); 3916 VOPSTATS_UPDATE(vp, putpage); 3917 return (err); 3918 } 3919 3920 int 3921 fop_map( 3922 vnode_t *vp, 3923 offset_t off, 3924 struct as *as, 3925 caddr_t *addrp, 3926 size_t len, 3927 uchar_t prot, 3928 uchar_t maxprot, 3929 uint_t flags, 3930 cred_t *cr, 3931 caller_context_t *ct) 3932 { 3933 int err; 3934 3935 VOPXID_MAP_CR(vp, cr); 3936 3937 err = (*(vp)->v_op->vop_map) 3938 (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct); 3939 VOPSTATS_UPDATE(vp, map); 3940 return (err); 3941 } 3942 3943 int 3944 fop_addmap( 3945 vnode_t *vp, 3946 offset_t off, 3947 struct as *as, 3948 caddr_t addr, 3949 size_t len, 3950 uchar_t prot, 3951 uchar_t maxprot, 3952 uint_t flags, 3953 cred_t *cr, 3954 caller_context_t *ct) 3955 { 3956 int error; 3957 u_longlong_t delta; 3958 3959 VOPXID_MAP_CR(vp, cr); 3960 3961 error = (*(vp)->v_op->vop_addmap) 3962 
(vp, off, as, addr, len, prot, maxprot, flags, cr, ct); 3963 3964 if ((!error) && (vp->v_type == VREG)) { 3965 delta = (u_longlong_t)btopr(len); 3966 /* 3967 * If file is declared MAP_PRIVATE, it can't be written back 3968 * even if open for write. Handle as read. 3969 */ 3970 if (flags & MAP_PRIVATE) { 3971 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 3972 (int64_t)delta); 3973 } else { 3974 /* 3975 * atomic_add_64 forces the fetch of a 64 bit value to 3976 * be atomic on 32 bit machines 3977 */ 3978 if (maxprot & PROT_WRITE) 3979 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)), 3980 (int64_t)delta); 3981 if (maxprot & PROT_READ) 3982 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 3983 (int64_t)delta); 3984 if (maxprot & PROT_EXEC) 3985 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 3986 (int64_t)delta); 3987 } 3988 } 3989 VOPSTATS_UPDATE(vp, addmap); 3990 return (error); 3991 } 3992 3993 int 3994 fop_delmap( 3995 vnode_t *vp, 3996 offset_t off, 3997 struct as *as, 3998 caddr_t addr, 3999 size_t len, 4000 uint_t prot, 4001 uint_t maxprot, 4002 uint_t flags, 4003 cred_t *cr, 4004 caller_context_t *ct) 4005 { 4006 int error; 4007 u_longlong_t delta; 4008 4009 VOPXID_MAP_CR(vp, cr); 4010 4011 error = (*(vp)->v_op->vop_delmap) 4012 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct); 4013 4014 /* 4015 * NFS calls into delmap twice, the first time 4016 * it simply establishes a callback mechanism and returns EAGAIN 4017 * while the real work is being done upon the second invocation. 4018 * We have to detect this here and only decrement the counts upon 4019 * the second delmap request. 
4020 */ 4021 if ((error != EAGAIN) && (vp->v_type == VREG)) { 4022 4023 delta = (u_longlong_t)btopr(len); 4024 4025 if (flags & MAP_PRIVATE) { 4026 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 4027 (int64_t)(-delta)); 4028 } else { 4029 /* 4030 * atomic_add_64 forces the fetch of a 64 bit value 4031 * to be atomic on 32 bit machines 4032 */ 4033 if (maxprot & PROT_WRITE) 4034 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)), 4035 (int64_t)(-delta)); 4036 if (maxprot & PROT_READ) 4037 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 4038 (int64_t)(-delta)); 4039 if (maxprot & PROT_EXEC) 4040 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)), 4041 (int64_t)(-delta)); 4042 } 4043 } 4044 VOPSTATS_UPDATE(vp, delmap); 4045 return (error); 4046 } 4047 4048 4049 int 4050 fop_poll( 4051 vnode_t *vp, 4052 short events, 4053 int anyyet, 4054 short *reventsp, 4055 struct pollhead **phpp, 4056 caller_context_t *ct) 4057 { 4058 int err; 4059 4060 err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct); 4061 VOPSTATS_UPDATE(vp, poll); 4062 return (err); 4063 } 4064 4065 int 4066 fop_dump( 4067 vnode_t *vp, 4068 caddr_t addr, 4069 offset_t lbdn, 4070 offset_t dblks, 4071 caller_context_t *ct) 4072 { 4073 int err; 4074 4075 /* ensure lbdn and dblks can be passed safely to bdev_dump */ 4076 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks)) 4077 return (EIO); 4078 4079 err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct); 4080 VOPSTATS_UPDATE(vp, dump); 4081 return (err); 4082 } 4083 4084 int 4085 fop_pathconf( 4086 vnode_t *vp, 4087 int cmd, 4088 ulong_t *valp, 4089 cred_t *cr, 4090 caller_context_t *ct) 4091 { 4092 int err; 4093 4094 VOPXID_MAP_CR(vp, cr); 4095 4096 err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct); 4097 VOPSTATS_UPDATE(vp, pathconf); 4098 return (err); 4099 } 4100 4101 int 4102 fop_pageio( 4103 vnode_t *vp, 4104 struct page *pp, 4105 u_offset_t io_off, 4106 size_t io_len, 4107 int flags, 4108 cred_t *cr, 4109 caller_context_t *ct) 4110 { 
4111 int err; 4112 4113 VOPXID_MAP_CR(vp, cr); 4114 4115 err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct); 4116 VOPSTATS_UPDATE(vp, pageio); 4117 return (err); 4118 } 4119 4120 int 4121 fop_dumpctl( 4122 vnode_t *vp, 4123 int action, 4124 offset_t *blkp, 4125 caller_context_t *ct) 4126 { 4127 int err; 4128 err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct); 4129 VOPSTATS_UPDATE(vp, dumpctl); 4130 return (err); 4131 } 4132 4133 void 4134 fop_dispose( 4135 vnode_t *vp, 4136 page_t *pp, 4137 int flag, 4138 int dn, 4139 cred_t *cr, 4140 caller_context_t *ct) 4141 { 4142 /* Must do stats first since it's possible to lose the vnode */ 4143 VOPSTATS_UPDATE(vp, dispose); 4144 4145 VOPXID_MAP_CR(vp, cr); 4146 4147 (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct); 4148 } 4149 4150 int 4151 fop_setsecattr( 4152 vnode_t *vp, 4153 vsecattr_t *vsap, 4154 int flag, 4155 cred_t *cr, 4156 caller_context_t *ct) 4157 { 4158 int err; 4159 4160 VOPXID_MAP_CR(vp, cr); 4161 4162 /* 4163 * We're only allowed to skip the ACL check iff we used a 32 bit 4164 * ACE mask with VOP_ACCESS() to determine permissions. 4165 */ 4166 if ((flag & ATTR_NOACLCHECK) && 4167 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 4168 return (EINVAL); 4169 } 4170 err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct); 4171 VOPSTATS_UPDATE(vp, setsecattr); 4172 return (err); 4173 } 4174 4175 int 4176 fop_getsecattr( 4177 vnode_t *vp, 4178 vsecattr_t *vsap, 4179 int flag, 4180 cred_t *cr, 4181 caller_context_t *ct) 4182 { 4183 int err; 4184 4185 /* 4186 * We're only allowed to skip the ACL check iff we used a 32 bit 4187 * ACE mask with VOP_ACCESS() to determine permissions. 
4188 */ 4189 if ((flag & ATTR_NOACLCHECK) && 4190 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) { 4191 return (EINVAL); 4192 } 4193 4194 VOPXID_MAP_CR(vp, cr); 4195 4196 err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct); 4197 VOPSTATS_UPDATE(vp, getsecattr); 4198 return (err); 4199 } 4200 4201 int 4202 fop_shrlock( 4203 vnode_t *vp, 4204 int cmd, 4205 struct shrlock *shr, 4206 int flag, 4207 cred_t *cr, 4208 caller_context_t *ct) 4209 { 4210 int err; 4211 4212 VOPXID_MAP_CR(vp, cr); 4213 4214 err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct); 4215 VOPSTATS_UPDATE(vp, shrlock); 4216 return (err); 4217 } 4218 4219 int 4220 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm, 4221 caller_context_t *ct) 4222 { 4223 int err; 4224 4225 err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct); 4226 VOPSTATS_UPDATE(vp, vnevent); 4227 return (err); 4228 } 4229 4230 int 4231 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr, 4232 caller_context_t *ct) 4233 { 4234 int err; 4235 4236 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) 4237 return (ENOTSUP); 4238 err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct); 4239 VOPSTATS_UPDATE(vp, reqzcbuf); 4240 return (err); 4241 } 4242 4243 int 4244 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct) 4245 { 4246 int err; 4247 4248 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) 4249 return (ENOTSUP); 4250 err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct); 4251 VOPSTATS_UPDATE(vp, retzcbuf); 4252 return (err); 4253 } 4254 4255 /* 4256 * Default destructor 4257 * Needed because NULL destructor means that the key is unused 4258 */ 4259 /* ARGSUSED */ 4260 void 4261 vsd_defaultdestructor(void *value) 4262 {} 4263 4264 /* 4265 * Create a key (index into per vnode array) 4266 * Locks out vsd_create, vsd_destroy, and vsd_free 4267 * May allocate memory with lock held 4268 */ 4269 void 4270 vsd_create(uint_t 
*keyp, void (*destructor)(void *)) 4271 { 4272 int i; 4273 uint_t nkeys; 4274 4275 /* 4276 * if key is allocated, do nothing 4277 */ 4278 mutex_enter(&vsd_lock); 4279 if (*keyp) { 4280 mutex_exit(&vsd_lock); 4281 return; 4282 } 4283 /* 4284 * find an unused key 4285 */ 4286 if (destructor == NULL) 4287 destructor = vsd_defaultdestructor; 4288 4289 for (i = 0; i < vsd_nkeys; ++i) 4290 if (vsd_destructor[i] == NULL) 4291 break; 4292 4293 /* 4294 * if no unused keys, increase the size of the destructor array 4295 */ 4296 if (i == vsd_nkeys) { 4297 if ((nkeys = (vsd_nkeys << 1)) == 0) 4298 nkeys = 1; 4299 vsd_destructor = 4300 (void (**)(void *))vsd_realloc((void *)vsd_destructor, 4301 (size_t)(vsd_nkeys * sizeof (void (*)(void *))), 4302 (size_t)(nkeys * sizeof (void (*)(void *)))); 4303 vsd_nkeys = nkeys; 4304 } 4305 4306 /* 4307 * allocate the next available unused key 4308 */ 4309 vsd_destructor[i] = destructor; 4310 *keyp = i + 1; 4311 4312 /* create vsd_list, if it doesn't exist */ 4313 if (vsd_list == NULL) { 4314 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 4315 list_create(vsd_list, sizeof (struct vsd_node), 4316 offsetof(struct vsd_node, vs_nodes)); 4317 } 4318 4319 mutex_exit(&vsd_lock); 4320 } 4321 4322 /* 4323 * Destroy a key 4324 * 4325 * Assumes that the caller is preventing vsd_set and vsd_get 4326 * Locks out vsd_create, vsd_destroy, and vsd_free 4327 * May free memory with lock held 4328 */ 4329 void 4330 vsd_destroy(uint_t *keyp) 4331 { 4332 uint_t key; 4333 struct vsd_node *vsd; 4334 4335 /* 4336 * protect the key namespace and our destructor lists 4337 */ 4338 mutex_enter(&vsd_lock); 4339 key = *keyp; 4340 *keyp = 0; 4341 4342 ASSERT(key <= vsd_nkeys); 4343 4344 /* 4345 * if the key is valid 4346 */ 4347 if (key != 0) { 4348 uint_t k = key - 1; 4349 /* 4350 * for every vnode with VSD, call key's destructor 4351 */ 4352 for (vsd = list_head(vsd_list); vsd != NULL; 4353 vsd = list_next(vsd_list, vsd)) { 4354 /* 4355 * no VSD for key in this 
vnode 4356 */ 4357 if (key > vsd->vs_nkeys) 4358 continue; 4359 /* 4360 * call destructor for key 4361 */ 4362 if (vsd->vs_value[k] && vsd_destructor[k]) 4363 (*vsd_destructor[k])(vsd->vs_value[k]); 4364 /* 4365 * reset value for key 4366 */ 4367 vsd->vs_value[k] = NULL; 4368 } 4369 /* 4370 * actually free the key (NULL destructor == unused) 4371 */ 4372 vsd_destructor[k] = NULL; 4373 } 4374 4375 mutex_exit(&vsd_lock); 4376 } 4377 4378 /* 4379 * Quickly return the per vnode value that was stored with the specified key 4380 * Assumes the caller is protecting key from vsd_create and vsd_destroy 4381 * Assumes the caller is holding v_vsd_lock to protect the vsd. 4382 */ 4383 void * 4384 vsd_get(vnode_t *vp, uint_t key) 4385 { 4386 struct vsd_node *vsd; 4387 4388 ASSERT(vp != NULL); 4389 ASSERT(mutex_owned(&vp->v_vsd_lock)); 4390 4391 vsd = vp->v_vsd; 4392 4393 if (key && vsd != NULL && key <= vsd->vs_nkeys) 4394 return (vsd->vs_value[key - 1]); 4395 return (NULL); 4396 } 4397 4398 /* 4399 * Set a per vnode value indexed with the specified key 4400 * Assumes the caller is holding v_vsd_lock to protect the vsd. 4401 */ 4402 int 4403 vsd_set(vnode_t *vp, uint_t key, void *value) 4404 { 4405 struct vsd_node *vsd; 4406 4407 ASSERT(vp != NULL); 4408 ASSERT(mutex_owned(&vp->v_vsd_lock)); 4409 4410 if (key == 0) 4411 return (EINVAL); 4412 4413 vsd = vp->v_vsd; 4414 if (vsd == NULL) 4415 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP); 4416 4417 /* 4418 * If the vsd was just allocated, vs_nkeys will be 0, so the following 4419 * code won't happen and we will continue down and allocate space for 4420 * the vs_value array. 4421 * If the caller is replacing one value with another, then it is up 4422 * to the caller to free/rele/destroy the previous value (if needed). 
4423 */ 4424 if (key <= vsd->vs_nkeys) { 4425 vsd->vs_value[key - 1] = value; 4426 return (0); 4427 } 4428 4429 ASSERT(key <= vsd_nkeys); 4430 4431 if (vsd->vs_nkeys == 0) { 4432 mutex_enter(&vsd_lock); /* lock out vsd_destroy() */ 4433 /* 4434 * Link onto list of all VSD nodes. 4435 */ 4436 list_insert_head(vsd_list, vsd); 4437 mutex_exit(&vsd_lock); 4438 } 4439 4440 /* 4441 * Allocate vnode local storage and set the value for key 4442 */ 4443 vsd->vs_value = vsd_realloc(vsd->vs_value, 4444 vsd->vs_nkeys * sizeof (void *), 4445 key * sizeof (void *)); 4446 vsd->vs_nkeys = key; 4447 vsd->vs_value[key - 1] = value; 4448 4449 return (0); 4450 } 4451 4452 /* 4453 * Called from vn_free() to run the destructor function for each vsd 4454 * Locks out vsd_create and vsd_destroy 4455 * Assumes that the destructor *DOES NOT* use vsd 4456 */ 4457 void 4458 vsd_free(vnode_t *vp) 4459 { 4460 int i; 4461 struct vsd_node *vsd = vp->v_vsd; 4462 4463 if (vsd == NULL) 4464 return; 4465 4466 if (vsd->vs_nkeys == 0) { 4467 kmem_free(vsd, sizeof (*vsd)); 4468 vp->v_vsd = NULL; 4469 return; 4470 } 4471 4472 /* 4473 * lock out vsd_create and vsd_destroy, call 4474 * the destructor, and mark the value as destroyed. 
4475 */ 4476 mutex_enter(&vsd_lock); 4477 4478 for (i = 0; i < vsd->vs_nkeys; i++) { 4479 if (vsd->vs_value[i] && vsd_destructor[i]) 4480 (*vsd_destructor[i])(vsd->vs_value[i]); 4481 vsd->vs_value[i] = NULL; 4482 } 4483 4484 /* 4485 * remove from linked list of VSD nodes 4486 */ 4487 list_remove(vsd_list, vsd); 4488 4489 mutex_exit(&vsd_lock); 4490 4491 /* 4492 * free up the VSD 4493 */ 4494 kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *)); 4495 kmem_free(vsd, sizeof (struct vsd_node)); 4496 vp->v_vsd = NULL; 4497 } 4498 4499 /* 4500 * realloc 4501 */ 4502 static void * 4503 vsd_realloc(void *old, size_t osize, size_t nsize) 4504 { 4505 void *new; 4506 4507 new = kmem_zalloc(nsize, KM_SLEEP); 4508 if (old) { 4509 bcopy(old, new, osize); 4510 kmem_free(old, osize); 4511 } 4512 return (new); 4513 } 4514 4515 /* 4516 * Setup the extensible system attribute for creating a reparse point. 4517 * The symlink data 'target' is validated for proper format of a reparse 4518 * string and a check also made to make sure the symlink data does not 4519 * point to an existing file. 4520 * 4521 * return 0 if ok else -1. 4522 */ 4523 static int 4524 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr) 4525 { 4526 xoptattr_t *xoap; 4527 4528 if ((!target) || (!vap) || (!xvattr)) 4529 return (-1); 4530 4531 /* validate reparse string */ 4532 if (reparse_validate((const char *)target)) 4533 return (-1); 4534 4535 xva_init(xvattr); 4536 xvattr->xva_vattr = *vap; 4537 xvattr->xva_vattr.va_mask |= AT_XVATTR; 4538 xoap = xva_getxoptattr(xvattr); 4539 ASSERT(xoap); 4540 XVA_SET_REQ(xvattr, XAT_REPARSE); 4541 xoap->xoa_reparse = 1; 4542 4543 return (0); 4544 } 4545 4546 /* 4547 * Function to check whether a symlink is a reparse point. 
4548 * Return B_TRUE if it is a reparse point, else return B_FALSE 4549 */ 4550 boolean_t 4551 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4552 { 4553 xvattr_t xvattr; 4554 xoptattr_t *xoap; 4555 4556 if ((vp->v_type != VLNK) || 4557 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR))) 4558 return (B_FALSE); 4559 4560 xva_init(&xvattr); 4561 xoap = xva_getxoptattr(&xvattr); 4562 ASSERT(xoap); 4563 XVA_SET_REQ(&xvattr, XAT_REPARSE); 4564 4565 if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct)) 4566 return (B_FALSE); 4567 4568 if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) || 4569 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE)))) 4570 return (B_FALSE); 4571 4572 return (xoap->xoa_reparse ? B_TRUE : B_FALSE); 4573 } 4574