/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/* Portions Copyright 2007 Shivakumar GN */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/dirent.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/vfs.h>
#include <sys/vnode.h>

#include <vm/as.h>
#include <vm/seg_vn.h>

#include <sys/gfs.h>

/*
 * Generic pseudo-filesystem routines.
 *
 * There are significant similarities between the implementation of certain
 * file system entry points across different filesystems.  While one could
 * attempt to "choke up on the bat" and incorporate common functionality into
 * a VOP preamble or postamble, such an approach is limited in the benefit it
 * can provide.  In this file we instead define a toolkit of routines which
 * can be called from a filesystem (with in-kernel pseudo-filesystems being
 * the focus of the exercise) in a more component-like fashion.
 *
 * There are three basic classes of routines:
 *
 * 1) Low-level support routines
 *
 *    These routines are designed to play a support role for existing
 *    pseudo-filesystems (such as procfs).  They simplify common tasks,
 *    without forcing the filesystem to hand over management to GFS.  The
 *    routines covered are:
 *
 *	gfs_readdir_init()
 *	gfs_readdir_emit()
 *	gfs_readdir_emitn()
 *	gfs_readdir_pred()
 *	gfs_readdir_fini()
 *	gfs_lookup_dot()
 *
 * 2) Complete GFS management
 *
 *    These routines take a more active role in management of the
 *    pseudo-filesystem.  They handle the relationship between vnode private
 *    data and VFS data, as well as the relationship between vnodes in the
 *    directory hierarchy.
 *
 *    In order to use these interfaces, the first member of every private
 *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
 *    to GFS.
 *
 *	gfs_file_create()
 *	gfs_dir_create()
 *	gfs_root_create()
 *
 *	gfs_file_inactive()
 *	gfs_dir_inactive()
 *	gfs_dir_lookup()
 *	gfs_dir_readdir()
 *
 *	gfs_vop_inactive()
 *	gfs_vop_lookup()
 *	gfs_vop_readdir()
 *	gfs_vop_map()
 *
 * 3) Single File pseudo-filesystems
 *
 *    This routine creates a rooted file to be overlaid on top of another
 *    file in the physical filespace.
 *
 *    Note that the parent is NULL (actually the vfs), but there is nothing
 *    technically keeping such a file from utilizing the "Complete GFS
 *    management" set of routines.
 *
 *	gfs_root_create_file()
 */

/*
 * gfs_make_opsvec: take an array of vnode type definitions and create
 * their vnodeops_t structures
 *
 * This routine takes an array of gfs_opsvec_t's.  It could
 * alternatively take an array of gfs_opsvec_t*'s, which would allow
 * vnode types to be completely defined in files external to the caller
 * of gfs_make_opsvec().  As it stands, much more sharing takes place --
 * both the caller and the vnode type provider need to access gfsv_ops
 * and gfsv_template, and the caller also needs to know gfsv_name.
 */
int
gfs_make_opsvec(gfs_opsvec_t *vec)
{
	int error, i;

	for (i = 0; ; i++) {
		if (vec[i].gfsv_name == NULL)
			return (0);
		error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
		    vec[i].gfsv_ops);
		if (error)
			break;
	}

	cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
	    vec[i].gfsv_name);
	for (i--; i >= 0; i--) {
		vn_freevnodeops(*vec[i].gfsv_ops);
		*vec[i].gfsv_ops = NULL;
	}
	return (error);
}
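
/*
 * For illustration only: a consumer might define one operations template
 * per vnode type and register them all in a single call.  The myfs_*
 * names below are invented for this example, and myfs_tops_file would be
 * declared the same way as myfs_tops_root:
 *
 *	static vnodeops_t *myfs_ops_root;
 *	static vnodeops_t *myfs_ops_file;
 *
 *	static const fs_operation_def_t myfs_tops_root[] = {
 *		{ VOPNAME_LOOKUP,	{ .vop_lookup = gfs_vop_lookup } },
 *		{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } },
 *		{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive } },
 *		{ NULL }
 *	};
 *
 *	static gfs_opsvec_t myfs_opsvec[] = {
 *		{ "myfs root", myfs_tops_root, &myfs_ops_root },
 *		{ "myfs file", myfs_tops_file, &myfs_ops_file },
 *		{ NULL }
 *	};
 *
 *	if ((error = gfs_make_opsvec(myfs_opsvec)) != 0)
 *		return (error);
 *
 * On failure, gfs_make_opsvec() itself frees any vnodeops_t structures it
 * already created, so the caller does not need to unwind a partial
 * registration.
 */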

/*
 * Low level directory routines
 *
 * These routines provide some simple abstractions for reading directories.
 * They are designed to be used by existing pseudo filesystems (namely procfs)
 * that already have a complicated management infrastructure.
 */

/*
 * gfs_get_parent_ino: used to obtain a parent inode number and the
 * inode number of the given vnode in preparation for calling gfs_readdir_init.
 */
int
gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
    ino64_t *pino, ino64_t *ino)
{
	vnode_t *parent;
	gfs_dir_t *dp = dvp->v_data;
	int error;

	*ino = dp->gfsd_file.gfs_ino;
	parent = dp->gfsd_file.gfs_parent;

	if (parent == NULL) {
		*pino = *ino;		/* root of filesystem */
	} else if (dvp->v_flag & V_XATTRDIR) {
		vattr_t va;

		va.va_mask = AT_NODEID;
		error = VOP_GETATTR(parent, &va, 0, cr, ct);
		if (error)
			return (error);
		*pino = va.va_nodeid;
	} else {
		*pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
	}

	return (0);
}

/*
 * gfs_readdir_init: initiate a generic readdir
 *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
 *   name_max	- the directory's maximum file name length
 *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
 *   uiop	- the uiop passed to readdir
 *   parent	- the parent directory's inode
 *   self	- this directory's inode
 *   flags	- flags from VOP_READDIR
 *
 * Returns 0 or a non-zero errno.
 *
 * Typical VOP_READDIR usage of gfs_readdir_*:
 *
 *	if ((error = gfs_readdir_init(...)) != 0)
 *		return (error);
 *	eof = 0;
 *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
 *		if (!consumer_entry_at(voffset))
 *			voffset = consumer_next_entry(voffset);
 *		if (consumer_eof(voffset)) {
 *			eof = 1;
 *			break;
 *		}
 *		if ((error = gfs_readdir_emit(..., voffset,
 *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
 *			break;
 *	}
 *	return (gfs_readdir_fini(..., error, eofp, eof));
 *
 * As you can see, a zero result from gfs_readdir_pred() or
 * gfs_readdir_emit() indicates that processing should continue,
 * whereas a non-zero result indicates that the loop should terminate.
 * Most consumers need do nothing more than let gfs_readdir_fini()
 * determine what the cause of failure was and return the appropriate
 * value.
 */
int
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
    uio_t *uiop, ino64_t parent, ino64_t self, int flags)
{
	size_t dirent_size;

	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
	    (uiop->uio_loffset % ureclen) != 0)
		return (EINVAL);

	st->grd_ureclen = ureclen;
	st->grd_oresid = uiop->uio_resid;
	st->grd_namlen = name_max;
	if (flags & V_RDDIR_ENTFLAGS)
		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
	else
		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
	st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
	st->grd_parent = parent;
	st->grd_self = self;
	st->grd_flags = flags;

	return (0);
}

/*
 * gfs_readdir_emit_int: internal routine to emit directory entry
 *
 *   st		- the current readdir state, which must have d_ino/ed_ino
 *		  and d_name/ed_name set
 *   uiop	- caller-supplied uio pointer
 *   next	- the offset of the next entry
 */
static int
gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
{
	int reclen;
	dirent64_t *dp;
	edirent_t *edp;

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp = st->grd_dirent;
		reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
	} else {
		dp = st->grd_dirent;
		reclen = DIRENT64_RECLEN(strlen(dp->d_name));
	}

	if (reclen > uiop->uio_resid) {
		/*
		 * Error if no entries were returned yet
		 */
		if (uiop->uio_resid == st->grd_oresid)
			return (EINVAL);
		return (-1);
	}

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp->ed_off = next;
		edp->ed_reclen = (ushort_t)reclen;
	} else {
		dp->d_off = next;
		dp->d_reclen = (ushort_t)reclen;
	}

	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
		return (EFAULT);

	uiop->uio_loffset = next;

	return (0);
}

/*
 * gfs_readdir_emit: emit a directory entry
 *   voff	- the virtual offset (obtained from gfs_readdir_pred)
 *   ino	- the entry's inode
 *   name	- the entry's name
 *   eflags	- value for ed_eflags (if processing edirent_t)
 *
 * Returns 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate.  A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
 */
int
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, const char *name, int eflags)
{
	offset_t off = (voff + 2) * st->grd_ureclen;

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edirent_t *edp = st->grd_dirent;

		edp->ed_ino = ino;
		(void) strncpy(edp->ed_name, name, st->grd_namlen);
		edp->ed_eflags = eflags;
	} else {
		dirent64_t *dp = st->grd_dirent;

		dp->d_ino = ino;
		(void) strncpy(dp->d_name, name, st->grd_namlen);
	}

	/*
	 * Inter-entry offsets are invalid, so we assume a record size of
	 * grd_ureclen and explicitly set the offset appropriately.
	 */
	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
}

/*
 * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
 * instead of a string for the entry's name.
 */
int
gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, unsigned long num)
{
	char buf[40];

	numtos(num, buf);
	return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
}
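
/*
 * For example, a pseudo-filesystem whose entries are numbered, in the
 * style of /proc process directories, could drive the low-level loop with
 * gfs_readdir_emitn() directly (a sketch; nentries and consumer_ino() are
 * stand-ins for the caller's own bookkeeping):
 *
 *	while ((error = gfs_readdir_pred(&gstate, uiop, &voff)) == 0) {
 *		if (voff >= nentries) {
 *			eof = 1;
 *			break;
 *		}
 *		if ((error = gfs_readdir_emitn(&gstate, uiop, voff,
 *		    consumer_ino(voff), (unsigned long)voff)) != 0)
 *			break;
 *	}
 *	return (gfs_readdir_fini(&gstate, error, eofp, eof));
 */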

/*
 * gfs_readdir_pred: readdir loop predicate
 *   voffp - a pointer in which the next virtual offset should be stored
 *
 * Returns 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate.  A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
 */
int
gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
{
	offset_t off, voff;
	int error;

top:
	if (uiop->uio_resid <= 0)
		return (-1);

	off = uiop->uio_loffset / st->grd_ureclen;
	voff = off - 2;
	if (off == 0) {
		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
		    ".", 0)) == 0)
			goto top;
	} else if (off == 1) {
		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
		    "..", 0)) == 0)
			goto top;
	} else {
		*voffp = voff;
		return (0);
	}

	return (error);
}

/*
 * gfs_readdir_fini: generic readdir cleanup
 *   error	- if positive, an error to return
 *   eofp	- the eofp passed to readdir
 *   eof	- the eof value
 *
 * Returns 0 on success, a non-zero errno on failure.  This result
 * should be returned from readdir.
 */
int
gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
{
	size_t dirent_size;

	if (st->grd_flags & V_RDDIR_ENTFLAGS)
		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
	else
		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
	kmem_free(st->grd_dirent, dirent_size);
	if (error > 0)
		return (error);
	if (eofp)
		*eofp = eof;
	return (0);
}

/*
 * gfs_lookup_dot
 *
 * Performs a basic check for "." and ".." directory entries.
 */
int
gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
{
	if (*nm == '\0' || strcmp(nm, ".") == 0) {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	} else if (strcmp(nm, "..") == 0) {
		if (pvp == NULL) {
			ASSERT(dvp->v_flag & VROOT);
			VN_HOLD(dvp);
			*vpp = dvp;
		} else {
			VN_HOLD(pvp);
			*vpp = pvp;
		}
		return (0);
	}

	return (-1);
}
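
/*
 * A typical VOP_LOOKUP implementation handles "." and ".." before
 * consulting its own namespace; sketched below (pvp is the parent vnode,
 * or NULL at the root):
 *
 *	if (gfs_lookup_dot(vpp, dvp, pvp, nm) == 0)
 *		return (0);
 *
 * Note that the -1 return value is not an errno; it simply means the name
 * was neither "." nor "..", and the caller should continue with its normal
 * lookup path.
 */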

/*
 * gfs_file_create(): create a new GFS file
 *
 *   size	- size of private data structure (v_data)
 *   pvp	- parent vnode (GFS directory)
 *   ops	- vnode operations vector
 *
 * In order to use this interface, the parent vnode must have been created by
 * gfs_dir_create(), and the private data stored in v_data must have a
 * 'gfs_file_t' as its first field.
 *
 * Given these constraints, this routine will automatically:
 *
 *	- Allocate v_data for the vnode
 *	- Initialize necessary fields in the vnode
 *	- Hold the parent
 */
vnode_t *
gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops)
{
	gfs_file_t *fp;
	vnode_t *vp;

	/*
	 * Allocate vnode and internal data structure
	 */
	fp = kmem_zalloc(size, KM_SLEEP);
	vp = vn_alloc(KM_SLEEP);

	/*
	 * Set up various pointers
	 */
	fp->gfs_vnode = vp;
	fp->gfs_parent = pvp;
	vp->v_data = fp;
	fp->gfs_size = size;
	fp->gfs_type = GFS_FILE;

	/*
	 * Initialize vnode and hold parent.
	 */
	vn_setops(vp, ops);
	if (pvp) {
		VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
		VN_HOLD(pvp);
	}

	return (vp);
}

/*
 * gfs_dir_create: creates a new directory in the parent
 *
 *   size	- size of private data structure (v_data)
 *   pvp	- parent vnode (GFS directory)
 *   ops	- vnode operations vector
 *   entries	- NULL-terminated list of static entries (if any)
 *   maxlen	- maximum length of a directory entry
 *   readdir_cb	- readdir callback (see gfs_dir_readdir)
 *   inode_cb	- inode callback (see gfs_dir_readdir)
 *   lookup_cb	- lookup callback (see gfs_dir_lookup)
 *
 * In order to use this function, the first member of the private vnode
 * structure (v_data) must be a gfs_dir_t.  For each directory, there are
 * static entries, defined when the structure is initialized, and dynamic
 * entries, retrieved through callbacks.
 *
 * If a directory has static entries, then it must supply an inode callback,
 * which will compute the inode number based on the parent and the index.
 * For a directory with dynamic entries, the caller must supply a readdir
 * callback and a lookup callback.  If a static lookup fails, we fall back to
 * the supplied lookup callback, if any.
 *
 * This function also performs the same initialization as gfs_file_create().
 */
vnode_t *
gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
{
	vnode_t *vp;
	gfs_dir_t *dp;
	gfs_dirent_t *de;

	vp = gfs_file_create(struct_size, pvp, ops);
	vp->v_type = VDIR;

	dp = vp->v_data;
	dp->gfsd_file.gfs_type = GFS_DIR;
	dp->gfsd_maxlen = maxlen;

	if (entries != NULL) {
		for (de = entries; de->gfse_name != NULL; de++)
			dp->gfsd_nstatic++;

		dp->gfsd_static = kmem_alloc(
		    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
		bcopy(entries, dp->gfsd_static,
		    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
	}

	dp->gfsd_readdir = readdir_cb;
	dp->gfsd_lookup = lookup_cb;
	dp->gfsd_inode = inode_cb;

	mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);

	return (vp);
}

/*
 * gfs_root_create(): create a root vnode for a GFS filesystem
 *
 * Similar to gfs_dir_create(), this creates a root vnode for a filesystem.
 * The only difference is that it takes a vfs_t instead of a vnode_t as its
 * parent.
 */
vnode_t *
gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
{
	vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
	    maxlen, readdir_cb, lookup_cb);

	/* Manually set the inode */
	((gfs_file_t *)vp->v_data)->gfs_ino = ino;

	VFS_HOLD(vfsp);
	VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;

	return (vp);
}

/*
 * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
 *
 * Similar to gfs_root_create(), this creates a root vnode for a file to
 * be the pseudo-filesystem.
 */
vnode_t *
gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
{
	vnode_t *vp = gfs_file_create(size, NULL, ops);

	((gfs_file_t *)vp->v_data)->gfs_ino = ino;

	VFS_HOLD(vfsp);
	VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;

	return (vp);
}
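
/*
 * To illustrate the "complete GFS management" interfaces, a hypothetical
 * filesystem (all myfs_* names invented for this example) might declare a
 * private node type whose first member is a gfs_dir_t, a table of static
 * entries, and then construct its root:
 *
 *	typedef struct myfs_node {
 *		gfs_dir_t	mn_dir;		<- must be the first member
 *		int		mn_private;
 *	} myfs_node_t;
 *
 *	static gfs_dirent_t myfs_entries[] = {
 *		{ "status", myfs_status_ctor, GFS_CACHE_VNODE },
 *		{ "ctl", myfs_ctl_ctor, 0 },
 *		{ NULL }
 *	};
 *
 *	vp = gfs_root_create(sizeof (myfs_node_t), vfsp, myfs_ops_root,
 *	    MYFS_INO_ROOT, myfs_entries, myfs_inode_cb, MAXNAMELEN,
 *	    NULL, NULL);
 *
 * Because static entries are present, an inode callback (myfs_inode_cb)
 * must be supplied; the readdir and lookup callbacks may be NULL when
 * every entry is static.
 */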

/*
 * gfs_file_inactive()
 *
 * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
 * remove the given vnode from the parent directory and clean up any references
 * in the VFS layer.
 *
 * If the vnode was not removed (due to a race with vget), then NULL is
 * returned.  Otherwise, a pointer to the private data is returned.
 */
void *
gfs_file_inactive(vnode_t *vp)
{
	int i;
	gfs_dirent_t *ge = NULL;
	gfs_file_t *fp = vp->v_data;
	gfs_dir_t *dp = NULL;
	void *data;

	if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
		goto found;

	dp = fp->gfs_parent->v_data;

	/*
	 * First, see if this vnode is cached in the parent.
	 */
	gfs_dir_lock(dp);

	/*
	 * Find it in the set of static entries.
	 */
	for (i = 0; i < dp->gfsd_nstatic; i++)  {
		ge = &dp->gfsd_static[i];

		if (ge->gfse_vnode == vp)
			goto found;
	}

	/*
	 * If 'ge' is NULL, then it is a dynamic entry.
	 */
	ge = NULL;

found:
	if (vp->v_flag & V_XATTRDIR) {
		mutex_enter(&fp->gfs_parent->v_lock);
	}
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		/*
		 * Really remove this vnode
		 */
		data = vp->v_data;
		if (ge != NULL) {
			/*
			 * If this was a statically cached entry, simply set the
			 * cached vnode to NULL.
			 */
			ge->gfse_vnode = NULL;
		}
		if (vp->v_flag & V_XATTRDIR) {
			fp->gfs_parent->v_xattrdir = NULL;
			mutex_exit(&fp->gfs_parent->v_lock);
		}
		mutex_exit(&vp->v_lock);

		/*
		 * Free vnode and release parent
		 */
		if (fp->gfs_parent) {
			if (dp) {
				gfs_dir_unlock(dp);
			}
			VN_RELE(fp->gfs_parent);
		} else {
			ASSERT(vp->v_vfsp != NULL);
			VFS_RELE(vp->v_vfsp);
		}
		vn_free(vp);
	} else {
		VN_RELE_LOCKED(vp);
		data = NULL;
		mutex_exit(&vp->v_lock);
		if (vp->v_flag & V_XATTRDIR) {
			mutex_exit(&fp->gfs_parent->v_lock);
		}
		if (dp)
			gfs_dir_unlock(dp);
	}

	return (data);
}

/*
 * gfs_dir_inactive()
 *
 * Same as above, but for directories.
 */
void *
gfs_dir_inactive(vnode_t *vp)
{
	gfs_dir_t *dp;

	ASSERT(vp->v_type == VDIR);

	if ((dp = gfs_file_inactive(vp)) != NULL) {
		mutex_destroy(&dp->gfsd_lock);
		if (dp->gfsd_nstatic)
			kmem_free(dp->gfsd_static,
			    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
	}

	return (dp);
}
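
/*
 * A filesystem whose nodes carry extra state can layer its own
 * VOP_INACTIVE on these routines.  A sketch, reusing the hypothetical
 * myfs_node_t from the example above (myfs_teardown() is invented):
 *
 *	static void
 *	myfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 *	{
 *		myfs_node_t *np;
 *
 *		if ((np = gfs_dir_inactive(vp)) != NULL) {
 *			myfs_teardown(np);	<- fs-specific cleanup
 *			kmem_free(np, sizeof (myfs_node_t));
 *		}
 *	}
 *
 * A NULL return means the vnode lost the race with a new hold and is
 * still live, in which case nothing may be freed.
 */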

/*
 * gfs_dir_lookup_dynamic()
 *
 * This routine looks up the provided name amongst the dynamic entries
 * in the gfs directory and returns the corresponding vnode, if found.
 *
 * The gfs directory is expected to be locked by the caller prior to
 * calling this function.  The directory will be unlocked during the
 * execution of this function, but will be locked upon return from the
 * function.  This function returns 0 on success, non-zero on error.
 *
 * The dynamic lookups are performed by invoking the lookup
 * callback, which is passed to this function as the first argument.
 * The arguments to the callback are:
 *
 * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
 *     ino64_t *inop, cred_t *cr, int flags, int *deflgs, pathname_t *rpnp);
 *
 *	pvp	- parent vnode
 *	nm	- name of entry
 *	vpp	- pointer to resulting vnode
 *	inop	- pointer in which to return the entry's inode number
 *	cr	- pointer to cred
 *	flags	- flags value from lookup request
 *		  ignored here; currently only used to request
 *		  insensitive lookups
 *	deflgs	- output parameter, directory entry flags
 *		  ignored here; currently only used to indicate a lookup
 *		  has more than one possible match when case is not considered
 *	rpnp	- output parameter, real pathname
 *		  ignored here; when lookup was performed case-insensitively,
 *		  this field contains the "real" name of the file.
 *
 * Returns 0 on success, non-zero on error.
 */
static int
gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
    const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
    int *direntflags, pathname_t *realpnp)
{
	gfs_file_t *fp;
	ino64_t ino;
	int ret;

	ASSERT(GFS_DIR_LOCKED(dp));

	/*
	 * Drop the directory lock, as the lookup routine
	 * will need to allocate memory, or otherwise deadlock on this
	 * directory.
	 */
	gfs_dir_unlock(dp);
	ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
	gfs_dir_lock(dp);

	/*
	 * The callback for extended attributes returns a vnode
	 * with v_data from an underlying fs.
	 */
	if (ret == 0 && !IS_XATTRDIR(dvp)) {
		fp = (gfs_file_t *)((*vpp)->v_data);
		fp->gfs_index = -1;
		fp->gfs_ino = ino;
	}

	return (ret);
}

/*
 * gfs_dir_lookup_static()
 *
 * This routine looks up the provided name amongst the static entries
 * in the gfs directory and returns the corresponding vnode, if found.
 * The first argument to the function is a pointer to the comparison
 * function this function should use to decide if names are a match.
 *
 * If a match is found, and GFS_CACHE_VNODE is set and the vnode
 * exists, we simply return the existing vnode.  Otherwise, we call
 * the static entry's callback routine, caching the result if
 * necessary.  If the idx pointer argument is non-NULL, we use it to
 * return the index of the matching static entry.
 *
 * The gfs directory is expected to be locked by the caller prior to calling
 * this function.  The directory may be unlocked during the execution of
 * this function, but will be locked upon return from the function.
 *
 * This function returns 0 if a match is found, ENOENT if not.
 */
static int
gfs_dir_lookup_static(int (*compare)(const char *, const char *),
    gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
    vnode_t **vpp, pathname_t *rpnp)
{
	gfs_dirent_t *ge;
	vnode_t *vp = NULL;
	int i;

	ASSERT(GFS_DIR_LOCKED(dp));

	/*
	 * Search static entries.
	 */
	for (i = 0; i < dp->gfsd_nstatic; i++) {
		ge = &dp->gfsd_static[i];

		if (compare(ge->gfse_name, nm) == 0) {
			if (rpnp)
				(void) strlcpy(rpnp->pn_buf, ge->gfse_name,
				    rpnp->pn_bufsize);

			if (ge->gfse_vnode) {
				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
				vp = ge->gfse_vnode;
				VN_HOLD(vp);
				break;
			}

			/*
			 * We drop the directory lock, as the constructor will
			 * need to do KM_SLEEP allocations.  If we return from
			 * the constructor only to find that a parallel
			 * operation has completed, and GFS_CACHE_VNODE is set
			 * for this entry, we discard the result in favor of
			 * the cached vnode.
			 */
			gfs_dir_unlock(dp);
			vp = ge->gfse_ctor(dvp);
			gfs_dir_lock(dp);

			((gfs_file_t *)vp->v_data)->gfs_index = i;

			/* Set the inode according to the callback. */
			((gfs_file_t *)vp->v_data)->gfs_ino =
			    dp->gfsd_inode(dvp, i);

			if (ge->gfse_flags & GFS_CACHE_VNODE) {
				if (ge->gfse_vnode == NULL) {
					ge->gfse_vnode = vp;
				} else {
					/*
					 * A parallel constructor beat us to it;
					 * return existing vnode.  We have to be
					 * careful because we can't release the
					 * current vnode while holding the
					 * directory lock; its inactive routine
					 * will try to lock this directory.
					 */
					vnode_t *oldvp = vp;
					vp = ge->gfse_vnode;
					VN_HOLD(vp);

					gfs_dir_unlock(dp);
					VN_RELE(oldvp);
					gfs_dir_lock(dp);
				}
			}
			break;
		}
	}

	if (vp == NULL)
		return (ENOENT);
	else if (idx)
		*idx = i;
	*vpp = vp;
	return (0);
}

/*
 * gfs_dir_lookup()
 *
 * Looks up the given name in the directory and returns the corresponding
 * vnode, if found.
 *
 * First, we search statically defined entries, if any, with a call to
 * gfs_dir_lookup_static().  If no static entry is found, and we have
 * a callback function, we try a dynamic lookup via gfs_dir_lookup_dynamic().
 *
 * This function returns 0 on success, non-zero on error.
 */
int
gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
    int flags, int *direntflags, pathname_t *realpnp)
{
	gfs_dir_t *dp = dvp->v_data;
	boolean_t casecheck;
	vnode_t *dynvp = NULL;
	vnode_t *vp = NULL;
	int (*compare)(const char *, const char *);
	int error, idx;

	ASSERT(dvp->v_type == VDIR);

	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
		return (0);

	casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
	if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
	    (flags & FIGNORECASE))
		compare = strcasecmp;
	else
		compare = strcmp;

	gfs_dir_lock(dp);

	error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);

	if (vp && casecheck) {
		gfs_dirent_t *ge;
		int i;

		for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
			ge = &dp->gfsd_static[i];

			if (strcasecmp(ge->gfse_name, nm) == 0) {
				*direntflags |= ED_CASE_CONFLICT;
				goto out;
			}
		}
	}

	if ((error || casecheck) && dp->gfsd_lookup)
		error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
		    &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);

	if (vp && dynvp) {
		/* static and dynamic entries are case-insensitive conflict */
		ASSERT(casecheck);
		*direntflags |= ED_CASE_CONFLICT;
		VN_RELE(dynvp);
	} else if (vp == NULL) {
		vp = dynvp;
	} else if (error == ENOENT) {
		error = 0;
	} else if (error) {
		VN_RELE(vp);
		vp = NULL;
	}

out:
	gfs_dir_unlock(dp);

	*vpp = vp;
	return (error);
}

/*
 * gfs_dir_readdir: does a readdir() on the given directory
 *
 *   dvp	- directory vnode
 *   uiop	- uio structure
 *   eofp	- eof pointer
 *   data	- arbitrary data passed to readdir callback
 *
 * This routine does all the readdir() dirty work.  Even so, the caller must
 * supply two callbacks in order to get full compatibility.
 *
 * If the directory contains static entries, an inode callback must be
 * specified.  This avoids having to create every vnode and call VOP_GETATTR()
 * when reading the directory.  This function has the following arguments:
 *
 *	ino64_t gfs_inode_cb(vnode_t *vp, int index);
 *
 *	vp	- vnode for the directory
 *	index	- index in original gfs_dirent_t array
 *
 * Returns the inode number for the given entry.
 *
 * For directories with dynamic entries, a readdir callback must be provided.
 * This is significantly more complex, thanks to the particulars of
 * VOP_READDIR().
 *
 *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 *	    offset_t *off, offset_t *nextoff, void *data, int flags)
 *
 *	vp	- directory vnode
 *	dp	- directory entry, sized according to maxlen given to
 *		  gfs_dir_create().  callback must fill in d_name and
 *		  d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
 *		  (if an edirent_t).  edirent_t is used if V_RDDIR_ENTFLAGS
 *		  is set in 'flags'.
 *	eofp	- callback must set to 1 when EOF has been reached
 *	off	- on entry, the last offset read from the directory.  Callback
 *		  must set to the offset of the current entry, typically left
 *		  untouched.
 *	nextoff	- callback must set to offset of next entry.  Typically
 *		  (off + 1)
 *	data	- caller-supplied data
 *	flags	- VOP_READDIR flags
 *
 * Return 0 on success, or error on failure.
 */
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
    caller_context_t *ct, int flags)
{
	gfs_readdir_state_t gstate;
	int error, eof = 0;
	ino64_t ino, pino;
	offset_t off, next;
	gfs_dir_t *dp = dvp->v_data;

	error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
	if (error)
		return (error);

	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
	    pino, ino, flags)) != 0)
		return (error);

	while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
	    !eof) {

		if (off >= 0 && off < dp->gfsd_nstatic) {
			ino = dp->gfsd_inode(dvp, off);

			if ((error = gfs_readdir_emit(&gstate, uiop,
			    off, ino, dp->gfsd_static[off].gfse_name, 0))
			    != 0)
				break;

		} else if (dp->gfsd_readdir) {
			off -= dp->gfsd_nstatic;

			if ((error = dp->gfsd_readdir(dvp,
			    gstate.grd_dirent, &eof, &off, &next,
			    data, flags)) != 0 || eof)
				break;

			off += dp->gfsd_nstatic + 2;
			next += dp->gfsd_nstatic + 2;

			if ((error = gfs_readdir_emit_int(&gstate, uiop,
			    next)) != 0)
				break;
		} else {
			/*
			 * Offset is beyond the end of the static entries, and
			 * we have no dynamic entries.  Set EOF.
			 */
			eof = 1;
		}
	}

	return (gfs_readdir_fini(&gstate, error, eofp, eof));
}
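
/*
 * A minimal pair of callbacks for the hypothetical myfs above might look
 * as follows (a sketch assuming V_RDDIR_ENTFLAGS is not set, so entries
 * are dirent64_t; the MYFS_INO_* encoding and myfs_dynamic_* helpers are
 * invented):
 *
 *	static ino64_t
 *	myfs_inode_cb(vnode_t *vp, int index)
 *	{
 *		return (MYFS_INO_STATIC(index));
 *	}
 *
 *	static int
 *	myfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 *	    offset_t *offp, offset_t *nextp, void *data, int flags)
 *	{
 *		dirent64_t *dep = dp;
 *
 *		if (*offp >= myfs_dynamic_count(vp)) {
 *			*eofp = 1;
 *			return (0);
 *		}
 *		(void) strlcpy(dep->d_name, myfs_dynamic_name(vp, *offp),
 *		    MAXNAMELEN);
 *		dep->d_ino = MYFS_INO_DYNAMIC(*offp);
 *		*nextp = *offp + 1;
 *		return (0);
 *	}
 *
 * The entry buffer passed as 'dp' is sized according to the maxlen given
 * to gfs_dir_create(), so the name copy must respect that bound.
 */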

/*
 * gfs_vop_lookup: VOP_LOOKUP() entry point
 *
 * For use directly in vnode ops table.  Given a GFS directory, calls
 * gfs_dir_lookup() as necessary.
 */
/* ARGSUSED */
int
gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
}

/*
 * gfs_vop_readdir: VOP_READDIR() entry point
 *
 * For use directly in vnode ops table.  Given a GFS directory, calls
 * gfs_dir_readdir() as necessary.
 */
/* ARGSUSED */
int
gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
}


/*
 * gfs_vop_map: VOP_MAP() entry point
 *
 * Convenient routine for handling pseudo-files that wish to allow mmap()
 * calls.  This function only works for read-only files, and uses the read
 * function for the vnode to fill in the data.  The mapped data is immediately
 * faulted in and filled with the necessary data during this call; there are
 * no getpage() or putpage() routines.
 */
/* ARGSUSED */
int
gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
    caller_context_t *ct)
{
	int rv;
	ssize_t resid = len;

	/*
	 * Check for bad parameters
	 */
#ifdef _ILP32
	if (len > MAXOFF_T)
		return (ENOMEM);
#endif
	if (vp->v_flag & VNOMAP)
		return (ENOTSUP);
	if (off > MAXOFF_T)
		return (EFBIG);
	if ((long)off < 0 || (long)(off + len) < 0)
		return (EINVAL);
	if (vp->v_type != VREG)
		return (ENODEV);
	if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
		return (EACCES);

	/*
	 * Find appropriate address if needed, otherwise clear address range.
	 */
	as_rangelock(as);
	rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (rv != 0) {
		as_rangeunlock(as);
		return (rv);
	}

	/*
	 * Create mapping
	 */
	rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
	as_rangeunlock(as);
	if (rv != 0)
		return (rv);

	/*
	 * Fill with data from read()
	 */
	rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
	    0, (rlim64_t)0, cred, &resid);

	if (rv == 0 && resid != 0)
		rv = ENXIO;

	if (rv != 0) {
		as_rangelock(as);
		(void) as_unmap(as, *addrp, len);
		as_rangeunlock(as);
	}

	return (rv);
}

/*
 * gfs_vop_inactive: VOP_INACTIVE() entry point
 *
 * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
 * gfs_dir_inactive() as necessary, and free the associated private data with
 * kmem_free().
 */
/* ARGSUSED */
void
gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	gfs_file_t *fp = vp->v_data;
	void *data;

	if (fp->gfs_type == GFS_DIR)
		data = gfs_dir_inactive(vp);
	else
		data = gfs_file_inactive(vp);

	if (data != NULL)
		kmem_free(data, fp->gfs_size);
}