/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/* Portions Copyright 2007 Shivakumar GN */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/dirent.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/vfs.h>
#include <sys/vnode.h>

#include <vm/as.h>
#include <vm/seg_vn.h>

#include <sys/gfs.h>

/*
 * Generic pseudo-filesystem routines.
 *
 * There are significant similarities between the implementation of certain
 * filesystem entry points across different filesystems.  While one could
 * attempt to "choke up on the bat" and incorporate common functionality into
 * a VOP preamble or postamble, such an approach is limited in the benefit it
 * can provide.  In this file we instead define a toolkit of routines which
 * can be called from a filesystem (with in-kernel pseudo-filesystems being
 * the focus of the exercise) in a more component-like fashion.
 *
 * There are three basic classes of routines:
 *
 * 1) Low-level support routines
 *
 *    These routines are designed to play a support role for existing
 *    pseudo-filesystems (such as procfs).  They simplify common tasks,
 *    without forcing the filesystem to hand over management to GFS.  The
 *    routines covered are:
 *
 *	gfs_readdir_init()
 *	gfs_readdir_emit()
 *	gfs_readdir_emitn()
 *	gfs_readdir_pred()
 *	gfs_readdir_fini()
 *	gfs_lookup_dot()
 *
 * 2) Complete GFS management
 *
 *    These routines take a more active role in management of the
 *    pseudo-filesystem.  They handle the relationship between vnode private
 *    data and VFS data, as well as the relationship between vnodes in the
 *    directory hierarchy.
 *
 *    In order to use these interfaces, the first member of every private
 *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
 *    to GFS.
 *
 *	gfs_file_create()
 *	gfs_dir_create()
 *	gfs_root_create()
 *
 *	gfs_file_inactive()
 *	gfs_dir_inactive()
 *	gfs_dir_lookup()
 *	gfs_dir_readdir()
 *
 *	gfs_vop_inactive()
 *	gfs_vop_lookup()
 *	gfs_vop_readdir()
 *	gfs_vop_map()
 *
 * 3) Single File pseudo-filesystems
 *
 *    This routine creates a rooted file to be overlaid on top of another
 *    file in the physical filespace.
 *
 *    Note that the parent is NULL (actually the vfs), but there is nothing
 *    technically keeping such a file from utilizing the "Complete GFS
 *    management" set of routines.
 *
 *	gfs_root_create_file()
 */
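/*
 * Illustrative only: a sketch of a consumer's private data structure for
 * use with the "Complete GFS management" routines.  The structure and field
 * names below are hypothetical; the one hard requirement, as noted above,
 * is that the gfs_file_t (or gfs_dir_t, for directories) be the first
 * member, so that GFS may treat v_data as its own bookkeeping structure:
 *
 *	typedef struct mypseudo_node {
 *		gfs_file_t	mn_gfs;		(must be the first member)
 *		uint_t		mn_flags;	(consumer fields follow)
 *	} mypseudo_node_t;
 *
 *	vnode_t *vp = gfs_file_create(sizeof (mypseudo_node_t), pvp, ops);
 *	mypseudo_node_t *np = vp->v_data;
 */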

/*
 * gfs_make_opsvec: take an array of vnode type definitions and create
 * their vnodeops_t structures
 *
 * This routine takes an array of gfs_opsvec_t's.  It could
 * alternatively take an array of gfs_opsvec_t*'s, which would allow
 * vnode types to be completely defined in files external to the caller
 * of gfs_make_opsvec().  As it stands, much more sharing takes place --
 * both the caller and the vnode type provider need to access gfsv_ops
 * and gfsv_template, and the caller also needs to know gfsv_name.
 */
int
gfs_make_opsvec(gfs_opsvec_t *vec)
{
	int error, i;

	for (i = 0; ; i++) {
		if (vec[i].gfsv_name == NULL)
			return (0);
		error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
		    vec[i].gfsv_ops);
		if (error)
			break;
	}

	cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
	    vec[i].gfsv_name);
	for (i--; i >= 0; i--) {
		vn_freevnodeops(*vec[i].gfsv_ops);
		*vec[i].gfsv_ops = NULL;
	}
	return (error);
}
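/*
 * A minimal sketch of the caller's side, assuming the usual pattern of a
 * NULL gfsv_name terminating the array.  The template and ops names are
 * hypothetical:
 *
 *	static vnodeops_t *myfs_dir_ops;
 *	static const fs_operation_def_t myfs_dir_template[] = { ... };
 *
 *	static gfs_opsvec_t myfs_opsvec[] = {
 *		{ "myfs_dir", myfs_dir_template, &myfs_dir_ops },
 *		{ NULL }
 *	};
 *
 *	if ((error = gfs_make_opsvec(myfs_opsvec)) != 0)
 *		return (error);
 *
 * On failure, gfs_make_opsvec() frees any vnodeops_t structures it has
 * already created, so no caller-side cleanup is needed.
 */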

/*
 * Low level directory routines
 *
 * These routines provide some simple abstractions for reading directories.
 * They are designed to be used by existing pseudo filesystems (namely procfs)
 * that already have a complicated management infrastructure.
 */

/*
 * gfs_get_parent_ino: used to obtain a parent inode number and the
 * inode number of the given vnode in preparation for calling gfs_readdir_init.
 */
int
gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
    ino64_t *pino, ino64_t *ino)
{
	vnode_t *parent;
	gfs_dir_t *dp = dvp->v_data;
	int error;

	*ino = dp->gfsd_file.gfs_ino;
	parent = dp->gfsd_file.gfs_parent;

	if (parent == NULL) {
		*pino = *ino;		/* root of filesystem */
	} else if (dvp->v_flag & V_XATTRDIR) {
		vattr_t va;

		va.va_mask = AT_NODEID;
		error = VOP_GETATTR(parent, &va, 0, cr, ct);
		if (error)
			return (error);
		*pino = va.va_nodeid;
	} else {
		*pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
	}

	return (0);
}

/*
 * gfs_readdir_init: initiate a generic readdir
 *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
 *   name_max	- the directory's maximum file name length
 *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
 *   uiop	- the uiop passed to readdir
 *   parent	- the parent directory's inode
 *   self	- this directory's inode
 *   flags	- flags from VOP_READDIR
 *
 * Returns 0 or a non-zero errno.
 *
 * Typical VOP_READDIR usage of gfs_readdir_*:
 *
 *	if ((error = gfs_readdir_init(...)) != 0)
 *		return (error);
 *	eof = 0;
 *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
 *		if (!consumer_entry_at(voffset))
 *			voffset = consumer_next_entry(voffset);
 *		if (consumer_eof(voffset)) {
 *			eof = 1;
 *			break;
 *		}
 *		if ((error = gfs_readdir_emit(..., voffset,
 *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
 *			break;
 *	}
 *	return (gfs_readdir_fini(..., error, eofp, eof));
 *
 * As you can see, a zero result from gfs_readdir_pred() or
 * gfs_readdir_emit() indicates that processing should continue,
 * whereas a non-zero result indicates that the loop should terminate.
 * Most consumers need do nothing more than let gfs_readdir_fini()
 * determine what the cause of failure was and return the appropriate
 * value.
 */
int
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
    uio_t *uiop, ino64_t parent, ino64_t self, int flags)
{
	size_t dirent_size;

	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
	    (uiop->uio_loffset % ureclen) != 0)
		return (EINVAL);

	st->grd_ureclen = ureclen;
	st->grd_oresid = uiop->uio_resid;
	st->grd_namlen = name_max;
	if (flags & V_RDDIR_ENTFLAGS)
		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
	else
		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
	st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
	st->grd_parent = parent;
	st->grd_self = self;
	st->grd_flags = flags;

	return (0);
}
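/*
 * Offset bookkeeping, worked through for the common ureclen == 1 case:
 * uio_loffset values 0 and 1 map to "." and ".." (emitted by
 * gfs_readdir_pred() itself), so the first consumer entry lives at
 * uio_loffset 2, which gfs_readdir_pred() reports as virtual offset 0.
 * Conversely, gfs_readdir_emit() turns virtual offset voff back into
 * physical offset (voff + 2) * ureclen, and advances uio_loffset one
 * record beyond that.
 */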

/*
 * gfs_readdir_emit_int: internal routine to emit directory entry
 *
 *   st		- the current readdir state, which must have d_ino/ed_ino
 *		  and d_name/ed_name set
 *   uiop	- caller-supplied uio pointer
 *   next	- the offset of the next entry
 */
static int
gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
{
	int reclen;
	dirent64_t *dp;
	edirent_t *edp;

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp = st->grd_dirent;
		reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
	} else {
		dp = st->grd_dirent;
		reclen = DIRENT64_RECLEN(strlen(dp->d_name));
	}

	if (reclen > uiop->uio_resid) {
		/*
		 * Error if no entries were returned yet
		 */
		if (uiop->uio_resid == st->grd_oresid)
			return (EINVAL);
		return (-1);
	}

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edp->ed_off = next;
		edp->ed_reclen = (ushort_t)reclen;
	} else {
		dp->d_off = next;
		dp->d_reclen = (ushort_t)reclen;
	}

	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
		return (EFAULT);

	uiop->uio_loffset = next;

	return (0);
}

/*
 * gfs_readdir_emit: emit a directory entry
 *   voff	- the virtual offset (obtained from gfs_readdir_pred)
 *   ino	- the entry's inode
 *   name	- the entry's name
 *   eflags	- value for ed_eflags (if processing edirent_t)
 *
 * Returns 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate.  A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
 */
int
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, const char *name, int eflags)
{
	offset_t off = (voff + 2) * st->grd_ureclen;

	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
		edirent_t *edp = st->grd_dirent;

		edp->ed_ino = ino;
		(void) strncpy(edp->ed_name, name, st->grd_namlen);
		edp->ed_eflags = eflags;
	} else {
		dirent64_t *dp = st->grd_dirent;

		dp->d_ino = ino;
		(void) strncpy(dp->d_name, name, st->grd_namlen);
	}

	/*
	 * Inter-entry offsets are invalid, so we assume a record size of
	 * grd_ureclen and explicitly set the offset appropriately.
	 */
	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
}

/*
 * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
 * instead of a string for the entry's name.
 */
int
gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, unsigned long num)
{
	char buf[40];

	numtos(num, buf);
	return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
}
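/*
 * gfs_readdir_emitn() suits directories whose entries are naturally
 * numbered, procfs-style.  A hypothetical loop over a table of process
 * IDs (pids, npids, and pid_to_ino() are all illustrative):
 *
 *	while ((error = gfs_readdir_pred(&st, uiop, &voff)) == 0) {
 *		if (voff >= npids) {
 *			eof = 1;
 *			break;
 *		}
 *		if ((error = gfs_readdir_emitn(&st, uiop, voff,
 *		    pid_to_ino(pids[voff]), pids[voff])) != 0)
 *			break;
 *	}
 *
 * Each entry's name is the decimal representation of its pid.
 */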

/*
 * gfs_readdir_pred: readdir loop predicate
 *   voffp	- a pointer in which the next virtual offset should be stored
 *
 * Returns 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate.  A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
 */
int
gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
{
	offset_t off, voff;
	int error;

top:
	if (uiop->uio_resid <= 0)
		return (-1);

	off = uiop->uio_loffset / st->grd_ureclen;
	voff = off - 2;
	if (off == 0) {
		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
		    ".", 0)) == 0)
			goto top;
	} else if (off == 1) {
		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
		    "..", 0)) == 0)
			goto top;
	} else {
		*voffp = voff;
		return (0);
	}

	return (error);
}

/*
 * gfs_readdir_fini: generic readdir cleanup
 *   error	- if positive, an error to return
 *   eofp	- the eofp passed to readdir
 *   eof	- the eof value
 *
 * Returns 0 on success, a non-zero errno on failure.  This result
 * should be returned from readdir.
 */
int
gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
{
	size_t dirent_size;

	if (st->grd_flags & V_RDDIR_ENTFLAGS)
		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
	else
		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
	kmem_free(st->grd_dirent, dirent_size);
	if (error > 0)
		return (error);
	if (eofp)
		*eofp = eof;
	return (0);
}

/*
 * gfs_lookup_dot
 *
 * Performs a basic check for "." and ".." directory entries.  Returns 0
 * (with a held vnode in *vpp) if 'nm' is one of the two, -1 otherwise.
 */
int
gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
{
	if (*nm == '\0' || strcmp(nm, ".") == 0) {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	} else if (strcmp(nm, "..") == 0) {
		if (pvp == NULL) {
			ASSERT(dvp->v_flag & VROOT);
			VN_HOLD(dvp);
			*vpp = dvp;
		} else {
			VN_HOLD(pvp);
			*vpp = pvp;
		}
		return (0);
	}

	return (-1);
}
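/*
 * A typical consumer VOP_LOOKUP preamble, mirroring the way gfs_dir_lookup()
 * below uses gfs_lookup_dot().  Names other than gfs_lookup_dot() itself are
 * hypothetical:
 *
 *	static int
 *	myfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, ...)
 *	{
 *		if (gfs_lookup_dot(vpp, dvp, myfs_parent_of(dvp), nm) == 0)
 *			return (0);
 *		... handle the remaining names ...
 *	}
 */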

/*
 * gfs_file_create(): create a new GFS file
 *
 *   size	- size of private data structure (v_data)
 *   pvp	- parent vnode (GFS directory)
 *   ops	- vnode operations vector
 *
 * In order to use this interface, the parent vnode must have been created by
 * gfs_dir_create(), and the private data stored in v_data must have a
 * 'gfs_file_t' as its first field.
 *
 * Given these constraints, this routine will automatically:
 *
 *	- Allocate v_data for the vnode
 *	- Initialize necessary fields in the vnode
 *	- Hold the parent
 */
vnode_t *
gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops)
{
	gfs_file_t *fp;
	vnode_t *vp;

	/*
	 * Allocate vnode and internal data structure
	 */
	fp = kmem_zalloc(size, KM_SLEEP);
	vp = vn_alloc(KM_SLEEP);

	/*
	 * Set up various pointers
	 */
	fp->gfs_vnode = vp;
	fp->gfs_parent = pvp;
	vp->v_data = fp;
	fp->gfs_size = size;
	fp->gfs_type = GFS_FILE;

	/*
	 * Initialize vnode and hold parent.
	 */
	vn_setops(vp, ops);
	if (pvp) {
		VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
		VN_HOLD(pvp);
	}

	return (vp);
}

/*
 * gfs_dir_create: creates a new directory in the parent
 *
 *   struct_size	- size of private data structure (v_data)
 *   pvp		- parent vnode (GFS directory)
 *   ops		- vnode operations vector
 *   entries		- NULL-terminated list of static entries (if any)
 *   inode_cb		- inode callback (see gfs_dir_readdir)
 *   maxlen		- maximum length of a directory entry
 *   readdir_cb		- readdir callback (see gfs_dir_readdir)
 *   lookup_cb		- lookup callback (see gfs_dir_lookup)
 *
 * In order to use this function, the first member of the private vnode
 * structure (v_data) must be a gfs_dir_t.  For each directory, there are
 * static entries, defined when the structure is initialized, and dynamic
 * entries, retrieved through callbacks.
 *
 * If a directory has static entries, then it must supply an inode callback,
 * which will compute the inode number based on the parent and the index.
 * For a directory with dynamic entries, the caller must supply a readdir
 * callback and a lookup callback.  If a static lookup fails, we fall back to
 * the supplied lookup callback, if any.
 *
 * This function also performs the same initialization as gfs_file_create().
 * A usage sketch follows the function body.
 */
vnode_t *
gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
{
	vnode_t *vp;
	gfs_dir_t *dp;
	gfs_dirent_t *de;

	vp = gfs_file_create(struct_size, pvp, ops);
	vp->v_type = VDIR;

	dp = vp->v_data;
	dp->gfsd_file.gfs_type = GFS_DIR;
	dp->gfsd_maxlen = maxlen;

	if (entries != NULL) {
		for (de = entries; de->gfse_name != NULL; de++)
			dp->gfsd_nstatic++;

		dp->gfsd_static = kmem_alloc(
		    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
		bcopy(entries, dp->gfsd_static,
		    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
	}

	dp->gfsd_readdir = readdir_cb;
	dp->gfsd_lookup = lookup_cb;
	dp->gfsd_inode = inode_cb;

	mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);

	return (vp);
}
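/*
 * Usage sketch (all names hypothetical; the gfs_dirent_t initializer style
 * assumes gfse_name, gfse_ctor, and gfse_flags are its leading fields, as
 * the lookup code below suggests):
 *
 *	static gfs_dirent_t myfs_dir_entries[] = {
 *		{ "info", myfs_info_ctor, GFS_CACHE_VNODE },
 *		{ "status", myfs_status_ctor, GFS_CACHE_VNODE },
 *		{ NULL }
 *	};
 *
 *	vp = gfs_dir_create(sizeof (myfs_dir_t), pvp, myfs_dir_ops,
 *	    myfs_dir_entries, myfs_inode_cb, MAXNAMELEN, NULL, NULL);
 *
 * Here all entries are static, so no readdir or lookup callback is given.
 */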

/*
 * gfs_root_create(): create a root vnode for a GFS filesystem
 *
 * Similar to gfs_dir_create(), this creates a root vnode for a filesystem.
 * The only difference is that it takes a vfs_t instead of a vnode_t as its
 * parent.
 */
vnode_t *
gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
{
	vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
	    maxlen, readdir_cb, lookup_cb);

	/* Manually set the inode */
	((gfs_file_t *)vp->v_data)->gfs_ino = ino;

	VFS_HOLD(vfsp);
	VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;

	return (vp);
}

/*
 * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
 *
 * Similar to gfs_root_create(), this creates a root vnode for a file to
 * be the pseudo-filesystem.
 */
vnode_t *
gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
{
	vnode_t *vp = gfs_file_create(size, NULL, ops);

	((gfs_file_t *)vp->v_data)->gfs_ino = ino;

	VFS_HOLD(vfsp);
	VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;

	return (vp);
}
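/*
 * A filesystem's mount routine is the natural caller of gfs_root_create().
 * A hypothetical sketch, reusing the entries and callbacks from the
 * gfs_dir_create() example above:
 *
 *	static int
 *	myfs_mount(vfs_t *vfsp, ...)
 *	{
 *		vnode_t *rvp;
 *
 *		rvp = gfs_root_create(sizeof (myfs_dir_t), vfsp,
 *		    myfs_dir_ops, MYFS_INO_ROOT, myfs_dir_entries,
 *		    myfs_inode_cb, MAXNAMELEN, NULL, NULL);
 *		vfsp->vfs_data = rvp;
 *		...
 *	}
 *
 * gfs_root_create() takes its own VFS_HOLD(); that hold is released by
 * gfs_file_inactive() when the root vnode finally goes away.
 */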

/*
 * gfs_file_inactive()
 *
 * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
 * remove the given vnode from the parent directory and clean up any references
 * in the VFS layer.
 *
 * If the vnode was not removed (due to a race with vget), then NULL is
 * returned.  Otherwise, a pointer to the private data is returned.
 */
void *
gfs_file_inactive(vnode_t *vp)
{
	int i;
	gfs_dirent_t *ge = NULL;
	gfs_file_t *fp = vp->v_data;
	gfs_dir_t *dp = NULL;
	void *data;

	if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
		goto found;

	dp = fp->gfs_parent->v_data;

	/*
	 * First, see if this vnode is cached in the parent.
	 */
	gfs_dir_lock(dp);

	/*
	 * Find it in the set of static entries.
	 */
	for (i = 0; i < dp->gfsd_nstatic; i++) {
		ge = &dp->gfsd_static[i];

		if (ge->gfse_vnode == vp)
			goto found;
	}

	/*
	 * If 'ge' is NULL, then it is a dynamic entry.
	 */
	ge = NULL;

found:
	if (vp->v_flag & V_XATTRDIR) {
		mutex_enter(&fp->gfs_parent->v_lock);
	}
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		/*
		 * Really remove this vnode
		 */
		data = vp->v_data;
		if (ge != NULL) {
			/*
			 * If this was a statically cached entry, simply set the
			 * cached vnode to NULL.
			 */
			ge->gfse_vnode = NULL;
		}
		if (vp->v_flag & V_XATTRDIR) {
			fp->gfs_parent->v_xattrdir = NULL;
			mutex_exit(&fp->gfs_parent->v_lock);
		}
		mutex_exit(&vp->v_lock);

		/*
		 * Free vnode and release parent
		 */
		if (fp->gfs_parent) {
			if (dp) {
				gfs_dir_unlock(dp);
			}
			VN_RELE(fp->gfs_parent);
		} else {
			ASSERT(vp->v_vfsp != NULL);
			VFS_RELE(vp->v_vfsp);
		}
		vn_free(vp);
	} else {
		vp->v_count--;
		data = NULL;
		mutex_exit(&vp->v_lock);
		if (vp->v_flag & V_XATTRDIR) {
			mutex_exit(&fp->gfs_parent->v_lock);
		}
		if (dp)
			gfs_dir_unlock(dp);
	}

	return (data);
}

/*
 * gfs_dir_inactive()
 *
 * Same as above, but for directories.
 */
void *
gfs_dir_inactive(vnode_t *vp)
{
	gfs_dir_t *dp;

	ASSERT(vp->v_type == VDIR);

	if ((dp = gfs_file_inactive(vp)) != NULL) {
		mutex_destroy(&dp->gfsd_lock);
		if (dp->gfsd_nstatic)
			kmem_free(dp->gfsd_static,
			    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
	}

	return (dp);
}

/*
 * gfs_dir_lookup()
 *
 * Looks up the given name in the directory and returns the corresponding
 * vnode, if found.
 *
 * First, we search statically defined entries, if any.  If a match is found,
 * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
 * existing vnode.  Otherwise, we call the static entry's callback routine,
 * caching the result if necessary.
 *
 * If no static entry is found, we invoke the lookup callback, if any.  The
 * arguments to this callback are:
 *
 *	int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
 *	    ino64_t *inop, cred_t *cr);
 *
 *	pvp	- parent vnode
 *	nm	- name of entry
 *	vpp	- pointer to resulting vnode
 *	inop	- pointer to resulting inode number
 *	cr	- pointer to cred
 *
 * Returns 0 on success, non-zero on error.
 */
int
gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr)
{
	int i;
	gfs_dirent_t *ge;
	vnode_t *vp;
	gfs_dir_t *dp = dvp->v_data;
	int ret = 0;

	ASSERT(dvp->v_type == VDIR);

	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
		return (0);

	gfs_dir_lock(dp);

	/*
	 * Search static entries.
	 */
	for (i = 0; i < dp->gfsd_nstatic; i++) {
		ge = &dp->gfsd_static[i];

		if (strcmp(ge->gfse_name, nm) == 0) {
			if (ge->gfse_vnode) {
				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
				vp = ge->gfse_vnode;
				VN_HOLD(vp);
				goto out;
			}

			/*
			 * We drop the directory lock, as the constructor will
			 * need to do KM_SLEEP allocations.  If we return from
			 * the constructor only to find that a parallel
			 * operation has completed, and GFS_CACHE_VNODE is set
			 * for this entry, we discard the result in favor of
			 * the cached vnode.
			 */
			gfs_dir_unlock(dp);
			vp = ge->gfse_ctor(dvp);
			gfs_dir_lock(dp);

			((gfs_file_t *)vp->v_data)->gfs_index = i;

			/* Set the inode according to the callback. */
			((gfs_file_t *)vp->v_data)->gfs_ino =
			    dp->gfsd_inode(dvp, i);

			if (ge->gfse_flags & GFS_CACHE_VNODE) {
				if (ge->gfse_vnode == NULL) {
					ge->gfse_vnode = vp;
				} else {
					/*
					 * A parallel constructor beat us to
					 * it; return existing vnode.  We have
					 * to be careful because we can't
					 * release the current vnode while
					 * holding the directory lock; its
					 * inactive routine will try to lock
					 * this directory.
					 */
					vnode_t *oldvp = vp;
					vp = ge->gfse_vnode;
					VN_HOLD(vp);

					gfs_dir_unlock(dp);
					VN_RELE(oldvp);
					gfs_dir_lock(dp);
				}
			}

			goto out;
		}
	}

	/*
	 * See if there is a dynamic constructor.
	 */
	if (dp->gfsd_lookup) {
		ino64_t ino;
		gfs_file_t *fp;

		/*
		 * Once again, drop the directory lock, as the lookup routine
		 * will need to allocate memory, or otherwise deadlock on this
		 * directory.
		 */
		gfs_dir_unlock(dp);
		ret = dp->gfsd_lookup(dvp, nm, &vp, &ino, cr);
		gfs_dir_lock(dp);
		if (ret != 0)
			goto out;

		/*
		 * The lookup_cb might be returning a non-GFS vnode.
		 * Currently this is true for extended attributes,
		 * where we're returning a vnode with v_data from an
		 * underlying fs.
		 */
		if ((dvp->v_flag & V_XATTRDIR) == 0) {
			fp = (gfs_file_t *)vp->v_data;
			fp->gfs_index = -1;
			fp->gfs_ino = ino;
		}
	} else {
		/*
		 * No static entry found, and there is no lookup callback, so
		 * return ENOENT.
		 */
		ret = ENOENT;
	}

out:
	gfs_dir_unlock(dp);

	if (ret == 0)
		*vpp = vp;
	else
		*vpp = NULL;

	return (ret);
}
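/*
 * A minimal dynamic lookup callback sketch (names hypothetical).  The
 * callback constructs (or finds) the vnode for 'nm' and reports its inode
 * number; gfs_dir_lookup() fills in gfs_index and gfs_ino afterwards:
 *
 *	static int
 *	myfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
 *	    ino64_t *inop, cred_t *cr)
 *	{
 *		if (!myfs_name_exists(nm))
 *			return (ENOENT);
 *		*vpp = myfs_node_create(pvp, nm);
 *		*inop = myfs_name_to_ino(nm);
 *		return (0);
 *	}
 */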

/*
 * gfs_dir_readdir: does a readdir() on the given directory
 *
 *   dvp	- directory vnode
 *   uiop	- uio structure
 *   eofp	- eof pointer
 *   data	- arbitrary data passed to readdir callback
 *   cr		- caller's credentials
 *   ct		- caller context
 *   flags	- VOP_READDIR flags
 *
 * This routine does all the readdir() dirty work.  Even so, the caller must
 * supply two callbacks in order to get full compatibility.
 *
 * If the directory contains static entries, an inode callback must be
 * specified.  This avoids having to create every vnode and call VOP_GETATTR()
 * when reading the directory.  This function has the following arguments:
 *
 *	ino64_t gfs_inode_cb(vnode_t *vp, int index);
 *
 *	vp	- vnode for the directory
 *	index	- index in original gfs_dirent_t array
 *
 * Returns the inode number for the given entry.
 *
 * For directories with dynamic entries, a readdir callback must be provided.
 * This is significantly more complex, thanks to the particulars of
 * VOP_READDIR().
 *
 *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 *	    offset_t *off, offset_t *nextoff, void *data, int flags)
 *
 *	vp	- directory vnode
 *	dp	- directory entry, sized according to maxlen given to
 *		  gfs_dir_create().  callback must fill in d_name and
 *		  d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
 *		  (if an edirent_t).  edirent_t is used if V_RDDIR_ENTFLAGS
 *		  is set in 'flags'.
 *	eofp	- callback must set to 1 when EOF has been reached
 *	off	- on entry, the last offset read from the directory.  Callback
 *		  must set to the offset of the current entry, typically left
 *		  untouched.
 *	nextoff	- callback must set to offset of next entry.  Typically
 *		  (off + 1)
 *	data	- caller-supplied data
 *	flags	- VOP_READDIR flags
 *
 * Return 0 on success, or error on failure.  A sketch of such a callback
 * follows the function body.
 */
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
    caller_context_t *ct, int flags)
{
	gfs_readdir_state_t gstate;
	int error, eof = 0;
	ino64_t ino, pino;
	offset_t off, next;
	gfs_dir_t *dp = dvp->v_data;

	error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
	if (error)
		return (error);

	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
	    pino, ino, flags)) != 0)
		return (error);

	while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
	    !eof) {

		if (off >= 0 && off < dp->gfsd_nstatic) {
			ino = dp->gfsd_inode(dvp, off);

			if ((error = gfs_readdir_emit(&gstate, uiop,
			    off, ino, dp->gfsd_static[off].gfse_name, 0))
			    != 0)
				break;

		} else if (dp->gfsd_readdir) {
			off -= dp->gfsd_nstatic;

			if ((error = dp->gfsd_readdir(dvp,
			    gstate.grd_dirent, &eof, &off, &next,
			    data, flags)) != 0 || eof)
				break;

			off += dp->gfsd_nstatic + 2;
			next += dp->gfsd_nstatic + 2;

			if ((error = gfs_readdir_emit_int(&gstate, uiop,
			    next)) != 0)
				break;
		} else {
			/*
			 * Offset is beyond the end of the static entries, and
			 * we have no dynamic entries.  Set EOF.
			 */
			eof = 1;
		}
	}

	return (gfs_readdir_fini(&gstate, error, eofp, eof));
}
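/*
 * A minimal readdir callback sketch for a directory of 'myfs_count'
 * dynamically numbered entries (myfs_count and myfs_ino() are
 * hypothetical).  The callback only stages the dirent and the offsets;
 * gfs_dir_readdir() biases the offsets past the static entries and "."/".."
 * and then emits the record:
 *
 *	static int
 *	myfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 *	    offset_t *offp, offset_t *nextp, void *data, int flags)
 *	{
 *		dirent64_t *dep = dp;
 *
 *		if (*offp >= myfs_count) {
 *			*eofp = 1;
 *			return (0);
 *		}
 *		dep->d_ino = myfs_ino(*offp);
 *		(void) snprintf(dep->d_name, MAXNAMELEN, "%lld",
 *		    (longlong_t)*offp);
 *		*nextp = *offp + 1;
 *		return (0);
 *	}
 *
 * This sketch ignores V_RDDIR_ENTFLAGS; a complete callback would fill in
 * an edirent_t when that flag is set.
 */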

/*
 * gfs_vop_lookup: VOP_LOOKUP() entry point
 *
 * For use directly in vnode ops table.  Given a GFS directory, calls
 * gfs_dir_lookup() as necessary.
 */
/* ARGSUSED */
int
gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	return (gfs_dir_lookup(dvp, nm, vpp, cr));
}

/*
 * gfs_vop_readdir: VOP_READDIR() entry point
 *
 * For use directly in vnode ops table.  Given a GFS directory, calls
 * gfs_dir_readdir() as necessary.
 */
/* ARGSUSED */
int
gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
}

/*
 * gfs_vop_map: VOP_MAP() entry point
 *
 * Convenient routine for handling pseudo-files that wish to allow mmap()
 * calls.  This function only works for readonly files, and uses the read
 * function for the vnode to fill in the data.  The mapped data is
 * immediately faulted in and filled with the necessary data during this
 * call; there are no getpage() or putpage() routines.
 */
/* ARGSUSED */
int
gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
    caller_context_t *ct)
{
	int rv;
	ssize_t resid = len;

	/*
	 * Check for bad parameters
	 */
#ifdef _ILP32
	if (len > MAXOFF_T)
		return (ENOMEM);
#endif
	if (vp->v_flag & VNOMAP)
		return (ENOTSUP);
	if (off > MAXOFF_T)
		return (EFBIG);
	if ((long)off < 0 || (long)(off + len) < 0)
		return (EINVAL);
	if (vp->v_type != VREG)
		return (ENODEV);
	if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
		return (EACCES);

	/*
	 * Find appropriate address if needed, otherwise clear address range.
	 */
	as_rangelock(as);
	if ((flags & MAP_FIXED) == 0) {
		map_addr(addrp, len, (offset_t)off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		(void) as_unmap(as, *addrp, len);
	}

	/*
	 * Create mapping
	 */
	rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
	as_rangeunlock(as);
	if (rv != 0)
		return (rv);

	/*
	 * Fill with data from read()
	 */
	rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
	    0, (rlim64_t)0, cred, &resid);

	if (rv == 0 && resid != 0)
		rv = ENXIO;

	if (rv != 0) {
		as_rangelock(as);
		(void) as_unmap(as, *addrp, len);
		as_rangeunlock(as);
	}

	return (rv);
}

/*
 * gfs_vop_inactive: VOP_INACTIVE() entry point
 *
 * Given a vnode that is a GFS file or directory, calls gfs_file_inactive()
 * or gfs_dir_inactive() as necessary, and kmem_free()s the associated
 * private data.
 */
/* ARGSUSED */
void
gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	gfs_file_t *fp = vp->v_data;
	void *data;

	if (fp->gfs_type == GFS_DIR)
		data = gfs_dir_inactive(vp);
	else
		data = gfs_file_inactive(vp);

	if (data != NULL)
		kmem_free(data, fp->gfs_size);
}
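/*
 * The gfs_vop_*() entry points above are meant to be dropped directly into
 * a vnode ops template.  An illustrative fragment, assuming the usual
 * fs_operation_def_t initializer style:
 *
 *	static const fs_operation_def_t myfs_dir_template[] = {
 *		{ VOPNAME_LOOKUP, { .vop_lookup = gfs_vop_lookup } },
 *		{ VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
 *		{ VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
 *		{ NULL }
 *	};
 *
 * Such a template pairs with the gfs_opsvec_t array shown after
 * gfs_make_opsvec() near the top of this file.
 */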