/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/thread.h>
#include <sys/dnlc.h>
#include <sys/cred.h>
#include <sys/priv.h>
#include <sys/list.h>
#include <sys/sdt.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/xdr.h>

#include <nfs/nfs.h>

#include <nfs/nfs_clnt.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

/* utf8-checking variables */
#define	UTF8_TAIL_MASK		0xc0
#define	UTF8_TAIL_SIGNATURE	0x80
#define	UTF8_TAIL_SHIFT		6
#define	UTF16_SURROGATE_LOW	0xd800
#define	UTF16_SURROGATE_HIGH	0xdfff
#define	UNICODE_INVAL_1		0xfffe
#define	UNICODE_INVAL_2		0xffff

typedef struct {
	unsigned char mask;
	unsigned char signature;
	unsigned int min_val;
	unsigned char tail_bytes;
} utf8_encoding_table;

static utf8_encoding_table utf8_table[] = {
	{ 0x80, 0x00, 0x00000000, 0 },	// 1 byte
	{ 0xe0, 0xc0, 0x00000080, 1 },	// 2 bytes
	{ 0xf0, 0xe0, 0x00000800, 2 },	// 3 bytes
	{ 0xf8, 0xf0, 0x00010000, 3 },	// 4 bytes
	{ 0xfc, 0xf8, 0x00200000, 4 },	// 5 bytes
	{ 0xfe, 0xfc, 0x04000000, 5 },	// 6 bytes
	{ 0, 0, 0, 0 },
};
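
/*
 * Example (illustrative, not part of the original source): decoding the
 * two-byte sequence 0xc3 0xa9 ("é", U+00E9) with the table above.  The
 * lead byte matches the 2-byte row (0xc3 & 0xe0 == 0xc0), leaving the
 * payload bits 0xc3 & ~0xe0 == 0x03.  The tail byte passes the tail
 * check (0xa9 & 0xc0 == 0x80) and contributes 0xa9 & ~0xc0 == 0x29, so
 * the decoded symbol is (0x03 << 6) | 0x29 == 0xe9, which is >= the
 * row's min_val of 0x80 and therefore not an overlong encoding.
 */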

/*
 * client side statistics
 */
static const struct clstat4 clstat4_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "referrals",	KSTAT_DATA_UINT64 },
	{ "referlinks",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

#ifdef DEBUG
struct clstat4_debug clstat4_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs4_clnt_list;
static kmutex_t nfs4_clnt_list_lock;
zone_key_t nfs4clnt_zone_key;

static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
static int nfs4_rfscall_debug;
static int nfs4_try_failover_any;
int nfs4_utf8_debug = 0;
#endif

/*
 * NFSv4 readdir cache implementation
 */
typedef struct rddir4_cache_impl {
	rddir4_cache	rc;		/* readdir cache element */
	kmutex_t	lock;		/* lock protects count */
	uint_t		count;		/* reference count */
	avl_node_t	tree;		/* AVL tree link */
} rddir4_cache_impl;

static int rddir4_cache_compar(const void *, const void *);
static void rddir4_cache_free(rddir4_cache_impl *);
static rddir4_cache *rddir4_cache_alloc(int);
static void rddir4_cache_hold(rddir4_cache *);
static int try_failover(enum clnt_stat);

static int nfs4_readdir_cache_hits = 0;
static int nfs4_readdir_cache_waits = 0;
static int nfs4_readdir_cache_misses = 0;

/*
 * Shared nfs4 functions
 */

/*
 * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
 * be allocated.
 */
void
nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
{
	to->nfs_fh4_len = from->nfs_fh4_len;
	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
}

/*
 * nfs4cmpfh - compare 2 filehandles.
 * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
 * "less" than the second, +1 if the first is "greater" than the second.
 */
int
nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
{
	const char *c1, *c2;

	if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
		return (-1);
	if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
		return (1);
	for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
	    c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
	    c1++, c2++) {
		if (*c1 < *c2)
			return (-1);
		if (*c1 > *c2)
			return (1);
	}

	return (0);
}
203 */ 204 205 int 206 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2) 207 { 208 if (fh1->fh_len == fh2->fh_len) 209 return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len)); 210 211 return (1); 212 } 213 214 int 215 stateid4_cmp(stateid4 *s1, stateid4 *s2) 216 { 217 if (bcmp(s1, s2, sizeof (stateid4)) == 0) 218 return (1); 219 else 220 return (0); 221 } 222 223 nfsstat4 224 puterrno4(int error) 225 { 226 switch (error) { 227 case 0: 228 return (NFS4_OK); 229 case EPERM: 230 return (NFS4ERR_PERM); 231 case ENOENT: 232 return (NFS4ERR_NOENT); 233 case EINTR: 234 return (NFS4ERR_IO); 235 case EIO: 236 return (NFS4ERR_IO); 237 case ENXIO: 238 return (NFS4ERR_NXIO); 239 case ENOMEM: 240 return (NFS4ERR_RESOURCE); 241 case EACCES: 242 return (NFS4ERR_ACCESS); 243 case EBUSY: 244 return (NFS4ERR_IO); 245 case EEXIST: 246 return (NFS4ERR_EXIST); 247 case EXDEV: 248 return (NFS4ERR_XDEV); 249 case ENODEV: 250 return (NFS4ERR_IO); 251 case ENOTDIR: 252 return (NFS4ERR_NOTDIR); 253 case EISDIR: 254 return (NFS4ERR_ISDIR); 255 case EINVAL: 256 return (NFS4ERR_INVAL); 257 case EMFILE: 258 return (NFS4ERR_RESOURCE); 259 case EFBIG: 260 return (NFS4ERR_FBIG); 261 case ENOSPC: 262 return (NFS4ERR_NOSPC); 263 case EROFS: 264 return (NFS4ERR_ROFS); 265 case EMLINK: 266 return (NFS4ERR_MLINK); 267 case EDEADLK: 268 return (NFS4ERR_DEADLOCK); 269 case ENOLCK: 270 return (NFS4ERR_DENIED); 271 case EREMOTE: 272 return (NFS4ERR_SERVERFAULT); 273 case ENOTSUP: 274 return (NFS4ERR_NOTSUPP); 275 case EDQUOT: 276 return (NFS4ERR_DQUOT); 277 case ENAMETOOLONG: 278 return (NFS4ERR_NAMETOOLONG); 279 case EOVERFLOW: 280 return (NFS4ERR_INVAL); 281 case ENOSYS: 282 return (NFS4ERR_NOTSUPP); 283 case ENOTEMPTY: 284 return (NFS4ERR_NOTEMPTY); 285 case EOPNOTSUPP: 286 return (NFS4ERR_NOTSUPP); 287 case ESTALE: 288 return (NFS4ERR_STALE); 289 case EAGAIN: 290 if (curthread->t_flag & T_WOULDBLOCK) { 291 curthread->t_flag &= ~T_WOULDBLOCK; 292 return (NFS4ERR_DELAY); 293 } 294 return (NFS4ERR_LOCKED); 295 default: 296 return ((enum nfsstat4)error); 297 } 298 } 299 300 int 301 geterrno4(enum nfsstat4 status) 302 { 303 switch (status) { 304 case NFS4_OK: 305 return (0); 306 case NFS4ERR_PERM: 307 return (EPERM); 308 case NFS4ERR_NOENT: 309 return (ENOENT); 310 case NFS4ERR_IO: 311 return (EIO); 312 case NFS4ERR_NXIO: 313 return (ENXIO); 314 case NFS4ERR_ACCESS: 315 return (EACCES); 316 case NFS4ERR_EXIST: 317 return (EEXIST); 318 case NFS4ERR_XDEV: 319 return (EXDEV); 320 case NFS4ERR_NOTDIR: 321 return (ENOTDIR); 322 case NFS4ERR_ISDIR: 323 return (EISDIR); 324 case NFS4ERR_INVAL: 325 return (EINVAL); 326 case NFS4ERR_FBIG: 327 return (EFBIG); 328 case NFS4ERR_NOSPC: 329 return (ENOSPC); 330 case NFS4ERR_ROFS: 331 return (EROFS); 332 case NFS4ERR_MLINK: 333 return (EMLINK); 334 case NFS4ERR_NAMETOOLONG: 335 return (ENAMETOOLONG); 336 case NFS4ERR_NOTEMPTY: 337 return (ENOTEMPTY); 338 case NFS4ERR_DQUOT: 339 return (EDQUOT); 340 case NFS4ERR_STALE: 341 return (ESTALE); 342 case NFS4ERR_BADHANDLE: 343 return (ESTALE); 344 case NFS4ERR_BAD_COOKIE: 345 return (EINVAL); 346 case NFS4ERR_NOTSUPP: 347 return (EOPNOTSUPP); 348 case NFS4ERR_TOOSMALL: 349 return (EINVAL); 350 case NFS4ERR_SERVERFAULT: 351 return (EIO); 352 case NFS4ERR_BADTYPE: 353 return (EINVAL); 354 case NFS4ERR_DELAY: 355 return (ENXIO); 356 case NFS4ERR_SAME: 357 return (EPROTO); 358 case NFS4ERR_DENIED: 359 return (ENOLCK); 360 case NFS4ERR_EXPIRED: 361 return (EPROTO); 362 case NFS4ERR_LOCKED: 363 return (EACCES); 364 case NFS4ERR_GRACE: 365 return (EAGAIN); 

int
geterrno4(enum nfsstat4 status)
{
	switch (status) {
	case NFS4_OK:
		return (0);
	case NFS4ERR_PERM:
		return (EPERM);
	case NFS4ERR_NOENT:
		return (ENOENT);
	case NFS4ERR_IO:
		return (EIO);
	case NFS4ERR_NXIO:
		return (ENXIO);
	case NFS4ERR_ACCESS:
		return (EACCES);
	case NFS4ERR_EXIST:
		return (EEXIST);
	case NFS4ERR_XDEV:
		return (EXDEV);
	case NFS4ERR_NOTDIR:
		return (ENOTDIR);
	case NFS4ERR_ISDIR:
		return (EISDIR);
	case NFS4ERR_INVAL:
		return (EINVAL);
	case NFS4ERR_FBIG:
		return (EFBIG);
	case NFS4ERR_NOSPC:
		return (ENOSPC);
	case NFS4ERR_ROFS:
		return (EROFS);
	case NFS4ERR_MLINK:
		return (EMLINK);
	case NFS4ERR_NAMETOOLONG:
		return (ENAMETOOLONG);
	case NFS4ERR_NOTEMPTY:
		return (ENOTEMPTY);
	case NFS4ERR_DQUOT:
		return (EDQUOT);
	case NFS4ERR_STALE:
		return (ESTALE);
	case NFS4ERR_BADHANDLE:
		return (ESTALE);
	case NFS4ERR_BAD_COOKIE:
		return (EINVAL);
	case NFS4ERR_NOTSUPP:
		return (EOPNOTSUPP);
	case NFS4ERR_TOOSMALL:
		return (EINVAL);
	case NFS4ERR_SERVERFAULT:
		return (EIO);
	case NFS4ERR_BADTYPE:
		return (EINVAL);
	case NFS4ERR_DELAY:
		return (ENXIO);
	case NFS4ERR_SAME:
		return (EPROTO);
	case NFS4ERR_DENIED:
		return (ENOLCK);
	case NFS4ERR_EXPIRED:
		return (EPROTO);
	case NFS4ERR_LOCKED:
		return (EACCES);
	case NFS4ERR_GRACE:
		return (EAGAIN);
	case NFS4ERR_FHEXPIRED:	/* if we got here, we failed to get a new fh */
		return (ESTALE);
	case NFS4ERR_SHARE_DENIED:
		return (EACCES);
	case NFS4ERR_WRONGSEC:
		return (EPERM);
	case NFS4ERR_CLID_INUSE:
		return (EAGAIN);
	case NFS4ERR_RESOURCE:
		return (EAGAIN);
	case NFS4ERR_MOVED:
		return (EPROTO);
	case NFS4ERR_NOFILEHANDLE:
		return (EIO);
	case NFS4ERR_MINOR_VERS_MISMATCH:
		return (ENOTSUP);
	case NFS4ERR_STALE_CLIENTID:
		return (EIO);
	case NFS4ERR_STALE_STATEID:
		return (EIO);
	case NFS4ERR_OLD_STATEID:
		return (EIO);
	case NFS4ERR_BAD_STATEID:
		return (EIO);
	case NFS4ERR_BAD_SEQID:
		return (EIO);
	case NFS4ERR_NOT_SAME:
		return (EPROTO);
	case NFS4ERR_LOCK_RANGE:
		return (EPROTO);
	case NFS4ERR_SYMLINK:
		return (EPROTO);
	case NFS4ERR_RESTOREFH:
		return (EPROTO);
	case NFS4ERR_LEASE_MOVED:
		return (EPROTO);
	case NFS4ERR_ATTRNOTSUPP:
		return (ENOTSUP);
	case NFS4ERR_NO_GRACE:
		return (EPROTO);
	case NFS4ERR_RECLAIM_BAD:
		return (EPROTO);
	case NFS4ERR_RECLAIM_CONFLICT:
		return (EPROTO);
	case NFS4ERR_BADXDR:
		return (EINVAL);
	case NFS4ERR_LOCKS_HELD:
		return (EIO);
	case NFS4ERR_OPENMODE:
		return (EACCES);
	case NFS4ERR_BADOWNER:
		/*
		 * Client and server are in different DNS domains
		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
		 * doesn't match.  No good answer here.  Return
		 * EACCES, which translates to "permission denied".
		 */
		return (EACCES);
	case NFS4ERR_BADCHAR:
		return (EINVAL);
	case NFS4ERR_BADNAME:
		return (EINVAL);
	case NFS4ERR_BAD_RANGE:
		return (EIO);
	case NFS4ERR_LOCK_NOTSUPP:
		return (ENOTSUP);
	case NFS4ERR_OP_ILLEGAL:
		return (EINVAL);
	case NFS4ERR_DEADLOCK:
		return (EDEADLK);
	case NFS4ERR_FILE_OPEN:
		return (EACCES);
	case NFS4ERR_ADMIN_REVOKED:
		return (EPROTO);
	case NFS4ERR_CB_PATH_DOWN:
		return (EPROTO);
	default:
#ifdef DEBUG
		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
		    status);
#endif
		return ((int)status);
	}
}

void
nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
{
	nfs4_server_t *server;

	/*
	 * Return if we have already printed/queued a msg
	 * for this mount point.
	 */
	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
		return;
	/*
	 * Happens once per client <-> server pair.
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT))
		return;

	server = find_nfs4_server(mi);
	if (server == NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		return;
	}

	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "!NFSMAPID_DOMAIN does not match"
		    " the server: %s domain.\n"
		    "Please check configuration",
		    mi->mi_curr_serv->sv_hostname);
		server->s_flags |= N4S_BADOWNER_DEBUG;
	}
	mutex_exit(&server->s_lock);
	nfs4_server_rele(server);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Happens once per mntinfo4_t.
	 * This error is deemed one of the recovery facts ("RF_BADOWNER"),
	 * so queue it in the message queue for this mount_info.  The message
	 * is not printed, meaning it's absent from id_to_dump_solo_fact(),
	 * but it's there for inspection if the queue is ever dumped/inspected.
	 */
	mutex_enter(&mi->mi_lock);
	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
		    FALSE, NULL, 0, NULL);
		mi->mi_flags |= MI4_BADOWNER_DEBUG;
	}
	mutex_exit(&mi->mi_lock);
}

int
nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
{
	int64_t sec;
	int32_t nsec;

	/*
	 * Here check that the nfsv4 time is valid for the system.
	 * The nfsv4 time value is a signed 64-bit, and the system time
	 * may be either int64_t or int32_t (depending on the kernel),
	 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
	 */
#ifndef _LP64
	if (!NFS4_TIME_OK(ntime->seconds)) {
		return (EOVERFLOW);
	}
#endif

	/* Invalid to specify 1 billion (or more) nsecs */
	if (ntime->nseconds >= 1000000000)
		return (EINVAL);

	if (ntime->seconds < 0) {
		sec = ntime->seconds + 1;
		nsec = -1000000000 + ntime->nseconds;
	} else {
		sec = ntime->seconds;
		nsec = ntime->nseconds;
	}

	vatime->tv_sec = sec;
	vatime->tv_nsec = nsec;

	return (0);
}

int
nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
{
	int64_t sec;
	uint32_t nsec;

	/*
	 * The nfsv4 time value is a signed 64-bit, and the system time
	 * may be either int64_t or int32_t (depending on the kernel),
	 * so all system time values will fit.
	 */
	if (vatime->tv_nsec >= 0) {
		sec = vatime->tv_sec;
		nsec = vatime->tv_nsec;
	} else {
		sec = vatime->tv_sec - 1;
		nsec = 1000000000 + vatime->tv_nsec;
	}
	ntime->seconds = sec;
	ntime->nseconds = nsec;

	return (0);
}
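
/*
 * Example (illustrative, not part of the original source): a time of
 * -1.25 seconds arrives over the wire as seconds == -2,
 * nseconds == 750000000 (nseconds is always a non-negative count added
 * to seconds).  nfs4_time_ntov() converts that to the kernel
 * convention, where tv_nsec carries the sign:
 * sec = -2 + 1 = -1 and nsec = -1000000000 + 750000000 = -250000000,
 * i.e. tv_sec == -1, tv_nsec == -250000000.  nfs4_time_vton() applies
 * the inverse mapping.
 */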

/*
 * Converts a utf8 string to a valid null terminated filename string.
 *
 * XXX - Not actually translating the UTF-8 string as per RFC 2279.
 *	 For now, just validate that the UTF-8 string off the wire
 *	 does not have characters that will freak out UFS, and leave
 *	 it at that.
 */
char *
utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
{
	ASSERT(lenp != NULL);

	if (u8s == NULL || u8s->utf8string_len <= 0 ||
	    u8s->utf8string_val == NULL)
		return (NULL);

	/*
	 * Check for obvious illegal filename chars
	 */
	if (utf8_strchr(u8s, '/') != NULL) {
#ifdef DEBUG
		if (nfs4_utf8_debug) {
			char *path;
			int len = u8s->utf8string_len;

			path = kmem_alloc(len + 1, KM_SLEEP);
			bcopy(u8s->utf8string_val, path, len);
			path[len] = '\0';

			zcmn_err(getzoneid(), CE_WARN,
			    "Invalid UTF-8 filename: %s", path);

			kmem_free(path, len + 1);
		}
#endif
		return (NULL);
	}

	return (utf8_to_str(u8s, lenp, s));
}

/*
 * Converts a utf8 string to a C string.
 * kmem_allocs a new string if one is not supplied.
 */
char *
utf8_to_str(utf8string *str, uint_t *lenp, char *s)
{
	char *sp;
	char *u8p;
	int len;
	int i;

	ASSERT(lenp != NULL);

	if (str == NULL)
		return (NULL);

	u8p = str->utf8string_val;
	len = str->utf8string_len;
	if (len <= 0 || u8p == NULL) {
		if (s)
			*s = '\0';
		return (NULL);
	}

	sp = s;
	if (sp == NULL)
		sp = kmem_alloc(len + 1, KM_SLEEP);

	/*
	 * At least check for embedded nulls
	 */
	for (i = 0; i < len; i++) {
		sp[i] = u8p[i];
		if (u8p[i] == '\0') {
#ifdef DEBUG
			zcmn_err(getzoneid(), CE_WARN,
			    "Embedded NULL in UTF-8 string");
#endif
			if (s == NULL)
				kmem_free(sp, len + 1);
			return (NULL);
		}
	}
	sp[len] = '\0';
	*lenp = len + 1;

	return (sp);
}

/*
 * str_to_utf8 - converts a null-terminated C string to a utf8 string
 */
utf8string *
str_to_utf8(char *nm, utf8string *str)
{
	int len;

	if (str == NULL)
		return (NULL);

	if (nm == NULL || *nm == '\0') {
		str->utf8string_len = 0;
		str->utf8string_val = NULL;
		return (str);
	}

	len = strlen(nm);

	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
	str->utf8string_len = len;
	bcopy(nm, str->utf8string_val, len);

	return (str);
}

utf8string *
utf8_copy(utf8string *src, utf8string *dest)
{
	if (src == NULL)
		return (NULL);
	if (dest == NULL)
		return (NULL);

	if (src->utf8string_len > 0) {
		dest->utf8string_val = kmem_alloc(src->utf8string_len,
		    KM_SLEEP);
		bcopy(src->utf8string_val, dest->utf8string_val,
		    src->utf8string_len);
		dest->utf8string_len = src->utf8string_len;
	} else {
		dest->utf8string_val = NULL;
		dest->utf8string_len = 0;
	}

	return (dest);
}

int
utf8_compare(const utf8string *a, const utf8string *b)
{
	int mlen, cmp;
	int alen, blen;
	char *aval, *bval;

	if ((a == NULL) && (b == NULL))
		return (0);
	else if (a == NULL)
		return (-1);
	else if (b == NULL)
		return (1);

	alen = a->utf8string_len;
	blen = b->utf8string_len;
	aval = a->utf8string_val;
	bval = b->utf8string_val;

	if (((alen == 0) || (aval == NULL)) &&
	    ((blen == 0) || (bval == NULL)))
		return (0);
	else if ((alen == 0) || (aval == NULL))
		return (-1);
	else if ((blen == 0) || (bval == NULL))
		return (1);

	mlen = MIN(alen, blen);
	cmp = strncmp(aval, bval, mlen);

	if ((cmp == 0) && (alen == blen))
		return (0);
	else if ((cmp == 0) && (alen < blen))
		return (-1);
	else if (cmp == 0)
		return (1);
	else if (cmp < 0)
		return (-1);
	return (1);
}
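
/*
 * Example (illustrative, not part of the original source): a typical
 * round trip through the helpers above, assuming a kernel context
 * where kmem allocation may sleep:
 *
 *	utf8string u8;
 *	uint_t len;
 *	char *s;
 *
 *	(void) str_to_utf8("foo", &u8);		// allocates 3-byte value
 *	s = utf8_to_str(&u8, &len, NULL);	// s == "foo", len == 4
 *	kmem_free(s, len);
 *	kmem_free(u8.utf8string_val, u8.utf8string_len);
 *
 * Note that utf8_to_str() returns a NUL-terminated copy and *lenp
 * includes the terminator, while the utf8string itself is not
 * NUL-terminated.
 */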

/*
 * utf8_name_verify - verify utf8-correctness of the passed string.
 *
 * Each byte is checked by applying an and-mask to it and comparing the
 * result against the expected signature.  ~mask is used to extract the
 * payload bits from the byte; these are accumulated in 'symbol', which
 * represents the encoded unicode character.
 *
 * Symbols encoded with UTF8 have the following format:
 * 0xxxxxxx - 1 byte symbol
 * 110xxxxx 10xxxxxx - 2 bytes
 * 1110xxxx 10xxxxxx 10xxxxxx - 3 bytes
 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 4 bytes
 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 5 bytes
 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 6 bytes
 */
nfsstat4
utf8_name_verify(utf8string *str)
{
	int len = str->utf8string_len;
	unsigned char *u8p = (unsigned char *)str->utf8string_val;
	int pos = 0;

	while (pos < len) {
		unsigned char c = u8p[pos++];
		int i;
		unsigned int symbol;
		utf8_encoding_table *encoding = utf8_table;

		/* check leading byte */
		while (encoding->mask != 0x00) {
			if ((c & encoding->mask) == encoding->signature)
				break;
			++encoding;
		}
		if (encoding->mask == 0x00)
			return (NFS4ERR_INVAL);

		symbol = c & (~encoding->mask);

		/* check tail bytes if leading byte describes so */
		for (i = 0; i < encoding->tail_bytes; ++i) {
			if (pos >= len)
				return (NFS4ERR_INVAL);
			c = u8p[pos++];
			if ((c & UTF8_TAIL_MASK) != UTF8_TAIL_SIGNATURE)
				return (NFS4ERR_INVAL);
			symbol <<= UTF8_TAIL_SHIFT;
			symbol |= (c & (~UTF8_TAIL_MASK));
		}

		/* check UTF-16 surrogate */
		if ((symbol >= UTF16_SURROGATE_LOW) &&
		    (symbol <= UTF16_SURROGATE_HIGH))
			return (NFS4ERR_INVAL);

		/* check for invalid Unicode characters */
		if ((symbol == UNICODE_INVAL_1) || (symbol == UNICODE_INVAL_2))
			return (NFS4ERR_INVAL);

		/* check for overlong encoding */
		if (symbol < encoding->min_val)
			return (NFS4ERR_INVAL);
	}

	return (NFS4_OK);
}

/*
 * utf8_dir_verify - checks that the utf8 string is valid
 */
nfsstat4
utf8_dir_verify(utf8string *str)
{
	char *nm;
	int len;

	if (str == NULL)
		return (NFS4ERR_INVAL);

	nm = str->utf8string_val;
	len = str->utf8string_len;
	if (nm == NULL || len == 0) {
		return (NFS4ERR_INVAL);
	}

	if (len == 1 && nm[0] == '.')
		return (NFS4ERR_BADNAME);
	if (len == 2 && nm[0] == '.' && nm[1] == '.')
		return (NFS4ERR_BADNAME);

	if (utf8_strchr(str, '/') != NULL)
		return (NFS4ERR_BADNAME);

	if (utf8_strchr(str, '\0') != NULL)
		return (NFS4ERR_BADNAME);

	return (utf8_name_verify(str));
}
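
/*
 * Example (illustrative, not part of the original source):
 * utf8_name_verify() rejects the classic overlong encoding 0xc0 0xaf
 * of '/'.  The lead byte selects the 2-byte row (0xc0 & 0xe0 == 0xc0)
 * and the tail byte is well formed (0xaf & 0xc0 == 0x80), but the
 * decoded symbol (0x00 << 6) | 0x2f == 0x2f is below the row's min_val
 * of 0x80, so NFS4ERR_INVAL is returned.  UTF-16 surrogates
 * (U+D800..U+DFFF) and U+FFFE/U+FFFF are rejected the same way by the
 * explicit checks.
 */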

/*
 * from rpcsec module (common/rpcsec)
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * authget() gets an auth handle based on the security
 * information from the servinfo in mountinfo.
 * The auth handle is stored in ch_client->cl_auth.
 *
 * The first security flavor of choice is sv_secdata,
 * which is set up by the client.  If that fails, get
 * secinfo from the server and then select one from the
 * server's secinfo list.
 *
 * For the RPCSEC_GSS flavor, upon success, a secure context is
 * established between client and server.
 */
int
authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
{
	int error, i;

	/*
	 * SV4_TRYSECINFO indicates to try the secinfo list from
	 * sv_secinfo until a successful one is reached.  Point
	 * sv_currsec to the selected security mechanism for
	 * later sessions.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
		    i++) {
			if (!(error = sec_clnt_geth(ch_client,
			    &svp->sv_secinfo->sdata[i],
			    cr, &ch_client->cl_auth))) {

				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
				svp->sv_secinfo->index = i;
				/* done */
				svp->sv_flags &= ~SV4_TRYSECINFO;
				break;
			}

			/*
			 * Allow the caller to retry with the security flavor
			 * pointed to by svp->sv_secinfo->index when
			 * ETIMEDOUT/ECONNRESET occurs.
			 */
			if (error == ETIMEDOUT || error == ECONNRESET) {
				svp->sv_secinfo->index = i;
				break;
			}
		}
	} else {
		/* sv_currsec points to one of the entries in sv_secinfo */
		if (svp->sv_currsec) {
			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
			    &ch_client->cl_auth);
		} else {
			/* If it's null, use sv_secdata. */
			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
			    &ch_client->cl_auth);
		}
	}
	nfs_rw_exit(&svp->sv_lock);

	return (error);
}

/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 */
int
clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	plistp = &nfscl->nfscl_chtable4;
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			mutex_exit(&nfscl->nfscl_chtable4_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable4) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable4_lock);
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);

		/*
		 * Get an auth handle.
		 */
		error = authget(svp, cp->ch_client, cr);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab4_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
	mutex_exit(&nfscl->nfscl_chtable4_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
	cp->ch_head = ch;

	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	auth_destroy(cp->ch_client->cl_auth);

	/*
	 * Get an auth handle.
	 */
	error = authget(svp, cp->ch_client, cr);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}

static int
nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	clinfo_t ci;
	bool_t is_recov;
	int firstcall, error = 0;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If this is a soft mount and the server is down, try the
	 * call only once, i.e. do not retransmit.
	 */
	if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget4 calls authget() to get an auth handle.  For the RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server.  If the connection is timed out or reset,
	 * e.g. by a server reboot, we will try again.
	 */
	is_recov = (curthread == mi->mi_recovthread);
	firstcall = 1;

	do {
		error = clget4(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount and zone shutdown, bail out but
		 * let the recovery thread do one more transmission.
		 */
		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
		    (!is_recov || !firstcall)) {
			error = EIO;
			break;
		}

		/* do not retry for soft mount */
		if (!(mi->mi_flags & MI4_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT4(mi))
			break;

		firstcall = 0;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

void
clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
{
	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Timestamp this cache entry so that we know when it was last
	 * used.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Add the free client handle to the front of the list.
	 * This way, the list will be sorted in youngest to oldest
	 * order.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable4_lock);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */
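
/*
 * Example (illustrative, not part of the original source): callers
 * treat nfs_clget4()/clfree4() as a get/put pair around an RPC, along
 * the lines of:
 *
 *	CLIENT *client;
 *	struct chtab *ch;
 *	int error;
 *
 *	error = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
 *	if (error == 0) {
 *		// ... CLNT_CALL(client, ...) ...
 *		clfree4(client, ch, nfscl);
 *	}
 *
 * clfree4() releases the auth handle and returns the client handle to
 * the per-zone cache, where clreclaim4_zone() (below) may later destroy
 * it if it stays unused for CL_HOLDTIME seconds.
 */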

static void
clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
	clstat4_debug.clreclaim.value.ui64++;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable4_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable4_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab4_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/* ARGSUSED */
static void
clreclaim4(void *all)
{
	struct nfs4_clnt *nfscl;

	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
	 */
	mutex_enter(&nfs4_clnt_list_lock);
	nfscl = list_head(&nfs4_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
		clreclaim4_zone(nfscl, CL_HOLDTIME);
	mutex_exit(&nfs4_clnt_list_lock);
}

/*
 * Minimum time-out values indexed by call type
 * These units are in "eights" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
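
/*
 * Example (illustrative, not part of the original source): with
 * hz == 100 (so MAXTIMO == 2000 ticks) and a starting timeo of
 * 3 seconds (300 ticks), successive backoff() calls yield 600, 1200,
 * 2000, 2000, ... ticks: the timeout doubles until a doubling would
 * exceed MAXTIMO, after which it is pinned at MAXTIMO (20 seconds).
 */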

static int
nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
    enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	struct rpc_err rpcerr, rpcerr_tmp;
	enum clnt_stat status;
	int error;
	struct timeval wait;
	int timeo;		/* in units of hz */
	bool_t tryagain, is_recov;
	bool_t cred_cloned = FALSE;
	k_sigset_t smask;
	servinfo4_t *svp;
#ifdef DEBUG
	char *bufp;
#endif
	int firstcall;

	rpcerr.re_status = RPC_SUCCESS;

	/*
	 * If we know that we are rebooting then let's
	 * not bother with doing any over-the-wire work.
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_SHUTDOWN) {
		mutex_exit(&mi->mi_lock);
		return (EIO);
	}
	mutex_exit(&mi->mi_lock);

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
	if (rpcerr.re_errno != 0)
		return (rpcerr.re_errno);

	timeo = (mi->mi_timeo * hz) / 10;

	/*
	 * If hard mounted fs, retry call forever unless hard error
	 * occurs.
	 *
	 * For forced unmount, let the recovery thread through but return
	 * an error for all others.  This is so that user processes can
	 * exit quickly.  The recovery thread bails out after one
	 * transmission so that it can tell if it needs to continue.
	 *
	 * For zone shutdown, behave as above to encourage quick
	 * process exit, but also fail quickly when servers have
	 * timed out before and reduce the timeouts.
	 */
	is_recov = (curthread == mi->mi_recovthread);
	firstcall = 1;
	do {
		tryagain = FALSE;

		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
		    "nfs4_rfscall: vfs_flag=0x%x, %s",
		    mi->mi_vfsp->vfs_flag,
		    is_recov ? "recov thread" : "not recov thread"));

		/*
		 * It's possible that while we were retrying, the admin
		 * decided to reboot.
		 */
		mutex_enter(&mi->mi_lock);
		if (mi->mi_flags & MI4_SHUTDOWN) {
			mutex_exit(&mi->mi_lock);
			clfree4(client, ch, nfscl);
			if (cred_cloned)
				crfree(cr);
			return (EIO);
		}
		mutex_exit(&mi->mi_lock);

		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
		    (!is_recov || !firstcall)) {
			clfree4(client, ch, nfscl);
			if (cred_cloned)
				crfree(cr);
			return (EIO);
		}

		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_flags & MI4_TIMEDOUT) ||
			    !is_recov || !firstcall) {
				mutex_exit(&mi->mi_lock);
				clfree4(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (EIO);
			}
			mutex_exit(&mi->mi_lock);
			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
		}

		firstcall = 0;
		TICK_TO_TIMEVAL(timeo, &wait);

		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM (preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
		if (!(mi->mi_flags & MI4_INT))
			client->cl_nosignal = TRUE;

		/*
		 * If there is a current signal, then don't bother
		 * even trying to send out the request because we
		 * won't be able to block waiting for the response.
		 * Simply assume RPC_INTR and get on with it.
		 */
		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
			status = RPC_INTR;
		else {
			status = CLNT_CALL(client, which, xdrargs, argsp,
			    xdrres, resp, wait);
		}

		if (!(mi->mi_flags & MI4_INT))
			client->cl_nosignal = FALSE;
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);

		switch (status) {
		case RPC_SUCCESS:
			break;

		case RPC_INTR:
			/*
			 * There is no way to recover from this error,
			 * even if mount option nointr is specified.
			 * SIGKILL, for example, cannot be blocked.
			 */
			rpcerr.re_status = RPC_INTR;
			rpcerr.re_errno = EINTR;
			break;

		case RPC_UDERROR:
			/*
			 * If the NFS server is local (vold) and
			 * it goes away then we get RPC_UDERROR.
			 * This is a retryable error, so we would
			 * loop, so check to see if the specific
			 * error was ECONNRESET, indicating that
			 * the target did not exist at all.  If so,
			 * return with RPC_PROGUNAVAIL and
			 * ECONNRESET to indicate why.
			 */
			CLNT_GETERR(client, &rpcerr);
			if (rpcerr.re_errno == ECONNRESET) {
				rpcerr.re_status = RPC_PROGUNAVAIL;
				rpcerr.re_errno = ECONNRESET;
				break;
			}
			/*FALLTHROUGH*/

		default:		/* probably RPC_TIMEDOUT */

			if (IS_UNRECOVERABLE_RPC(status))
				break;

			/*
			 * increment server not responding count
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_noresponse++;
			mutex_exit(&mi->mi_lock);
#ifdef DEBUG
			nfscl->nfscl_stat.noresponse.value.ui64++;
#endif
			/*
			 * On zone shutdown, mark server dead and move on.
			 */
			if (zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_TIMEDOUT;
				mutex_exit(&mi->mi_lock);
				clfree4(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (EIO);
			}

			/*
			 * NFS client failover support:
			 * return and let the caller take care of
			 * failover.  We only return for failover mounts
			 * because otherwise we want the "not responding"
			 * message, the timer updates, etc.
			 */
			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
			    (error = try_failover(status)) != 0) {
				clfree4(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				*rpc_statusp = status;
				return (error);
			}

			if (flags & RFSCALL_SOFT)
				break;

			tryagain = TRUE;

			/*
			 * The call is in progress (over COTS).
			 * Try the CLNT_CALL again, but don't
			 * print a noisy error message.
			 */
			if (status == RPC_INPROGRESS)
				break;

			timeo = backoff(timeo);
			CLNT_GETERR(client, &rpcerr_tmp);

			mutex_enter(&mi->mi_lock);
			if (!(mi->mi_flags & MI4_PRINTED)) {
				mi->mi_flags |= MI4_PRINTED;
				mutex_exit(&mi->mi_lock);
				if ((status == RPC_CANTSEND) &&
				    (rpcerr_tmp.re_errno == ENOBUFS))
					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
					    0, 0, FALSE, NULL, 0, NULL);
				else
					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
					    0, 0, 0, FALSE, NULL, 0, NULL);
			} else
				mutex_exit(&mi->mi_lock);

			if (*doqueue && nfs_has_ctty()) {
				*doqueue = 0;
				if (!(mi->mi_flags & MI4_NOPRINT)) {
					if ((status == RPC_CANTSEND) &&
					    (rpcerr_tmp.re_errno == ENOBUFS))
						nfs4_queue_fact(RF_SENDQ_FULL,
						    mi, 0, 0, 0, FALSE, NULL,
						    0, NULL);
					else
						nfs4_queue_fact(
						    RF_SRV_NOT_RESPOND, mi, 0,
						    0, 0, FALSE, NULL, 0, NULL);
				}
			}
		}
	} while (tryagain);

	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
	    int, rpcerr.re_errno);

	if (status != RPC_SUCCESS) {
		zoneid_t zoneid = mi->mi_zone->zone_id;

		/*
		 * Let soft mounts use the timed out message.
		 */
		if (status == RPC_INPROGRESS)
			status = RPC_TIMEDOUT;
		nfscl->nfscl_stat.badcalls.value.ui64++;
		if (status != RPC_INTR) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI4_DOWN;
			mutex_exit(&mi->mi_lock);
			CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
			bufp = clnt_sperror(client, svp->sv_hostname);
			zprintf(zoneid, "NFS%d %s failed for %s\n",
			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI4_NOPRINT)) {
					uprintf("NFS%d %s failed for %s\n",
					    mi->mi_vers, mi->mi_rfsnames[which],
					    bufp);
				}
			}
			kmem_free(bufp, MAXPATHLEN);
#else
			zprintf(zoneid,
			    "NFS %s failed for server %s: error %d (%s)\n",
			    mi->mi_rfsnames[which], svp->sv_hostname,
			    status, clnt_sperrno(status));
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI4_NOPRINT)) {
					uprintf(
				"NFS %s failed for server %s: error %d (%s)\n",
					    mi->mi_rfsnames[which],
					    svp->sv_hostname, status,
					    clnt_sperrno(status));
				}
			}
#endif
			/*
			 * when CLNT_CALL() fails with RPC_AUTHERROR,
			 * re_errno is set appropriately depending on
			 * the authentication error
			 */
			if (status == RPC_VERSMISMATCH ||
			    status == RPC_PROGVERSMISMATCH)
				rpcerr.re_errno = EIO;
		}
	} else {
		/*
		 * Test the value of mi_down and mi_printed without
		 * holding the mi_lock mutex.  If they are both zero,
		 * then it is okay to skip the down and printed
		 * processing.  This saves on a mutex_enter and
		 * mutex_exit pair for a normal, successful RPC.
		 * This was just complete overhead.
		 */
		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_DOWN;
			if (mi->mi_flags & MI4_PRINTED) {
				mi->mi_flags &= ~MI4_PRINTED;
				mutex_exit(&mi->mi_lock);
				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
					    0, FALSE, NULL, 0, NULL);
			} else
				mutex_exit(&mi->mi_lock);
		}

		if (*doqueue == 0) {
			if (!(mi->mi_flags & MI4_NOPRINT) &&
			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
				    FALSE, NULL, 0, NULL);

			*doqueue = 1;
		}
	}

	clfree4(client, ch, nfscl);
	if (cred_cloned)
		crfree(cr);

	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);

	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
	    rpcerr.re_errno);

	*rpc_statusp = status;
	return (rpcerr.re_errno);
}

/*
 * rfs4call - general wrapper for RPC calls initiated by the client
 */
void
rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
    cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
{
	int i, error;
	enum clnt_stat rpc_status = RPC_SUCCESS;
	int num_resops;
	struct nfs4_clnt *nfscl;

	ASSERT(nfs_zone() == mi->mi_zone);
	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;

	/* Set up the results struct for XDR usage */
	resp->argsp = argsp;
	resp->array = NULL;
	resp->status = 0;
	resp->decode_len = 0;

	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
	    doqueue, &rpc_status, flags, nfscl);

	/* Return now if it was an RPC error */
	if (error) {
		ep->error = error;
		ep->stat = resp->status;
		ep->rpc_status = rpc_status;
		return;
	}

	/* else we'll count the processed operations */
	num_resops = resp->decode_len;
	for (i = 0; i < num_resops; i++) {
		/*
		 * Count the individual operations
		 * processed by the server.
		 */
		if (resp->array[i].resop >= NFSPROC4_NULL &&
		    resp->array[i].resop <= OP_WRITE)
			mi->mi_reqs[resp->array[i].resop].value.ui64++;
	}

	ep->error = 0;
	ep->stat = resp->status;
	ep->rpc_status = rpc_status;
}

/*
 * nfs4rename_update - updates stored state after a rename.  Currently this
 * is the path of the object and anything under it, and the filehandle of
 * the renamed object.
 */
void
nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
{
	sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
	fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
}
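
/*
 * Example (illustrative, not part of the original source): a caller of
 * rfs4call() distinguishes transport-level failures from NFS-level
 * status, roughly:
 *
 *	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 *
 *	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
 *	if (e.error != 0) {
 *		// the RPC itself failed; e.rpc_status says how
 *	} else if (e.stat != NFS4_OK) {
 *		// the server processed the compound but returned an error
 *	}
 *
 * remap_lookup() below follows a similar pattern.
 */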

/*
 * Routine to look up the filehandle for the given path and rootvp.
 *
 * Return values:
 * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
 *   updated.
 * - error: return value (errno value) and/or *statp is set appropriately.
 */
#define	RML_ORDINARY	1
#define	RML_NAMED_ATTR	2
#define	RML_ATTRDIR	3

static void
remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
    int filetype, cred_t *cr,
    nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
    nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
    nfs4_error_t *ep)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int num_argops;
	lookup4_param_t lookuparg;
	nfs_fh4 *tmpfhp;
	int doqueue = 1;
	char *path;
	mntinfo4_t *mi;

	ASSERT(fname != NULL);
	ASSERT(rootvp->v_type == VDIR);

	mi = VTOMI4(rootvp);
	path = fn_path(fname);
	switch (filetype) {
	case RML_NAMED_ATTR:
		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
		args.ctag = TAG_REMAP_LOOKUP_NA;
		break;
	case RML_ATTRDIR:
		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
		args.ctag = TAG_REMAP_LOOKUP_AD;
		break;
	case RML_ORDINARY:
		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
		args.ctag = TAG_REMAP_LOOKUP;
		break;
	default:
		ep->error = EINVAL;
		return;
	}
	lookuparg.argsp = &args;
	lookuparg.resp = &res;
	lookuparg.header_len = 1;	/* Putfh */
	lookuparg.trailer_len = 0;
	lookuparg.ga_bits = NFS4_VATTR_MASK;
	lookuparg.mi = VTOMI4(rootvp);

	(void) nfs4lookup_setup(path, &lookuparg, 1);

	/* 0: putfh directory */
	argop = args.array;
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;

	num_argops = args.array_len;

	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);

	if (ep->error || res.status != NFS4_OK)
		goto exit;

	/* get the object filehandle */
	resop = &res.array[res.array_len - 2];
	if (resop->resop != OP_GETFH) {
		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
	nfs_fh4_copy(tmpfhp, fhp);

	/* get the object attributes */
	resop = &res.array[res.array_len - 1];
	if (garp && resop->resop == OP_GETATTR)
		*garp = resop->nfs_resop4_u.opgetattr.ga_res;

	/* See if there are enough fields in the response for parent info */
	if ((int)res.array_len - 5 <= 0)
		goto exit;

	/* get the parent filehandle */
	resop = &res.array[res.array_len - 5];
	if (resop->resop != OP_GETFH) {
		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
	nfs_fh4_copy(tmpfhp, pfhp);

	/* get the parent attributes */
	resop = &res.array[res.array_len - 4];
	if (pgarp && resop->resop == OP_GETATTR)
		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;

exit:
	/*
	 * It is too hard to remember where all the OP_LOOKUPs are
	 */
	nfs4args_lookup_free(argop, num_argops);
	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));

	if (!ep->error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(path, strlen(path) + 1);
}
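
/*
 * Example (illustrative, not part of the original source): the fixed
 * offsets used above assume that the compound built by
 * nfs4lookup_setup() ends with the op sequence
 *
 *	..., GETFH, GETATTR, LOOKUP, GETFH, GETATTR
 *	     (parent dir)            (final component / object)
 *
 * so res.array[res.array_len - 2] and [res.array_len - 1] hold the
 * object's GETFH/GETATTR results, while [res.array_len - 5] and
 * [res.array_len - 4] hold the parent's, which is why the code checks
 * that array_len - 5 is positive before extracting parent info.
 */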

/*
 * NFS client failover / volatile filehandle support
 *
 * Recover the filehandle for the given rnode.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
void
nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
{
	int is_stub;
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	vnode_t *dvp = NULL;
	cred_t *cr, *cred_otw;
	nfs4_ga_res_t gar, pgar;
	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
	int filetype = RML_ORDINARY;
	nfs4_recov_state_t recov = {NULL, 0, 0};
	int badfhcount = 0;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
	ASSERT(nfs4_consistent_type(vp));

	if (vp->v_flag & VROOT) {
		nfs4_remap_root(mi, ep, flags);
		return;
	}

	/*
	 * Given the root fh, use the path stored in
	 * the rnode to find the fh for the new server.
	 */
	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
	if (ep->error != 0)
		return;

	cr = curthread->t_cred;
	ASSERT(cr != NULL);
get_remap_cred:
	/*
	 * Releases the osp, if it is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	ASSERT(cred_otw != NULL);

	if (rp->r_flags & R4ISXATTR) {
		filetype = RML_NAMED_ATTR;
		(void) vtodv(vp, &dvp, cred_otw, FALSE);
	}

	if (vp->v_flag & V_XATTRDIR) {
		filetype = RML_ATTRDIR;
	}

	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
		/* file mount, doesn't need a remap */
		goto done;
	}

again:
	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
	    &newfh, &gar, &newpfh, &pgar, ep);

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_remap_file: remap_lookup returned %d/%d",
	    ep->error, ep->stat));

	if (last_time == FALSE && ep->error == EACCES) {
		crfree(cred_otw);
		if (dvp != NULL)
			VN_RELE(dvp);
		goto get_remap_cred;
	}
	if (ep->error != 0)
		goto done;

	switch (ep->stat) {
	case NFS4_OK:
		badfhcount = 0;
		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
			mutex_enter(&rp->r_statelock);
			rp->r_delay_interval = 0;
			mutex_exit(&rp->r_statelock);
			uprintf("NFS File Available..\n");
		}
		break;
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_STALE:
		/*
		 * If we ran into filehandle problems, we should try to
		 * remap the root vnode first and hope life gets better.
		 * But we need to avoid loops.
		 */
		if (badfhcount++ > 0)
			goto done;
		if (newfh.nfs_fh4_len != 0) {
			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
			newfh.nfs_fh4_len = 0;
		}
		if (newpfh.nfs_fh4_len != 0) {
			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
			newpfh.nfs_fh4_len = 0;
		}
		/* relative path - remap rootvp then retry */
		VN_RELE(rootvp);
		rootvp = NULL;
		nfs4_remap_root(mi, ep, flags);
		if (ep->error != 0 || ep->stat != NFS4_OK)
			goto done;
		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (ep->error != 0)
			goto done;
		goto again;
	case NFS4ERR_DELAY:
		badfhcount = 0;
		nfs4_set_delay_wait(vp);
		ep->error = nfs4_wait_for_delay(vp, &recov);
		if (ep->error != 0)
			goto done;
		goto again;
	case NFS4ERR_ACCESS:
		/* get new cred, try again */
		if (last_time == TRUE)
			goto done;
		if (dvp != NULL)
			VN_RELE(dvp);
		crfree(cred_otw);
		goto get_remap_cred;
	default:
		goto done;
	}

	/*
	 * Check on the new and old rnodes before updating;
	 * if the vnode type or size changes, issue a warning
	 * and mark the file dead.
	 */
	mutex_enter(&rp->r_statelock);
	if (flags & NFS4_REMAP_CKATTRS) {
		if (vp->v_type != gar.n4g_va.va_type ||
		    (vp->v_type != VDIR &&
		    rp->r_size != gar.n4g_va.va_size)) {
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
			    (int)rp->r_size, (int)gar.n4g_va.va_size,
			    vp->v_type, gar.n4g_va.va_type));
			mutex_exit(&rp->r_statelock);
			nfs4_queue_event(RE_FILE_DIFF, mi,
			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
			    TAG_NONE, TAG_NONE, 0, 0);
			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
			goto done;
		}
	}
	ASSERT(gar.n4g_va.va_type != VNON);
	rp->r_server = mi->mi_curr_serv;

	/*
	 * Turn this object into a "stub" object if we
	 * crossed an underlying server fs boundary.
	 *
	 * This stub will be for a mirror-mount.
	 * A referral would look like a boundary crossing
	 * as well, but would not be the same type of object,
	 * so we would expect to mark the object dead.
	 *
	 * See comment in r4_do_attrcache() for more details.
	 */
	is_stub = 0;
	if (gar.n4g_fsid_valid) {
		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
		rp->r_srv_fsid = gar.n4g_fsid;
		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
			is_stub = 1;
		nfs_rw_exit(&rp->r_server->sv_lock);
#ifdef DEBUG
	} else {
		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
		    "remap_file: fsid attr not provided by server.  rp=%p",
		    (void *)rp));
#endif
	}
	if (is_stub)
		r4_stub_mirrormount(rp);
	else
		r4_stub_none(rp);
	mutex_exit(&rp->r_statelock);
	nfs4_attrcache_noinval(vp, &gar, gethrtime());	/* force update */
	sfh4_update(rp->r_fh, &newfh);
	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we got parent info, use it to update the parent
	 */
	if (newpfh.nfs_fh4_len != 0) {
		if (rp->r_svnode.sv_dfh != NULL)
			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
		if (dvp != NULL) {
			/* force update of attrs */
			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
		}
	}
done:
	if (newfh.nfs_fh4_len != 0)
		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
	if (newpfh.nfs_fh4_len != 0)
		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
	if (cred_otw != NULL)
		crfree(cred_otw);
	if (rootvp != NULL)
		VN_RELE(rootvp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if (osp != NULL)
		open_stream_rele(osp, rp);
}

/*
 * Client-side failover support: remap the filehandle for vp if it appears
 * necessary.  Errors are returned via the nfs4_error_t parameter, though
 * if there is a problem we will simply try again later.
 */
void
nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
{
	if (vp == NULL)
		return;

	if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
		return;

	if (VTOR4(vp)->r_server == mi->mi_curr_serv)
		return;

	nfs4_remap_file(mi, vp, flags, ep);
}
/*
 * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
 *
 * Our caller has a filehandle for ".." relative to a particular
 * directory object. We want to find or create a parent vnode
 * with that filehandle and return it. We can of course create
 * a vnode from this filehandle, but we need to also make sure
 * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
 * that we have a parent FH for future reopens as well. If
 * we have a remap failure, we won't be able to reopen this
 * file, but we won't treat that as fatal because a reopen
 * is unlikely anyway. Someday nfs4_reopen() should look
 * for a missing parent FH and try a remap to recover from it.
 *
 * The need_start_op argument indicates whether this function should
 * do a start_op before calling remap_lookup(). This should be FALSE
 * if the caller is the recovery thread or is already inside an op;
 * otherwise, set it to TRUE.
 */
int
nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
    cred_t *cr, vnode_t **vpp, int need_start_op)
{
	mntinfo4_t *mi = VTOMI4(dvp);
	nfs4_fname_t *np = NULL, *pnp = NULL;
	vnode_t *vp = NULL, *rootvp = NULL;
	rnode4_t *rp;
	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
	nfs4_ga_res_t gar, pgar;
	vattr_t va, pva;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
	nfs4_recov_state_t recov_state;

#ifdef DEBUG
	/*
	 * Ensure need_start_op is correct.
	 */
	{
		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
		    (curthread == mi->mi_recovthread));
		/* C needs a ^^ operator! */
		ASSERT(((need_start_op) && (!no_need_start_op)) ||
		    ((!need_start_op) && (no_need_start_op)));
	}
#endif
	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());

	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
	    rnode4info(VTOR4(dvp))));

	/*
	 * rootvp might be needed eventually. Holding it now will
	 * ensure that r4find_unlocked() will find it, if ".." is the root.
	 */
	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
	if (e.error != 0)
		goto out;
	rp = r4find_unlocked(fhp, mi->mi_vfsp);
	if (rp != NULL) {
		*vpp = RTOV4(rp);
		VN_RELE(rootvp);
		return (0);
	}

	/*
	 * Since we don't have the rnode, we have to go over the wire.
	 * remap_lookup() can get all of the filehandles and attributes
	 * we need in one operation.
	 */
	np = fn_parent(VTOSV(dvp)->sv_name);
	/* if a parent was not found, return an error */
	if (np == NULL) {
		e.error = ENOENT;
		goto out;
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
recov_retry:
	if (need_start_op) {
		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
		    &recov_state, NULL);
		if (e.error != 0) {
			goto out;
		}
	}

	pgar.n4g_va.va_type = VNON;
	gar.n4g_va.va_type = VNON;

	remap_lookup(np, rootvp, RML_ORDINARY, cr,
	    &newfh, &gar, &newpfh, &pgar, &e);
	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
		if (need_start_op) {
			bool_t abort;

			abort = nfs4_start_recovery(&e, mi,
			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
			    NULL);
			if (abort) {
				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
				if (e.error == 0)
					e.error = EIO;
				goto out;
			}
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);
			goto recov_retry;
		}
		if (e.error == 0)
			e.error = EIO;
		goto out;
	}

	va = gar.n4g_va;
	pva = pgar.n4g_va;

	if ((e.error != 0) ||
	    (va.va_type != VDIR)) {
		if (need_start_op)
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, FALSE);
		if (e.error == 0)
			e.error = EIO;
		goto out;
	}

	if (e.stat != NFS4_OK) {
		if (need_start_op)
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, FALSE);
		e.error = EIO;
		goto out;
	}

	/*
	 * It is possible for remap_lookup() to return with no error,
	 * but without providing the parent filehandle and attrs.
	 */
	if (pva.va_type != VDIR) {
		/*
		 * Call remap_lookup() again, this time with the
		 * newpfh and pgar args in the first position.
		 */
		pnp = fn_parent(np);
		if (pnp != NULL) {
			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
			    &newpfh, &pgar, NULL, NULL, &e);
			/*
			 * This remap_lookup call modifies pgar. The following
			 * line prevents trouble when checking the va_type of
			 * pva later in this code.
			 */
			pva = pgar.n4g_va;

			if (nfs4_needs_recovery(&e, FALSE,
			    mi->mi_vfsp)) {
				if (need_start_op) {
					bool_t abort;

					abort = nfs4_start_recovery(&e, mi,
					    rootvp, NULL, NULL, NULL,
					    OP_LOOKUP, NULL, NULL, NULL);
					if (abort) {
						nfs4_end_fop(mi, rootvp, NULL,
						    OH_LOOKUP, &recov_state,
						    FALSE);
						if (e.error == 0)
							e.error = EIO;
						goto out;
					}
					nfs4_end_fop(mi, rootvp, NULL,
					    OH_LOOKUP, &recov_state, TRUE);
					goto recov_retry;
				}
				if (e.error == 0)
					e.error = EIO;
				goto out;
			}

			if (e.stat != NFS4_OK) {
				if (need_start_op)
					nfs4_end_fop(mi, rootvp, NULL,
					    OH_LOOKUP, &recov_state, FALSE);
				e.error = EIO;
				goto out;
			}
		}
		if ((pnp == NULL) ||
		    (e.error != 0) ||
		    (pva.va_type == VNON)) {
			if (need_start_op)
				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			if (e.error == 0)
				e.error = EIO;
			goto out;
		}
	}
	ASSERT(newpfh.nfs_fh4_len != 0);
	if (need_start_op)
		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
	psfh = sfh4_get(&newpfh, mi);

	sfh = sfh4_get(&newfh, mi);
	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);

out:
	if (np != NULL)
		fn_rele(&np);
	if (pnp != NULL)
		fn_rele(&pnp);
	if (newfh.nfs_fh4_len != 0)
		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
	if (newpfh.nfs_fh4_len != 0)
		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
	if (sfh != NULL)
		sfh4_rele(&sfh);
	if (psfh != NULL)
		sfh4_rele(&psfh);
	if (rootvp != NULL)
		VN_RELE(rootvp);
	*vpp = vp;
	return (e.error);
}

#ifdef DEBUG
size_t r_path_memuse = 0;
#endif
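
/*
 * Illustrative sketch (comments only; not part of the build): a lookup
 * of ".." in a directory whose parent rnode is not cached would call
 * nfs4_make_dotdot() roughly as follows. "dotdotfh" stands in for the
 * shared filehandle the caller obtained for "..", and "t" for the time
 * the attributes were fetched; error handling is elided.
 *
 *	vnode_t *pvp = NULL;
 *	int error;
 *
 *	// not the recovery thread and not already in an op, so TRUE
 *	error = nfs4_make_dotdot(dotdotfh, t, dvp, cr, &pvp, TRUE);
 *	if (error == 0) {
 *		// pvp now holds the parent vnode; release it with
 *		// VN_RELE() when done
 *	}
 */
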
/*
 * NFS client failover support
 *
 * sv4_free() frees the malloc'd portion of a "servinfo_t".
 */
void
sv4_free(servinfo4_t *svp)
{
	servinfo4_t *next;
	struct knetconfig *knconf;

	while (svp != NULL) {
		next = svp->sv_next;
		if (svp->sv_dhsec)
			sec_clnt_freeinfo(svp->sv_dhsec);
		if (svp->sv_secdata)
			sec_clnt_freeinfo(svp->sv_secdata);
		if (svp->sv_save_secinfo &&
		    svp->sv_save_secinfo != svp->sv_secinfo)
			secinfo_free(svp->sv_save_secinfo);
		if (svp->sv_secinfo)
			secinfo_free(svp->sv_secinfo);
		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
		knconf = svp->sv_knconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		knconf = svp->sv_origknconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
		if (svp->sv_path != NULL) {
			kmem_free(svp->sv_path, svp->sv_pathlen);
		}
		nfs_rw_destroy(&svp->sv_lock);
		kmem_free(svp, sizeof (*svp));
		svp = next;
	}
}

void
nfs4_printfhandle(nfs4_fhandle_t *fhp)
{
	int *ip;
	char *buf;
	size_t bufsize;
	char *cp;

	/*
	 * 13 == strlen("(file handle:")
	 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
	 *	1 == strlen(" ")
	 *	8 == maximum strlen of "%x" for a 32-bit word
	 * 3 == strlen(")\n") plus the terminating '\0'
	 */
	bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
	buf = kmem_alloc(bufsize, KM_NOSLEEP);
	if (buf == NULL)
		return;

	cp = buf;
	(void) strcpy(cp, "(file handle:");
	while (*cp != '\0')
		cp++;
	for (ip = (int *)fhp->fh_buf;
	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
	    ip++) {
		(void) sprintf(cp, " %x", *ip);
		while (*cp != '\0')
			cp++;
	}
	(void) strcpy(cp, ")\n");

	zcmn_err(getzoneid(), CE_CONT, "%s", buf);

	kmem_free(buf, bufsize);
}

/*
 * The NFSv4 readdir cache subsystem.
 *
 * We provide a set of interfaces to allow the rest of the system to utilize
 * a caching mechanism while encapsulating the details of the actual
 * implementation. This should allow for better maintainability and
 * extensibility by consolidating the implementation details in one location.
 */

/*
 * Comparator used by the AVL routines. Entries are ordered by cookie,
 * then by buffer length.
 */
static int
rddir4_cache_compar(const void *x, const void *y)
{
	rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
	rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
	rddir4_cache *a = &ai->rc;
	rddir4_cache *b = &bi->rc;

	if (a->nfs4_cookie == b->nfs4_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs4_cookie < b->nfs4_cookie)
		return (-1);

	return (1);
}
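
/*
 * Illustrative sketch (comments only; not part of the build): because the
 * comparator orders entries by (nfs4_cookie, buflen), a caller can probe
 * the tree with a stack-allocated key, exactly as rddir4_cache_lookup()
 * does below:
 *
 *	rddir4_cache_impl srdip;
 *	avl_index_t where;
 *
 *	srdip.rc.nfs4_cookie = cookie;	// directory offset to find
 *	srdip.rc.buflen = count;	// request size must match, too
 *	rdip = avl_find(rp->r_dir, &srdip, &where);
 *
 * Two READDIR requests at the same offset but with different buffer
 * sizes therefore occupy distinct entries in the cache.
 */
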
/*
 * Allocate an opaque handle for the readdir cache.
 */
void
rddir4_cache_create(rnode4_t *rp)
{
	ASSERT(rp->r_dir == NULL);

	rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
	    offsetof(rddir4_cache_impl, tree));
}

/*
 * Purge the cache of all cached readdir responses.
 */
void
rddir4_cache_purge(rnode4_t *rp)
{
	rddir4_cache_impl	*rdip;
	rddir4_cache_impl	*nrdip;

	ASSERT(MUTEX_HELD(&rp->r_statelock));

	if (rp->r_dir == NULL)
		return;

	rdip = avl_first(rp->r_dir);

	while (rdip != NULL) {
		nrdip = AVL_NEXT(rp->r_dir, rdip);
		avl_remove(rp->r_dir, rdip);
		rdip->rc.flags &= ~RDDIRCACHED;
		rddir4_cache_rele(rp, &rdip->rc);
		rdip = nrdip;
	}
	ASSERT(avl_numnodes(rp->r_dir) == 0);
}

/*
 * Destroy the readdir cache.
 */
void
rddir4_cache_destroy(rnode4_t *rp)
{
	ASSERT(MUTEX_HELD(&rp->r_statelock));
	if (rp->r_dir == NULL)
		return;

	rddir4_cache_purge(rp);
	avl_destroy(rp->r_dir);
	kmem_free(rp->r_dir, sizeof (avl_tree_t));
	rp->r_dir = NULL;
}
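
/*
 * Illustrative sketch (comments only; not part of the build): the cache
 * lifecycle as seen by an rnode, with the locking the ASSERTs above
 * require. Creation happens while the rnode is being set up; purge and
 * destroy require r_statelock:
 *
 *	rddir4_cache_create(rp);	// rp->r_dir now valid
 *	...
 *	mutex_enter(&rp->r_statelock);
 *	rddir4_cache_purge(rp);		// drop all cached responses
 *	rddir4_cache_destroy(rp);	// free the AVL tree itself
 *	mutex_exit(&rp->r_statelock);
 */
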
/*
 * Locate a readdir response from the readdir cache.
 *
 * Return values:
 *
 * NULL - If an unrecoverable situation has arisen, e.g. the operation
 *	  was interrupted.
 *
 * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
 *		    The flags are set appropriately, such that the caller
 *		    knows what state the entry is in.
 */
rddir4_cache *
rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
{
	rddir4_cache_impl	*rdip = NULL;
	rddir4_cache_impl	srdip;
	rddir4_cache		*srdc;
	rddir4_cache		*rdc = NULL;
	rddir4_cache		*nrdc = NULL;
	avl_index_t		where;

top:
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
	ASSERT(MUTEX_HELD(&rp->r_statelock));
	/*
	 * Check to see if the readdir cache has been disabled. If so,
	 * simply allocate an rddir4_cache entry and return it, since caching
	 * operations do not apply.
	 */
	if (rp->r_dir == NULL) {
		if (nrdc == NULL) {
			/*
			 * Drop the lock because we are doing a sleeping
			 * allocation.
			 */
			mutex_exit(&rp->r_statelock);
			rdc = rddir4_cache_alloc(KM_SLEEP);
			rdc->nfs4_cookie = cookie;
			rdc->buflen = count;
			mutex_enter(&rp->r_statelock);
			return (rdc);
		}
		return (nrdc);
	}

	srdc = &srdip.rc;
	srdc->nfs4_cookie = cookie;
	srdc->buflen = count;

	rdip = avl_find(rp->r_dir, &srdip, &where);

	/*
	 * If we didn't find an entry, create one and insert it
	 * into the cache.
	 */
	if (rdip == NULL) {
		/*
		 * Check for the case where we have made a second pass through
		 * the cache due to a lockless allocation. If we find that no
		 * thread has already inserted this entry, do the insert now
		 * and return.
		 */
		if (nrdc != NULL) {
			avl_insert(rp->r_dir, nrdc->data, where);
			nrdc->flags |= RDDIRCACHED;
			rddir4_cache_hold(nrdc);
			return (nrdc);
		}

#ifdef DEBUG
		nfs4_readdir_cache_misses++;
#endif
		/*
		 * First, try to allocate an entry without sleeping. If that
		 * fails, drop the lock and do a sleeping allocation.
		 */
		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
		if (nrdc != NULL) {
			nrdc->nfs4_cookie = cookie;
			nrdc->buflen = count;
			avl_insert(rp->r_dir, nrdc->data, where);
			nrdc->flags |= RDDIRCACHED;
			rddir4_cache_hold(nrdc);
			return (nrdc);
		}

		/*
		 * Drop the lock and do a sleeping allocation. We incur
		 * additional overhead by having to search the cache again,
		 * but this case should be rare.
		 */
		mutex_exit(&rp->r_statelock);
		nrdc = rddir4_cache_alloc(KM_SLEEP);
		nrdc->nfs4_cookie = cookie;
		nrdc->buflen = count;
		mutex_enter(&rp->r_statelock);
		/*
		 * We need to take another pass through the cache
		 * since we dropped our lock to perform the alloc.
		 * Another thread may have come by and inserted the
		 * entry we are interested in.
		 */
		goto top;
	}

	/*
	 * Check to see if we need to free our entry. This can happen if
	 * another thread came along and beat us to the insert. We can
	 * safely call rddir4_cache_free() directly because no other thread
	 * would have a reference to this entry.
	 */
	if (nrdc != NULL)
		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);

#ifdef DEBUG
	nfs4_readdir_cache_hits++;
#endif
	/*
	 * Found something. Make sure it's ready to return.
	 */
	rdc = &rdip->rc;
	rddir4_cache_hold(rdc);
	/*
	 * If the cache entry is in the process of being filled in, wait
	 * until this completes. The RDDIRWAIT bit is set to indicate that
	 * someone is waiting, and when the thread currently filling the entry
	 * is done, it should do a cv_broadcast to wake up all of the threads
	 * waiting for it to finish. If the thread wakes up to find that
	 * someone new is now trying to complete the entry, go back
	 * to sleep.
	 */
	while (rdc->flags & RDDIR) {
		/*
		 * The entry is not complete.
		 */
		nfs_rw_exit(&rp->r_rwlock);
		rdc->flags |= RDDIRWAIT;
#ifdef DEBUG
		nfs4_readdir_cache_waits++;
#endif
		while (rdc->flags & RDDIRWAIT) {
			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
				/*
				 * We were interrupted, probably because the
				 * user typed ^C or an alarm fired. Release
				 * our hold on the entry and return NULL.
				 */
				rddir4_cache_rele(rp, rdc);
				mutex_exit(&rp->r_statelock);
				(void) nfs_rw_enter_sig(&rp->r_rwlock,
				    RW_READER, FALSE);
				mutex_enter(&rp->r_statelock);
				return (NULL);
			}
		}
		mutex_exit(&rp->r_statelock);
		(void) nfs_rw_enter_sig(&rp->r_rwlock,
		    RW_READER, FALSE);
		mutex_enter(&rp->r_statelock);
	}

	/*
	 * The entry we were waiting on may have been purged from
	 * the cache and should no longer be used; release it and
	 * start over.
	 */
	if (!(rdc->flags & RDDIRCACHED)) {
		rddir4_cache_rele(rp, rdc);
		goto top;
	}

	/*
	 * The entry is complete. Return it.
	 */
	return (rdc);
}
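
/*
 * Illustrative sketch (comments only; not part of the build): the protocol
 * a readdir caller follows around rddir4_cache_lookup(), inferred from the
 * ASSERTs and flag handling above. The caller holds r_rwlock as a reader
 * and r_statelock across the lookup; a returned entry still flagged
 * RDDIRREQ has yet to be filled by an over-the-wire READDIR:
 *
 *	mutex_enter(&rp->r_statelock);
 *	rdc = rddir4_cache_lookup(rp, cookie, count);
 *	if (rdc == NULL) {
 *		mutex_exit(&rp->r_statelock);
 *		return (EINTR);
 *	}
 *	if (rdc->flags & RDDIRREQ) {
 *		rdc->flags &= ~RDDIRREQ;
 *		rdc->flags |= RDDIR;	// we will fill the entry
 *		mutex_exit(&rp->r_statelock);
 *		// ... issue the OTW READDIR and store the results, then
 *		// clear RDDIR and wake any waiters via
 *		// rddir4_cache_rele(rp, rdc)
 *	}
 */
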
/*
 * Allocate a cache element and return it. Can return NULL if memory is
 * low.
 */
static rddir4_cache *
rddir4_cache_alloc(int flags)
{
	rddir4_cache_impl *rdip = NULL;
	rddir4_cache *rc = NULL;

	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);

	if (rdip != NULL) {
		rc = &rdip->rc;
		rc->data = (void *)rdip;
		rc->nfs4_cookie = 0;
		rc->nfs4_ncookie = 0;
		rc->entries = NULL;
		rc->eof = 0;
		rc->entlen = 0;
		rc->buflen = 0;
		rc->actlen = 0;
		/*
		 * A readdir is required, so set the flag.
		 */
		rc->flags = RDDIRREQ;
		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
		rc->error = 0;
		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
		rdip->count = 1;
#ifdef DEBUG
		atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
#endif
	}
	return (rc);
}

/*
 * Increment the reference count on this cache element.
 */
static void
rddir4_cache_hold(rddir4_cache *rc)
{
	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;

	mutex_enter(&rdip->lock);
	rdip->count++;
	mutex_exit(&rdip->lock);
}

/*
 * Release a reference to this cache element. If the count drops to
 * zero, free the element.
 */
void
rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
{
	rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;

	ASSERT(MUTEX_HELD(&rp->r_statelock));

	/*
	 * Check to see if we have any waiters. If so, wake them
	 * up so that they can proceed.
	 */
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	}

	mutex_enter(&rdip->lock);
	ASSERT(rdip->count > 0);
	if (--rdip->count == 0) {
		mutex_exit(&rdip->lock);
		rddir4_cache_free(rdip);
	} else
		mutex_exit(&rdip->lock);
}

/*
 * Free a cache element.
 */
static void
rddir4_cache_free(rddir4_cache_impl *rdip)
{
	rddir4_cache *rc = &rdip->rc;

#ifdef DEBUG
	atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
#endif
	if (rc->entries != NULL)
		kmem_free(rc->entries, rc->buflen);
	cv_destroy(&rc->cv);
	mutex_destroy(&rdip->lock);
	kmem_free(rdip, sizeof (*rdip));
}
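
/*
 * Illustrative sketch (comments only; not part of the build): the
 * reference-counting discipline for cache elements. An element is born
 * with one reference from rddir4_cache_alloc(); the cache keeps that
 * reference while the entry carries RDDIRCACHED, and each lookup takes
 * one more:
 *
 *	rdc = rddir4_cache_lookup(rp, cookie, count);	// count >= 2
 *	// ... use the entry ...
 *	rddir4_cache_rele(rp, rdc);			// drop our hold
 *
 * When rddir4_cache_purge() removes the entry from the AVL tree, it
 * drops the cache's reference; the storage is freed only when the last
 * holder calls rddir4_cache_rele().
 */
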
/*
 * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
 * framework.
 */
static int
cl4_snapshot(kstat_t *ksp, void *buf, int rw)
{
	ksp->ks_snaptime = gethrtime();
	if (rw == KSTAT_WRITE) {
		bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
#ifdef DEBUG
		/*
		 * Currently only the global zone can write to kstats, but we
		 * add the check just for paranoia.
		 */
		if (INGLOBALZONE(curproc))
			bcopy((char *)buf + sizeof (clstat4_tmpl),
			    &clstat4_debug, sizeof (clstat4_debug));
#endif
	} else {
		bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
#ifdef DEBUG
		/*
		 * If we're displaying the "global" debug kstat values, we
		 * display them as-is to all zones since in fact they apply to
		 * the system as a whole.
		 */
		bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
		    sizeof (clstat4_debug));
#endif
	}
	return (0);
}

/*
 * Zone support
 */
static void *
clinit4_zone(zoneid_t zoneid)
{
	kstat_t *nfs4_client_kstat;
	struct nfs4_clnt *nfscl;
	uint_t ndata;

	nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
	nfscl->nfscl_chtable4 = NULL;
	nfscl->nfscl_zoneid = zoneid;

	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
	ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
#ifdef DEBUG
	ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
#endif
	if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
	    "misc", KSTAT_TYPE_NAMED, ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
		nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
		nfs4_client_kstat->ks_snapshot = cl4_snapshot;
		kstat_install(nfs4_client_kstat);
	}
	mutex_enter(&nfs4_clnt_list_lock);
	list_insert_head(&nfs4_clnt_list, nfscl);
	mutex_exit(&nfs4_clnt_list_lock);

	return (nfscl);
}

/*ARGSUSED*/
static void
clfini4_zone(zoneid_t zoneid, void *arg)
{
	struct nfs4_clnt *nfscl = arg;
	chhead_t *chp, *next;

	if (nfscl == NULL)
		return;
	mutex_enter(&nfs4_clnt_list_lock);
	list_remove(&nfs4_clnt_list, nfscl);
	mutex_exit(&nfs4_clnt_list_lock);
	clreclaim4_zone(nfscl, 0);
	for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
		ASSERT(chp->ch_list == NULL);
		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
		next = chp->ch_next;
		kmem_free(chp, sizeof (*chp));
	}
	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
	mutex_destroy(&nfscl->nfscl_chtable4_lock);
	kmem_free(nfscl, sizeof (*nfscl));
}

/*
 * Called by endpnt_destructor to make sure the client handles are
 * cleaned up before the RPC endpoints. This becomes a no-op if
 * clfini4_zone() (above) is called first. This function is needed
 * (rather than relying on clfini4_zone() to clean up) because the ZSD
 * callbacks have no ordering mechanism, so we have no way to ensure
 * that clfini4_zone() is called before endpnt_destructor.
 */
void
clcleanup4_zone(zoneid_t zoneid)
{
	struct nfs4_clnt *nfscl;

	mutex_enter(&nfs4_clnt_list_lock);
	nfscl = list_head(&nfs4_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
		if (nfscl->nfscl_zoneid == zoneid) {
			clreclaim4_zone(nfscl, 0);
			break;
		}
	}
	mutex_exit(&nfs4_clnt_list_lock);
}
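
/*
 * Illustrative sketch (comments only; not part of the build): the layout
 * of the nfs:0:nfs4_client kstat buffer that cl4_snapshot() copies in and
 * out. The per-zone template counters always come first; on DEBUG kernels
 * the global debug counters are appended:
 *
 *	+--------------------------+  offset 0
 *	| clstat4_tmpl counters    |  per-zone (calls, badcalls, ...)
 *	+--------------------------+  offset sizeof (clstat4_tmpl)
 *	| clstat4_debug counters   |  DEBUG only, system-wide
 *	+--------------------------+
 *
 * This is why clinit4_zone() computes ndata as the sum of the two
 * structure sizes (in kstat_named_t units) on DEBUG kernels.
 */
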
int
nfs4_subr_init(void)
{
	/*
	 * Allocate and initialize the client handle cache.
	 */
	chtab4_cache = kmem_cache_create("client_handle4_cache",
	    sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
	    NULL, 0);

	/*
	 * Initialize the list of per-zone client handles (and associated
	 * data). This needs to be done before we call zone_key_create().
	 */
	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
	    offsetof(struct nfs4_clnt, nfscl_node));

	/*
	 * Initialize the zone_key for per-zone client handle lists.
	 */
	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);

	if (nfs4err_delay_time == 0)
		nfs4err_delay_time = NFS4ERR_DELAY_TIME;

	return (0);
}

int
nfs4_subr_fini(void)
{
	/*
	 * Deallocate the client handle cache.
	 */
	kmem_cache_destroy(chtab4_cache);

	/*
	 * Destroy the zone_key.
	 */
	(void) zone_key_delete(nfs4clnt_zone_key);

	return (0);
}

/*
 * Set or clear the direct I/O flag.
 * VOP_RWLOCK() is held for write access to prevent a race condition
 * that would occur if a process were in the middle of a write when the
 * directio flag got set. It is possible that all pages may not get flushed.
 *
 * This is a copy of nfs_directio(); changes here may need to be made
 * there and vice versa.
 */
int
nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
{
	int error = 0;
	rnode4_t *rp;

	rp = VTOR4(vp);

	if (cmd == DIRECTIO_ON) {

		if (rp->r_flags & R4DIRECTIO)
			return (0);

		/*
		 * Flush the page cache.
		 */
		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);

		if (rp->r_flags & R4DIRECTIO) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
			return (0);
		}

		if (nfs4_has_pages(vp) &&
		    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
			error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
			    B_INVAL, cr, NULL);
			if (error) {
				if (error == ENOSPC || error == EDQUOT) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				return (error);
			}
		}

		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4DIRECTIO;
		mutex_exit(&rp->r_statelock);
		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
		return (0);
	}

	if (cmd == DIRECTIO_OFF) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4DIRECTIO;	/* disable direct mode */
		mutex_exit(&rp->r_statelock);
		return (0);
	}

	return (EINVAL);
}

/*
 * Return TRUE if the file has any pages. Always go back to
 * the master vnode to check v_pages, since none of the shadows
 * can have pages.
 */
bool_t
nfs4_has_pages(vnode_t *vp)
{
	rnode4_t *rp;

	rp = VTOR4(vp);
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);	/* RTOV4 always gives the master */

	return (vn_has_cached_data(vp));
}
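
/*
 * Illustrative sketch (comments only; not part of the build): direct I/O
 * is toggled per file. A consumer such as an ioctl handler would call
 * nfs4_directio() roughly as follows; whether the _FIODIRECTIO ioctl is
 * routed here is an assumption of this sketch:
 *
 *	error = nfs4_directio(vp, DIRECTIO_ON, cr);	// flushes pages
 *	...
 *	error = nfs4_directio(vp, DIRECTIO_OFF, cr);	// back to cached
 *
 * Anything other than DIRECTIO_ON or DIRECTIO_OFF gets EINVAL, and
 * DIRECTIO_ON may fail with the VOP_PUTPAGE() error if the flush of
 * dirty pages fails.
 */
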
/*
 * This table is used to determine whether the client should attempt
 * failover based on the clnt_stat value returned by CLNT_CALL. The
 * clnt_stat is used as an index into the table; if the error value that
 * corresponds to the clnt_stat value in the table is non-zero, then that
 * is the error to be returned AND it signals that failover should be
 * attempted.
 *
 * Special note: If the RPC_ values change, then direct indexing of the
 * table is no longer valid, but having the RPC_ values in the table
 * allows the functions to detect the change and issue a warning.
 * In this case, the code will always attempt failover as a defensive
 * measure.
 */
static struct try_failover_tab {
	enum clnt_stat	cstat;
	int		error;
} try_failover_table [] = {

	RPC_SUCCESS,		0,
	RPC_CANTENCODEARGS,	0,
	RPC_CANTDECODERES,	0,
	RPC_CANTSEND,		ECOMM,
	RPC_CANTRECV,		ECOMM,
	RPC_TIMEDOUT,		ETIMEDOUT,
	RPC_VERSMISMATCH,	0,
	RPC_AUTHERROR,		0,
	RPC_PROGUNAVAIL,	0,
	RPC_PROGVERSMISMATCH,	0,
	RPC_PROCUNAVAIL,	0,
	RPC_CANTDECODEARGS,	0,
	RPC_SYSTEMERROR,	ENOSR,
	RPC_UNKNOWNHOST,	EHOSTUNREACH,
	RPC_RPCBFAILURE,	ENETUNREACH,
	RPC_PROGNOTREGISTERED,	ECONNREFUSED,
	RPC_FAILED,		ETIMEDOUT,
	RPC_UNKNOWNPROTO,	EHOSTUNREACH,
	RPC_INTR,		0,
	RPC_UNKNOWNADDR,	EHOSTUNREACH,
	RPC_TLIERROR,		0,
	RPC_NOBROADCAST,	EHOSTUNREACH,
	RPC_N2AXLATEFAILURE,	ECONNREFUSED,
	RPC_UDERROR,		0,
	RPC_INPROGRESS,		0,
	RPC_STALERACHANDLE,	EINVAL,
	RPC_CANTCONNECT,	ECONNREFUSED,
	RPC_XPRTFAILED,		ECONNABORTED,
	RPC_CANTCREATESTREAM,	ECONNREFUSED,
	RPC_CANTSTORE,		ENOBUFS
};

/*
 * nfs4_try_failover - determine whether the client should
 * attempt failover based on the values stored in the nfs4_error_t.
 */
int
nfs4_try_failover(nfs4_error_t *ep)
{
	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
		return (TRUE);

	if (ep->error && ep->rpc_status != RPC_SUCCESS)
		return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);

	return (FALSE);
}

/*
 * try_failover - internal version of nfs4_try_failover, called
 * only by rfscall and aclcall. Determine whether failover is warranted
 * based on the clnt_stat, and return the error number if it is.
 */
static int
try_failover(enum clnt_stat rpc_status)
{
	int err = 0;

	if (rpc_status == RPC_SUCCESS)
		return (0);

#ifdef DEBUG
	if (rpc_status != 0 && nfs4_try_failover_any) {
		err = ETIMEDOUT;
		goto done;
	}
#endif
	/*
	 * The rpc status is used as an index into the table.
	 * If the rpc status is outside of the range of the
	 * table, or if the rpc error numbers have been changed
	 * since the table was constructed, then print a warning
	 * (DEBUG only) and try failover anyway. Otherwise, just
	 * grab the resulting error number out of the table.
	 */
	if (rpc_status < RPC_SUCCESS || rpc_status >=
	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
	    try_failover_table[rpc_status].cstat != rpc_status) {

		err = ETIMEDOUT;
#ifdef DEBUG
		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
		    rpc_status);
#endif
	} else
		err = try_failover_table[rpc_status].error;

done:
	if (rpc_status)
		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
		    "nfs4_try_failover: %strying failover on error %d",
		    err ? "" : "NOT ", rpc_status));

	return (err);
}

void
nfs4_error_zinit(nfs4_error_t *ep)
{
	ep->error = 0;
	ep->stat = NFS4_OK;
	ep->rpc_status = RPC_SUCCESS;
}

void
nfs4_error_init(nfs4_error_t *ep, int error)
{
	ep->error = error;
	ep->stat = NFS4_OK;
	ep->rpc_status = RPC_SUCCESS;
}
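
/*
 * Illustrative sketch (comments only; not part of the build): how the
 * pieces above combine after a failed RPC. The values assume the current
 * RPC_ definitions, e.g. RPC_TIMEDOUT maps to ETIMEDOUT in
 * try_failover_table:
 *
 *	nfs4_error_t e;
 *
 *	nfs4_error_init(&e, EIO);
 *	e.rpc_status = RPC_TIMEDOUT;
 *	if (nfs4_try_failover(&e)) {
 *		// fail over to the next server in the servinfo4_t list;
 *		// nfs4_check_remap()/nfs4_remap_file() then fix up the
 *		// filehandles against the new server
 *	}
 *
 * The self-check in try_failover() (cstat != rpc_status) is what lets
 * the client notice a renumbering of enum clnt_stat at runtime rather
 * than silently indexing the wrong row.
 */
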
#ifdef DEBUG

/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 * Use the same algorithm as for NFS v3.
 */
int
hash16(void *p, int len)
{
	int i, rem;
	uint_t *wp;
	uint_t key = 0;

	/* protect against non-word-aligned lengths */
	if ((rem = len & 3) != 0)
		len &= ~3;

	for (i = 0, wp = (uint_t *)p; i < len; i += 4, wp++) {
		key ^= (*wp >> 16) ^ *wp;
	}

	/* hash the left-over bytes at the tail */
	for (i = 0; i < rem; i++)
		key ^= *((uchar_t *)p + len + i);

	return (key & 0xffff);
}

/*
 * rnode4info - return filehandle and path information for an rnode.
 * XXX MT issues: uses a single static buffer, no locking of path.
 */
char *
rnode4info(rnode4_t *rp)
{
	static char buf[80];
	nfs4_fhandle_t fhandle;
	char *path;
	char *type;

	if (rp == NULL)
		return ("null");
	if (rp->r_flags & R4ISXATTR)
		type = "attr";
	else if (RTOV4(rp)->v_flag & V_XATTRDIR)
		type = "attrdir";
	else if (RTOV4(rp)->v_flag & VROOT)
		type = "root";
	else if (RTOV4(rp)->v_type == VDIR)
		type = "dir";
	else if (RTOV4(rp)->v_type == VREG)
		type = "file";
	else
		type = "other";
	sfh4_copyval(rp->r_fh, &fhandle);
	path = fn_path(rp->r_svnode.sv_name);
	(void) snprintf(buf, sizeof (buf),
	    "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
	    (void *)rp, path, type, rp->r_flags,
	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
	kmem_free(path, strlen(path) + 1);
	return (buf);
}
#endif
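
/*
 * Illustrative worked example (comments only; not part of the build):
 * hashing the 6-byte buffer { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66 } with
 * hash16() on a little-endian machine. len rounds down to 4 with rem = 2:
 *
 *	word pass:  *wp = 0x44332211
 *	            key = (0x44332211 >> 16) ^ 0x44332211 = 0x44336622
 *	tail pass:  key ^= 0x55  ->  0x44336677
 *	            key ^= 0x66  ->  0x44336611
 *	result:     key & 0xffff = 0x6611
 *
 * Folding the high half of each word into the low half is what reduces a
 * filehandle of arbitrary length to the 16 bits printed by rnode4info().
 */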