1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T 28 * All Rights Reserved 29 */ 30 31 /* 32 * Portions of this source code were derived from Berkeley 4.3 BSD 33 * under license from the Regents of the University of California. 34 */ 35 36 37 /* 38 * Implements a kernel based, client side RPC. 39 */ 40 41 #include <sys/param.h> 42 #include <sys/types.h> 43 #include <sys/systm.h> 44 #include <sys/sysmacros.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/ddi.h> 48 #include <sys/tiuser.h> 49 #include <sys/tihdr.h> 50 #include <sys/t_kuser.h> 51 #include <sys/errno.h> 52 #include <sys/kmem.h> 53 #include <sys/debug.h> 54 #include <sys/kstat.h> 55 #include <sys/t_lock.h> 56 #include <sys/cmn_err.h> 57 #include <sys/conf.h> 58 #include <sys/disp.h> 59 #include <sys/taskq.h> 60 #include <sys/list.h> 61 #include <sys/atomic.h> 62 #include <sys/zone.h> 63 #include <netinet/in.h> 64 #include <rpc/types.h> 65 #include <rpc/xdr.h> 66 #include <rpc/auth.h> 67 #include <rpc/clnt.h> 68 #include <rpc/rpc_msg.h> 69 70 static enum clnt_stat clnt_clts_kcallit(CLIENT *, rpcproc_t, xdrproc_t, 71 caddr_t, xdrproc_t, caddr_t, struct timeval); 72 static void clnt_clts_kabort(CLIENT *); 73 static void clnt_clts_kerror(CLIENT *, struct rpc_err *); 74 static bool_t clnt_clts_kfreeres(CLIENT *, xdrproc_t, caddr_t); 75 static bool_t clnt_clts_kcontrol(CLIENT *, int, char *); 76 static void clnt_clts_kdestroy(CLIENT *); 77 static int clnt_clts_ksettimers(CLIENT *, struct rpc_timers *, 78 struct rpc_timers *, int, void (*)(), caddr_t, uint32_t); 79 80 /* 81 * Operations vector for CLTS based RPC 82 */ 83 static struct clnt_ops clts_ops = { 84 clnt_clts_kcallit, /* do rpc call */ 85 clnt_clts_kabort, /* abort call */ 86 clnt_clts_kerror, /* return error status */ 87 clnt_clts_kfreeres, /* free results */ 88 clnt_clts_kdestroy, /* destroy rpc handle */ 89 clnt_clts_kcontrol, /* the ioctl() of rpc */ 90 clnt_clts_ksettimers /* set retry timers */ 91 }; 92 93 /* 94 * Endpoint for CLTS (INET, INET6, loopback, etc.) 
 */
typedef struct endpnt_type {
	struct endpnt_type *e_next;	/* pointer to next endpoint type */
	list_t		e_pool;		/* list of available endpoints */
	list_t		e_ilist;	/* list of idle endpoints */
	struct endpnt	*e_pcurr;	/* pointer to current endpoint */
	char		e_protofmly[KNC_STRSIZE];	/* protocol family */
	dev_t		e_rdev;		/* device */
	kmutex_t	e_plock;	/* pool lock */
	kmutex_t	e_ilock;	/* idle list lock */
	timeout_id_t	e_itimer;	/* timer to dispatch the taskq */
	uint_t		e_cnt;		/* number of endpoints in the pool */
	zoneid_t	e_zoneid;	/* zoneid of endpoint type */
	kcondvar_t	e_async_cv;	/* cv for asynchronous reap threads */
	uint_t		e_async_count;	/* count of asynchronous reap threads */
} endpnt_type_t;

typedef struct endpnt {
	list_node_t	e_node;		/* link to the pool */
	list_node_t	e_idle;		/* link to the idle list */
	endpnt_type_t	*e_type;	/* back pointer to endpoint type */
	TIUSER		*e_tiptr;	/* pointer to transport endpoint */
	queue_t		*e_wq;		/* write queue */
	uint_t		e_flags;	/* endpoint flags */
	uint_t		e_ref;		/* ref count on endpoint */
	kcondvar_t	e_cv;		/* condition variable */
	kmutex_t	e_lock;		/* protects cv and flags */
	time_t		e_itime;	/* time when rele'd */
} endpnt_t;

#define	ENDPNT_ESTABLISHED	0x1	/* endpoint is established */
#define	ENDPNT_WAITING		0x2	/* thread waiting for endpoint */
#define	ENDPNT_BOUND		0x4	/* endpoint is bound */
#define	ENDPNT_STALE		0x8	/* endpoint is dead */
#define	ENDPNT_ONIDLE		0x10	/* endpoint is on the idle list */

static krwlock_t	endpnt_type_lock;	/* protects endpnt_type_list */
static endpnt_type_t	*endpnt_type_list = NULL; /* list of CLTS endpoints */
static struct kmem_cache	*endpnt_cache;	/* cache of endpnt_t's */
static taskq_t		*endpnt_taskq;		/* endpnt_t reaper thread */
static bool_t		taskq_created;		/* flag for endpnt_taskq */
static kmutex_t		endpnt_taskq_lock;	/* taskq lock */
static zone_key_t	endpnt_destructor_key;

#define	DEFAULT_ENDPOINT_REAP_INTERVAL 60	/* 1 minute */
#define	DEFAULT_INTERVAL_SHIFT 30		/* 30 seconds */

/*
 * Endpoint tunables
 */
static int	clnt_clts_max_endpoints = -1;
static int	clnt_clts_hash_size = DEFAULT_HASH_SIZE;
static time_t	clnt_clts_endpoint_reap_interval = -1;
static clock_t	clnt_clts_taskq_dispatch_interval;

/*
 * Response completion hash queue
 */
static call_table_t *clts_call_ht;

/*
 * Routines for the endpoint manager
 */
static struct endpnt_type *endpnt_type_create(struct knetconfig *);
static void endpnt_type_free(struct endpnt_type *);
static int check_endpnt(struct endpnt *, struct endpnt **);
static struct endpnt *endpnt_get(struct knetconfig *, int);
static void endpnt_rele(struct endpnt *);
static void endpnt_reap_settimer(endpnt_type_t *);
static void endpnt_reap(endpnt_type_t *);
static void endpnt_reap_dispatch(void *);
static void endpnt_reclaim(zoneid_t);


/*
 * Request dispatching function.
 */
static int clnt_clts_dispatch_send(queue_t *q, mblk_t *, struct netbuf *addr,
				calllist_t *, uint_t);

/*
 * The size of the preserialized RPC header information.
 */
#define	CKU_HDRSIZE	20
/*
 * The initial allocation size.  It is small to reduce space requirements.
 */
#define	CKU_INITSIZE	2048
/*
 * The size of additional allocations, if required.
It is larger to 185 * reduce the number of actual allocations. 186 */ 187 #define CKU_ALLOCSIZE 8192 188 189 /* 190 * Private data per rpc handle. This structure is allocated by 191 * clnt_clts_kcreate, and freed by clnt_clts_kdestroy. 192 */ 193 struct cku_private { 194 CLIENT cku_client; /* client handle */ 195 int cku_retrys; /* request retrys */ 196 calllist_t cku_call; 197 struct endpnt *cku_endpnt; /* open end point */ 198 struct knetconfig cku_config; 199 struct netbuf cku_addr; /* remote address */ 200 struct rpc_err cku_err; /* error status */ 201 XDR cku_outxdr; /* xdr stream for output */ 202 XDR cku_inxdr; /* xdr stream for input */ 203 char cku_rpchdr[CKU_HDRSIZE + 4]; /* rpc header */ 204 struct cred *cku_cred; /* credentials */ 205 struct rpc_timers *cku_timers; /* for estimating RTT */ 206 struct rpc_timers *cku_timeall; /* for estimating RTT */ 207 void (*cku_feedback)(int, int, caddr_t); 208 /* ptr to feedback rtn */ 209 caddr_t cku_feedarg; /* argument for feedback func */ 210 uint32_t cku_xid; /* current XID */ 211 bool_t cku_bcast; /* RPC broadcast hint */ 212 int cku_useresvport; /* Use reserved port */ 213 struct rpc_clts_client *cku_stats; /* counters for the zone */ 214 }; 215 216 static const struct rpc_clts_client { 217 kstat_named_t rccalls; 218 kstat_named_t rcbadcalls; 219 kstat_named_t rcretrans; 220 kstat_named_t rcbadxids; 221 kstat_named_t rctimeouts; 222 kstat_named_t rcnewcreds; 223 kstat_named_t rcbadverfs; 224 kstat_named_t rctimers; 225 kstat_named_t rcnomem; 226 kstat_named_t rccantsend; 227 } clts_rcstat_tmpl = { 228 { "calls", KSTAT_DATA_UINT64 }, 229 { "badcalls", KSTAT_DATA_UINT64 }, 230 { "retrans", KSTAT_DATA_UINT64 }, 231 { "badxids", KSTAT_DATA_UINT64 }, 232 { "timeouts", KSTAT_DATA_UINT64 }, 233 { "newcreds", KSTAT_DATA_UINT64 }, 234 { "badverfs", KSTAT_DATA_UINT64 }, 235 { "timers", KSTAT_DATA_UINT64 }, 236 { "nomem", KSTAT_DATA_UINT64 }, 237 { "cantsend", KSTAT_DATA_UINT64 }, 238 }; 239 240 static uint_t clts_rcstat_ndata = 241 sizeof (clts_rcstat_tmpl) / sizeof (kstat_named_t); 242 243 #define RCSTAT_INCR(s, x) \ 244 atomic_add_64(&(s)->x.value.ui64, 1) 245 246 #define ptoh(p) (&((p)->cku_client)) 247 #define htop(h) ((struct cku_private *)((h)->cl_private)) 248 249 /* 250 * Times to retry 251 */ 252 #define SNDTRIES 4 253 #define REFRESHES 2 /* authentication refreshes */ 254 255 /* 256 * The following is used to determine the global default behavior for 257 * CLTS when binding to a local port. 258 * 259 * If the value is set to 1 the default will be to select a reserved 260 * (aka privileged) port, if the value is zero the default will be to 261 * use non-reserved ports. Users of kRPC may override this by using 262 * CLNT_CONTROL() and CLSET_BINDRESVPORT. 
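 *
 * For example, a kRPC consumer could request a non-reserved source port
 * for a particular handle with something like the following (illustrative
 * sketch only; "clnt" stands for a handle obtained from
 * clnt_clts_kcreate()):
 *
 *	int resv = 0;
 *
 *	if (!CLNT_CONTROL(clnt, CLSET_BINDRESVPORT, (char *)&resv))
 *		return (EINVAL);
 *
 * which overrides the global default selected below for that handle only.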
263 */ 264 static int clnt_clts_do_bindresvport = 1; 265 266 #define BINDRESVPORT_RETRIES 5 267 268 void 269 clnt_clts_stats_init(zoneid_t zoneid, struct rpc_clts_client **statsp) 270 { 271 kstat_t *ksp; 272 kstat_named_t *knp; 273 274 knp = rpcstat_zone_init_common(zoneid, "unix", "rpc_clts_client", 275 (const kstat_named_t *)&clts_rcstat_tmpl, 276 sizeof (clts_rcstat_tmpl)); 277 /* 278 * Backwards compatibility for old kstat clients 279 */ 280 ksp = kstat_create_zone("unix", 0, "rpc_client", "rpc", 281 KSTAT_TYPE_NAMED, clts_rcstat_ndata, 282 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid); 283 if (ksp) { 284 ksp->ks_data = knp; 285 kstat_install(ksp); 286 } 287 *statsp = (struct rpc_clts_client *)knp; 288 } 289 290 void 291 clnt_clts_stats_fini(zoneid_t zoneid, struct rpc_clts_client **statsp) 292 { 293 rpcstat_zone_fini_common(zoneid, "unix", "rpc_clts_client"); 294 kstat_delete_byname_zone("unix", 0, "rpc_client", zoneid); 295 kmem_free(*statsp, sizeof (clts_rcstat_tmpl)); 296 } 297 298 /* 299 * Create an rpc handle for a clts rpc connection. 300 * Allocates space for the handle structure and the private data. 301 */ 302 /* ARGSUSED */ 303 int 304 clnt_clts_kcreate(struct knetconfig *config, struct netbuf *addr, 305 rpcprog_t pgm, rpcvers_t vers, int retrys, struct cred *cred, 306 CLIENT **cl) 307 { 308 CLIENT *h; 309 struct cku_private *p; 310 struct rpc_msg call_msg; 311 int error; 312 int plen; 313 314 if (cl == NULL) 315 return (EINVAL); 316 317 *cl = NULL; 318 error = 0; 319 320 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 321 322 h = ptoh(p); 323 324 /* handle */ 325 h->cl_ops = &clts_ops; 326 h->cl_private = (caddr_t)p; 327 h->cl_auth = authkern_create(); 328 329 /* call message, just used to pre-serialize below */ 330 call_msg.rm_xid = 0; 331 call_msg.rm_direction = CALL; 332 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 333 call_msg.rm_call.cb_prog = pgm; 334 call_msg.rm_call.cb_vers = vers; 335 336 /* private */ 337 clnt_clts_kinit(h, addr, retrys, cred); 338 339 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE); 340 341 /* pre-serialize call message header */ 342 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 343 error = EINVAL; /* XXX */ 344 goto bad; 345 } 346 347 p->cku_config.knc_rdev = config->knc_rdev; 348 p->cku_config.knc_semantics = config->knc_semantics; 349 plen = strlen(config->knc_protofmly) + 1; 350 p->cku_config.knc_protofmly = kmem_alloc(plen, KM_SLEEP); 351 bcopy(config->knc_protofmly, p->cku_config.knc_protofmly, plen); 352 p->cku_useresvport = -1; /* value is has not been set */ 353 354 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL); 355 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL); 356 357 *cl = h; 358 return (0); 359 360 bad: 361 auth_destroy(h->cl_auth); 362 kmem_free(p->cku_addr.buf, addr->maxlen); 363 kmem_free(p, sizeof (struct cku_private)); 364 365 return (error); 366 } 367 368 void 369 clnt_clts_kinit(CLIENT *h, struct netbuf *addr, int retrys, cred_t *cred) 370 { 371 /* LINTED pointer alignment */ 372 struct cku_private *p = htop(h); 373 struct rpcstat *rsp; 374 375 rsp = zone_getspecific(rpcstat_zone_key, rpc_zone()); 376 ASSERT(rsp != NULL); 377 378 p->cku_retrys = retrys; 379 380 if (p->cku_addr.maxlen < addr->len) { 381 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL) 382 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 383 384 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 385 p->cku_addr.maxlen = addr->maxlen; 386 } 387 388 p->cku_addr.len = addr->len; 389 bcopy(addr->buf, 
p->cku_addr.buf, addr->len); 390 391 p->cku_cred = cred; 392 p->cku_xid = 0; 393 p->cku_timers = NULL; 394 p->cku_timeall = NULL; 395 p->cku_feedback = NULL; 396 p->cku_bcast = FALSE; 397 p->cku_call.call_xid = 0; 398 p->cku_call.call_hash = 0; 399 p->cku_call.call_notified = FALSE; 400 p->cku_call.call_next = NULL; 401 p->cku_call.call_prev = NULL; 402 p->cku_call.call_reply = NULL; 403 p->cku_call.call_wq = NULL; 404 p->cku_stats = rsp->rpc_clts_client; 405 } 406 407 /* 408 * set the timers. Return current retransmission timeout. 409 */ 410 static int 411 clnt_clts_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all, 412 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, 413 uint32_t xid) 414 { 415 /* LINTED pointer alignment */ 416 struct cku_private *p = htop(h); 417 int value; 418 419 p->cku_feedback = feedback; 420 p->cku_feedarg = arg; 421 p->cku_timers = t; 422 p->cku_timeall = all; 423 if (xid) 424 p->cku_xid = xid; 425 value = all->rt_rtxcur; 426 value += t->rt_rtxcur; 427 if (value < minimum) 428 return (minimum); 429 RCSTAT_INCR(p->cku_stats, rctimers); 430 return (value); 431 } 432 433 /* 434 * Time out back off function. tim is in HZ 435 */ 436 #define MAXTIMO (20 * hz) 437 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 438 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 439 440 #define RETRY_POLL_TIMO 30 441 442 /* 443 * Call remote procedure. 444 * Most of the work of rpc is done here. We serialize what is left 445 * of the header (some was pre-serialized in the handle), serialize 446 * the arguments, and send it off. We wait for a reply or a time out. 447 * Timeout causes an immediate return, other packet problems may cause 448 * a retry on the receive. When a good packet is received we deserialize 449 * it, and check verification. A bad reply code will cause one retry 450 * with full (longhand) credentials. 
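 *
 * As a worked example of the retransmit timing (assuming hz == 100): a
 * caller-supplied wait of 1 second becomes an initial timeout of 100
 * ticks; each unsuccessful attempt then doubles the timeout via the
 * backoff() macro above, i.e. 100, 200, 400, ... ticks, capped at
 * MAXTIMO (20 * hz), until the handle's retry count (cku_retrys) is
 * exhausted.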
451 */ 452 enum clnt_stat 453 clnt_clts_kcallit_addr(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 454 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 455 struct timeval wait, struct netbuf *sin) 456 { 457 /* LINTED pointer alignment */ 458 struct cku_private *p = htop(h); 459 XDR *xdrs; 460 int stries = p->cku_retrys; 461 int refreshes = REFRESHES; /* number of times to refresh cred */ 462 int round_trip; /* time the RPC */ 463 int error; 464 int hdrsz; 465 mblk_t *mp; 466 mblk_t *mpdup; 467 mblk_t *resp = NULL; 468 mblk_t *tmp; 469 calllist_t *call = &p->cku_call; 470 clock_t timout = 0; 471 bool_t interrupted; 472 enum clnt_stat status; 473 struct rpc_msg reply_msg; 474 enum clnt_stat re_status; 475 endpnt_t *endpt; 476 477 RCSTAT_INCR(p->cku_stats, rccalls); 478 479 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_sec: %ld\n", wait.tv_sec); 480 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_usec: %ld\n", wait.tv_usec); 481 482 timout = TIMEVAL_TO_TICK(&wait); 483 484 if (p->cku_xid == 0) { 485 p->cku_xid = alloc_xid(); 486 if (p->cku_endpnt != NULL) 487 endpnt_rele(p->cku_endpnt); 488 p->cku_endpnt = NULL; 489 } 490 call->call_zoneid = rpc_zoneid(); 491 492 mpdup = NULL; 493 call_again: 494 495 if (mpdup == NULL) { 496 497 while ((mp = allocb(CKU_INITSIZE, BPRI_LO)) == NULL) { 498 if (strwaitbuf(CKU_INITSIZE, BPRI_LO)) { 499 p->cku_err.re_status = RPC_SYSTEMERROR; 500 p->cku_err.re_errno = ENOSR; 501 goto done; 502 } 503 } 504 505 xdrs = &p->cku_outxdr; 506 xdrmblk_init(xdrs, mp, XDR_ENCODE, CKU_ALLOCSIZE); 507 508 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 509 /* 510 * Copy in the preserialized RPC header 511 * information. 512 */ 513 bcopy(p->cku_rpchdr, mp->b_rptr, CKU_HDRSIZE); 514 515 /* 516 * transaction id is the 1st thing in the output 517 * buffer. 518 */ 519 /* LINTED pointer alignment */ 520 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid; 521 522 /* Skip the preserialized stuff. */ 523 XDR_SETPOS(xdrs, CKU_HDRSIZE); 524 525 /* Serialize dynamic stuff into the output buffer. */ 526 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 527 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 528 (!(*xdr_args)(xdrs, argsp))) { 529 freemsg(mp); 530 p->cku_err.re_status = RPC_CANTENCODEARGS; 531 p->cku_err.re_errno = EIO; 532 goto done; 533 } 534 } else { 535 uint32_t *uproc = (uint32_t *) 536 &p->cku_rpchdr[CKU_HDRSIZE]; 537 IXDR_PUT_U_INT32(uproc, procnum); 538 539 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 540 XDR_SETPOS(xdrs, 0); 541 542 /* Serialize the procedure number and the arguments. */ 543 if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr, 544 CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) { 545 freemsg(mp); 546 p->cku_err.re_status = RPC_CANTENCODEARGS; 547 p->cku_err.re_errno = EIO; 548 goto done; 549 } 550 } 551 } else 552 mp = mpdup; 553 554 mpdup = dupmsg(mp); 555 if (mpdup == NULL) { 556 freemsg(mp); 557 p->cku_err.re_status = RPC_SYSTEMERROR; 558 p->cku_err.re_errno = ENOSR; 559 goto done; 560 } 561 562 /* 563 * Grab an endpnt only if the endpoint is NULL. We could be retrying 564 * the request and in this case we want to go through the same 565 * source port, so that the duplicate request cache may detect a 566 * retry. 
567 */ 568 569 if (p->cku_endpnt == NULL) 570 p->cku_endpnt = endpnt_get(&p->cku_config, p->cku_useresvport); 571 572 if (p->cku_endpnt == NULL) { 573 freemsg(mp); 574 p->cku_err.re_status = RPC_SYSTEMERROR; 575 p->cku_err.re_errno = ENOSR; 576 goto done; 577 } 578 579 round_trip = lbolt; 580 581 error = clnt_clts_dispatch_send(p->cku_endpnt->e_wq, mp, 582 &p->cku_addr, call, p->cku_xid); 583 584 if (error != 0) { 585 freemsg(mp); 586 p->cku_err.re_status = RPC_CANTSEND; 587 p->cku_err.re_errno = error; 588 RCSTAT_INCR(p->cku_stats, rccantsend); 589 goto done1; 590 } 591 592 RPCLOG(64, "clnt_clts_kcallit_addr: sent call for xid 0x%x\n", 593 p->cku_xid); 594 595 /* 596 * There are two reasons for which we go back to to tryread. 597 * 598 * a) In case the status is RPC_PROCUNAVAIL and we sent out a 599 * broadcast we should not get any invalid messages with the 600 * RPC_PROCUNAVAIL error back. Some broken RPC implementations 601 * send them and for this we have to ignore them ( as we would 602 * have never received them ) and look for another message 603 * which might contain the valid response because we don't know 604 * how many broken implementations are in the network. So we are 605 * going to loop until 606 * - we received a valid response 607 * - we have processed all invalid responses and 608 * got a time out when we try to receive again a 609 * message. 610 * 611 * b) We will jump back to tryread also in case we failed 612 * within the AUTH_VALIDATE. In this case we should move 613 * on and loop until we received a valid response or we 614 * have processed all responses with broken authentication 615 * and we got a time out when we try to receive a message. 616 */ 617 tryread: 618 mutex_enter(&call->call_lock); 619 interrupted = FALSE; 620 if (call->call_notified == FALSE) { 621 klwp_t *lwp = ttolwp(curthread); 622 clock_t cv_wait_ret = 1; /* init to > 0 */ 623 clock_t cv_timout = timout; 624 625 if (lwp != NULL) 626 lwp->lwp_nostop++; 627 628 cv_timout += lbolt; 629 630 if (h->cl_nosignal) 631 while ((cv_wait_ret = 632 cv_timedwait(&call->call_cv, 633 &call->call_lock, cv_timout)) > 0 && 634 call->call_notified == FALSE) 635 ; 636 else 637 while ((cv_wait_ret = 638 cv_timedwait_sig(&call->call_cv, 639 &call->call_lock, cv_timout)) > 0 && 640 call->call_notified == FALSE) 641 ; 642 643 if (cv_wait_ret == 0) 644 interrupted = TRUE; 645 646 if (lwp != NULL) 647 lwp->lwp_nostop--; 648 } 649 resp = call->call_reply; 650 call->call_reply = NULL; 651 status = call->call_status; 652 /* 653 * We have to reset the call_notified here. In case we have 654 * to do a retry ( e.g. in case we got a RPC_PROCUNAVAIL 655 * error ) we need to set this to false to ensure that 656 * we will wait for the next message. When the next message 657 * is going to arrive the function clnt_clts_dispatch_notify 658 * will set this to true again. 659 */ 660 call->call_notified = FALSE; 661 mutex_exit(&call->call_lock); 662 663 if (status == RPC_TIMEDOUT) { 664 if (interrupted) { 665 /* 666 * We got interrupted, bail out 667 */ 668 p->cku_err.re_status = RPC_INTR; 669 p->cku_err.re_errno = EINTR; 670 goto done1; 671 } else { 672 /* 673 * It's possible that our response arrived 674 * right after we timed out. Check to see 675 * if it has arrived before we remove the 676 * calllist from the dispatch queue. 
			 */
			mutex_enter(&call->call_lock);
			if (call->call_notified == TRUE) {
				resp = call->call_reply;
				call->call_reply = NULL;
				mutex_exit(&call->call_lock);
				RPCLOG(8, "clnt_clts_kcallit_addr: "
				    "response received for request "
				    "w/xid 0x%x after timeout\n",
				    p->cku_xid);
				goto getresponse;
			}
			mutex_exit(&call->call_lock);

			RPCLOG(8, "clnt_clts_kcallit_addr: "
			    "request w/xid 0x%x timedout "
			    "waiting for reply\n", p->cku_xid);
#if 0 /* XXX not yet */
			/*
			 * Timeout may be due to a dead gateway. Send
			 * an ioctl downstream advising deletion of
			 * route when we reach the half-way point to
			 * timing out.
			 */
			if (stries == p->cku_retrys/2) {
				t_kadvise(p->cku_endpnt->e_tiptr,
				    (uchar_t *)p->cku_addr.buf,
				    p->cku_addr.len);
			}
#endif /* not yet */
			p->cku_err.re_status = RPC_TIMEDOUT;
			p->cku_err.re_errno = ETIMEDOUT;
			RCSTAT_INCR(p->cku_stats, rctimeouts);
			goto done1;
		}
	}

getresponse:
	/*
	 * Check to see if a response arrived.  If one is present then
	 * proceed to process the response.  Otherwise fall through to
	 * retry or retransmit the request.  This is probably not the
	 * optimal thing to do, but since we are most likely dealing with
	 * an unreliable transport it is the safe thing to do.
	 */
	if (resp == NULL) {
		p->cku_err.re_status = RPC_CANTRECV;
		p->cku_err.re_errno = EIO;
		goto done1;
	}

	/*
	 * Prepare the message for further processing.  We need to remove
	 * the datagram header and copy the source address if necessary.  No
	 * need to verify the header since rpcmod took care of that.
	 */
	/*
	 * Copy the source address if the caller has supplied a netbuf.
	 */
	if (sin != NULL) {
		union T_primitives *pptr;

		pptr = (union T_primitives *)resp->b_rptr;
		bcopy(resp->b_rptr + pptr->unitdata_ind.SRC_offset, sin->buf,
		    pptr->unitdata_ind.SRC_length);
		sin->len = pptr->unitdata_ind.SRC_length;
	}

	/*
	 * Pop off the datagram header.
	 */
	hdrsz = resp->b_wptr - resp->b_rptr;
	if ((resp->b_wptr - (resp->b_rptr + hdrsz)) == 0) {
		tmp = resp;
		resp = resp->b_cont;
		tmp->b_cont = NULL;
		freeb(tmp);
	} else {
		unsigned char *ud_off = resp->b_rptr;
		resp->b_rptr += hdrsz;
		tmp = dupb(resp);
		if (tmp == NULL) {
			p->cku_err.re_status = RPC_SYSTEMERROR;
			p->cku_err.re_errno = ENOSR;
			freemsg(resp);
			goto done1;
		}
		tmp->b_cont = resp->b_cont;
		resp->b_rptr = ud_off;
		freeb(resp);
		resp = tmp;
	}

	round_trip = lbolt - round_trip;
	/*
	 * Van Jacobson timer algorithm here, only if NOT a retransmission.
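	 *
	 * The scaled, fixed-point update below keeps rt_srtt at 8 times the
	 * smoothed round-trip time and rt_deviate at 4 times the mean
	 * deviation, so the final
	 *
	 *	rt_rtxcur = ((rt_srtt >> 2) + rt_deviate) >> 1
	 *
	 * works out to roughly SRTT + 2 * deviation in unscaled ticks, the
	 * classic Van Jacobson retransmit-timeout estimate.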
774 */ 775 if (p->cku_timers != NULL && stries == p->cku_retrys) { 776 int rt; 777 778 rt = round_trip; 779 rt -= (p->cku_timers->rt_srtt >> 3); 780 p->cku_timers->rt_srtt += rt; 781 if (rt < 0) 782 rt = - rt; 783 rt -= (p->cku_timers->rt_deviate >> 2); 784 p->cku_timers->rt_deviate += rt; 785 p->cku_timers->rt_rtxcur = 786 (clock_t)((p->cku_timers->rt_srtt >> 2) + 787 p->cku_timers->rt_deviate) >> 1; 788 789 rt = round_trip; 790 rt -= (p->cku_timeall->rt_srtt >> 3); 791 p->cku_timeall->rt_srtt += rt; 792 if (rt < 0) 793 rt = - rt; 794 rt -= (p->cku_timeall->rt_deviate >> 2); 795 p->cku_timeall->rt_deviate += rt; 796 p->cku_timeall->rt_rtxcur = 797 (clock_t)((p->cku_timeall->rt_srtt >> 2) + 798 p->cku_timeall->rt_deviate) >> 1; 799 if (p->cku_feedback != NULL) { 800 (*p->cku_feedback)(FEEDBACK_OK, procnum, 801 p->cku_feedarg); 802 } 803 } 804 805 /* 806 * Process reply 807 */ 808 xdrs = &(p->cku_inxdr); 809 xdrmblk_init(xdrs, resp, XDR_DECODE, 0); 810 811 reply_msg.rm_direction = REPLY; 812 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; 813 reply_msg.acpted_rply.ar_stat = SUCCESS; 814 reply_msg.acpted_rply.ar_verf = _null_auth; 815 /* 816 * xdr_results will be done in AUTH_UNWRAP. 817 */ 818 reply_msg.acpted_rply.ar_results.where = NULL; 819 reply_msg.acpted_rply.ar_results.proc = xdr_void; 820 821 /* 822 * Decode and validate the response. 823 */ 824 if (!xdr_replymsg(xdrs, &reply_msg)) { 825 p->cku_err.re_status = RPC_CANTDECODERES; 826 p->cku_err.re_errno = EIO; 827 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 828 goto done1; 829 } 830 831 _seterr_reply(&reply_msg, &(p->cku_err)); 832 833 re_status = p->cku_err.re_status; 834 if (re_status == RPC_SUCCESS) { 835 /* 836 * Reply is good, check auth. 837 */ 838 if (!AUTH_VALIDATE(h->cl_auth, 839 &reply_msg.acpted_rply.ar_verf)) { 840 p->cku_err.re_status = RPC_AUTHERROR; 841 p->cku_err.re_why = AUTH_INVALIDRESP; 842 RCSTAT_INCR(p->cku_stats, rcbadverfs); 843 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 844 goto tryread; 845 } 846 if (!AUTH_UNWRAP(h->cl_auth, xdrs, xdr_results, resultsp)) { 847 p->cku_err.re_status = RPC_CANTDECODERES; 848 p->cku_err.re_errno = EIO; 849 } 850 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 851 goto done1; 852 } 853 /* set errno in case we can't recover */ 854 if (re_status != RPC_VERSMISMATCH && 855 re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH) 856 p->cku_err.re_errno = EIO; 857 /* 858 * Determine whether or not we're doing an RPC 859 * broadcast. Some server implementations don't 860 * follow RFC 1050, section 7.4.2 in that they 861 * don't remain silent when they see a proc 862 * they don't support. Therefore we keep trying 863 * to receive on RPC_PROCUNAVAIL, hoping to get 864 * a valid response from a compliant server. 865 */ 866 if (re_status == RPC_PROCUNAVAIL && p->cku_bcast) { 867 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 868 goto tryread; 869 } 870 if (re_status == RPC_AUTHERROR) { 871 /* 872 * Maybe our credential need to be refreshed 873 */ 874 if (refreshes > 0 && 875 AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) { 876 /* 877 * The credential is refreshed. Try the request again. 878 * Even if stries == 0, we still retry as long as 879 * refreshes > 0. This prevents a soft authentication 880 * error turning into a hard one at an upper level. 
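			 *
			 * With REFRESHES defined as 2 above, a request is
			 * re-sent with refreshed credentials at most twice,
			 * regardless of how many transmission retries
			 * (stries) remain.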
881 */ 882 refreshes--; 883 RCSTAT_INCR(p->cku_stats, rcbadcalls); 884 RCSTAT_INCR(p->cku_stats, rcnewcreds); 885 886 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 887 freemsg(mpdup); 888 call_table_remove(call); 889 mutex_enter(&call->call_lock); 890 if (call->call_reply != NULL) { 891 freemsg(call->call_reply); 892 call->call_reply = NULL; 893 } 894 mutex_exit(&call->call_lock); 895 896 freemsg(resp); 897 mpdup = NULL; 898 goto call_again; 899 } 900 /* 901 * We have used the client handle to do an AUTH_REFRESH 902 * and the RPC status may be set to RPC_SUCCESS; 903 * Let's make sure to set it to RPC_AUTHERROR. 904 */ 905 p->cku_err.re_status = RPC_CANTDECODERES; 906 907 /* 908 * Map recoverable and unrecoverable 909 * authentication errors to appropriate errno 910 */ 911 switch (p->cku_err.re_why) { 912 case AUTH_TOOWEAK: 913 /* 914 * Could be an nfsportmon failure, set 915 * useresvport and try again. 916 */ 917 if (p->cku_useresvport != 1) { 918 p->cku_useresvport = 1; 919 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 920 freemsg(mpdup); 921 922 call_table_remove(call); 923 mutex_enter(&call->call_lock); 924 if (call->call_reply != NULL) { 925 freemsg(call->call_reply); 926 call->call_reply = NULL; 927 } 928 mutex_exit(&call->call_lock); 929 930 freemsg(resp); 931 mpdup = NULL; 932 endpt = p->cku_endpnt; 933 if (endpt->e_tiptr != NULL) { 934 mutex_enter(&endpt->e_lock); 935 endpt->e_flags &= ~ENDPNT_BOUND; 936 (void) t_kclose(endpt->e_tiptr, 1); 937 endpt->e_tiptr = NULL; 938 mutex_exit(&endpt->e_lock); 939 940 } 941 942 p->cku_xid = alloc_xid(); 943 endpnt_rele(p->cku_endpnt); 944 p->cku_endpnt = NULL; 945 goto call_again; 946 } 947 /* FALLTHRU */ 948 case AUTH_BADCRED: 949 case AUTH_BADVERF: 950 case AUTH_INVALIDRESP: 951 case AUTH_FAILED: 952 case RPCSEC_GSS_NOCRED: 953 case RPCSEC_GSS_FAILED: 954 p->cku_err.re_errno = EACCES; 955 break; 956 case AUTH_REJECTEDCRED: 957 case AUTH_REJECTEDVERF: 958 default: 959 p->cku_err.re_errno = EIO; 960 break; 961 } 962 RPCLOG(1, "clnt_clts_kcallit : authentication failed " 963 "with RPC_AUTHERROR of type %d\n", 964 p->cku_err.re_why); 965 } 966 967 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 968 969 done1: 970 call_table_remove(call); 971 mutex_enter(&call->call_lock); 972 if (call->call_reply != NULL) { 973 freemsg(call->call_reply); 974 call->call_reply = NULL; 975 } 976 mutex_exit(&call->call_lock); 977 RPCLOG(64, "clnt_clts_kcallit_addr: xid 0x%x taken off dispatch list", 978 p->cku_xid); 979 980 done: 981 if (resp != NULL) { 982 freemsg(resp); 983 resp = NULL; 984 } 985 986 if ((p->cku_err.re_status != RPC_SUCCESS) && 987 (p->cku_err.re_status != RPC_INTR) && 988 (p->cku_err.re_status != RPC_UDERROR) && 989 !IS_UNRECOVERABLE_RPC(p->cku_err.re_status)) { 990 if (p->cku_feedback != NULL && stries == p->cku_retrys) { 991 (*p->cku_feedback)(FEEDBACK_REXMIT1, procnum, 992 p->cku_feedarg); 993 } 994 995 timout = backoff(timout); 996 if (p->cku_timeall != (struct rpc_timers *)0) 997 p->cku_timeall->rt_rtxcur = timout; 998 999 if (p->cku_err.re_status == RPC_SYSTEMERROR || 1000 p->cku_err.re_status == RPC_CANTSEND) { 1001 /* 1002 * Errors due to lack of resources, wait a bit 1003 * and try again. 
1004 */ 1005 (void) delay(hz/10); 1006 /* (void) sleep((caddr_t)&lbolt, PZERO-4); */ 1007 } 1008 if (stries-- > 0) { 1009 RCSTAT_INCR(p->cku_stats, rcretrans); 1010 goto call_again; 1011 } 1012 } 1013 1014 if (mpdup != NULL) 1015 freemsg(mpdup); 1016 1017 if (p->cku_err.re_status != RPC_SUCCESS) { 1018 RCSTAT_INCR(p->cku_stats, rcbadcalls); 1019 } 1020 1021 /* 1022 * Allow the endpoint to be held by the client handle in case this 1023 * RPC was not successful. A retry may occur at a higher level and 1024 * in this case we may want to send the request over the same 1025 * source port. 1026 */ 1027 if (p->cku_err.re_status == RPC_SUCCESS && p->cku_endpnt != NULL) { 1028 endpnt_rele(p->cku_endpnt); 1029 p->cku_endpnt = NULL; 1030 } 1031 1032 return (p->cku_err.re_status); 1033 } 1034 1035 static enum clnt_stat 1036 clnt_clts_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 1037 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 1038 struct timeval wait) 1039 { 1040 return (clnt_clts_kcallit_addr(h, procnum, xdr_args, argsp, 1041 xdr_results, resultsp, wait, NULL)); 1042 } 1043 1044 /* 1045 * Return error info on this handle. 1046 */ 1047 static void 1048 clnt_clts_kerror(CLIENT *h, struct rpc_err *err) 1049 { 1050 /* LINTED pointer alignment */ 1051 struct cku_private *p = htop(h); 1052 1053 *err = p->cku_err; 1054 } 1055 1056 static bool_t 1057 clnt_clts_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 1058 { 1059 /* LINTED pointer alignment */ 1060 struct cku_private *p = htop(h); 1061 XDR *xdrs; 1062 1063 xdrs = &(p->cku_outxdr); 1064 xdrs->x_op = XDR_FREE; 1065 return ((*xdr_res)(xdrs, res_ptr)); 1066 } 1067 1068 /*ARGSUSED*/ 1069 static void 1070 clnt_clts_kabort(CLIENT *h) 1071 { 1072 } 1073 1074 static bool_t 1075 clnt_clts_kcontrol(CLIENT *h, int cmd, char *arg) 1076 { 1077 /* LINTED pointer alignment */ 1078 struct cku_private *p = htop(h); 1079 1080 switch (cmd) { 1081 case CLSET_XID: 1082 p->cku_xid = *((uint32_t *)arg); 1083 return (TRUE); 1084 1085 case CLGET_XID: 1086 *((uint32_t *)arg) = p->cku_xid; 1087 return (TRUE); 1088 1089 case CLSET_BCAST: 1090 p->cku_bcast = *((uint32_t *)arg); 1091 return (TRUE); 1092 1093 case CLGET_BCAST: 1094 *((uint32_t *)arg) = p->cku_bcast; 1095 return (TRUE); 1096 case CLSET_BINDRESVPORT: 1097 if (arg == NULL) 1098 return (FALSE); 1099 1100 if (*(int *)arg != 1 && *(int *)arg != 0) 1101 return (FALSE); 1102 1103 p->cku_useresvport = *(int *)arg; 1104 1105 return (TRUE); 1106 1107 case CLGET_BINDRESVPORT: 1108 if (arg == NULL) 1109 return (FALSE); 1110 1111 *(int *)arg = p->cku_useresvport; 1112 1113 return (TRUE); 1114 1115 default: 1116 return (FALSE); 1117 } 1118 } 1119 1120 /* 1121 * Destroy rpc handle. 1122 * Frees the space used for output buffer, private data, and handle 1123 * structure, and the file pointer/TLI data on last reference. 
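 *
 * The authentication handle hanging off cl_auth is not destroyed here,
 * so a typical consumer lifecycle looks roughly like the following
 * (illustrative sketch only, error handling omitted):
 *
 *	CLIENT *clnt;
 *
 *	error = clnt_clts_kcreate(&config, &addr, prog, vers, retries,
 *	    cred, &clnt);
 *	status = CLNT_CALL(clnt, proc, xdr_args, (caddr_t)&args,
 *	    xdr_res, (caddr_t)&res, wait);
 *	...
 *	AUTH_DESTROY(clnt->cl_auth);
 *	CLNT_DESTROY(clnt);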
1124 */ 1125 static void 1126 clnt_clts_kdestroy(CLIENT *h) 1127 { 1128 /* LINTED pointer alignment */ 1129 struct cku_private *p = htop(h); 1130 calllist_t *call = &p->cku_call; 1131 1132 int plen; 1133 1134 RPCLOG(8, "clnt_clts_kdestroy h: %p\n", (void *)h); 1135 RPCLOG(8, "clnt_clts_kdestroy h: xid=0x%x\n", p->cku_xid); 1136 1137 if (p->cku_endpnt != NULL) 1138 endpnt_rele(p->cku_endpnt); 1139 1140 cv_destroy(&call->call_cv); 1141 mutex_destroy(&call->call_lock); 1142 1143 plen = strlen(p->cku_config.knc_protofmly) + 1; 1144 kmem_free(p->cku_config.knc_protofmly, plen); 1145 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 1146 kmem_free(p, sizeof (*p)); 1147 } 1148 1149 /* 1150 * The connectionless (CLTS) kRPC endpoint management subsystem. 1151 * 1152 * Because endpoints are potentially shared among threads making RPC calls, 1153 * they are managed in a pool according to type (endpnt_type_t). Each 1154 * endpnt_type_t points to a list of usable endpoints through the e_pool 1155 * field, which is of type list_t. list_t is a doubly-linked list. 1156 * The number of endpoints in the pool is stored in the e_cnt field of 1157 * endpnt_type_t and the endpoints are reference counted using the e_ref field 1158 * in the endpnt_t structure. 1159 * 1160 * As an optimization, endpoints that have no references are also linked 1161 * to an idle list via e_ilist which is also of type list_t. When a thread 1162 * calls endpnt_get() to obtain a transport endpoint, the idle list is first 1163 * consulted and if such an endpoint exists, it is removed from the idle list 1164 * and returned to the caller. 1165 * 1166 * If the idle list is empty, then a check is made to see if more endpoints 1167 * can be created. If so, we proceed and create a new endpoint which is added 1168 * to the pool and returned to the caller. If we have reached the limit and 1169 * cannot make a new endpoint then one is returned to the caller via round- 1170 * robin policy. 1171 * 1172 * When an endpoint is placed on the idle list by a thread calling 1173 * endpnt_rele(), it is timestamped and then a reaper taskq is scheduled to 1174 * be dispatched if one hasn't already been. When the timer fires, the 1175 * taskq traverses the idle list and checks to see which endpoints are 1176 * eligible to be closed. It determines this by checking if the timestamp 1177 * when the endpoint was released has exceeded the the threshold for how long 1178 * it should stay alive. 1179 * 1180 * endpnt_t structures remain persistent until the memory reclaim callback, 1181 * endpnt_reclaim(), is invoked. 1182 * 1183 * Here is an example of how the data structures would be laid out by the 1184 * subsystem: 1185 * 1186 * endpnt_type_t 1187 * 1188 * loopback inet 1189 * _______________ ______________ 1190 * | e_next |----------------------->| e_next |---->> 1191 * | e_pool |<---+ | e_pool |<----+ 1192 * | e_ilist |<---+--+ | e_ilist |<----+--+ 1193 * +->| e_pcurr |----+--+--+ +->| e_pcurr |-----+--+--+ 1194 * | | ... | | | | | | ... | | | | 1195 * | | e_itimer (90) | | | | | | e_itimer (0) | | | | 1196 * | | e_cnt (1) | | | | | | e_cnt (3) | | | | 1197 * | +---------------+ | | | | +--------------+ | | | 1198 * | | | | | | | | 1199 * | endpnt_t | | | | | | | 1200 * | ____________ | | | | ____________ | | | 1201 * | | e_node |<------+ | | | | e_node |<------+ | | 1202 * | | e_idle |<---------+ | | | e_idle | | | | 1203 * +--| e_type |<------------+ +--| e_type | | | | 1204 * | e_tiptr | | | e_tiptr | | | | 1205 * | ... | | | ... 
| | | | 1206 * | e_lock | | | e_lock | | | | 1207 * | ... | | | ... | | | | 1208 * | e_ref (0) | | | e_ref (2) | | | | 1209 * | e_itime | | | e_itime | | | | 1210 * +------------+ | +------------+ | | | 1211 * | | | | 1212 * | | | | 1213 * | ____________ | | | 1214 * | | e_node |<------+ | | 1215 * | | e_idle |<------+--+ | 1216 * +--| e_type | | | 1217 * | | e_tiptr | | | 1218 * | | ... | | | 1219 * | | e_lock | | | 1220 * | | ... | | | 1221 * | | e_ref (0) | | | 1222 * | | e_itime | | | 1223 * | +------------+ | | 1224 * | | | 1225 * | | | 1226 * | ____________ | | 1227 * | | e_node |<------+ | 1228 * | | e_idle | | 1229 * +--| e_type |<------------+ 1230 * | e_tiptr | 1231 * | ... | 1232 * | e_lock | 1233 * | ... | 1234 * | e_ref (1) | 1235 * | e_itime | 1236 * +------------+ 1237 * 1238 * Endpoint locking strategy: 1239 * 1240 * The following functions manipulate lists which hold the endpoint and the 1241 * endpoints themselves: 1242 * 1243 * endpnt_get()/check_endpnt()/endpnt_rele()/endpnt_reap()/do_endpnt_reclaim() 1244 * 1245 * Lock description follows: 1246 * 1247 * endpnt_type_lock: Global reader/writer lock which protects accesses to the 1248 * endpnt_type_list. 1249 * 1250 * e_plock: Lock defined in the endpnt_type_t. It is intended to 1251 * protect accesses to the pool of endopints (e_pool) for a given 1252 * endpnt_type_t. 1253 * 1254 * e_ilock: Lock defined in endpnt_type_t. It is intended to protect accesses 1255 * to the idle list (e_ilist) of available endpoints for a given 1256 * endpnt_type_t. It also protects access to the e_itimer, e_async_cv, 1257 * and e_async_count fields in endpnt_type_t. 1258 * 1259 * e_lock: Lock defined in the endpnt structure. It is intended to protect 1260 * flags, cv, and ref count. 1261 * 1262 * The order goes as follows so as not to induce deadlock. 1263 * 1264 * endpnt_type_lock -> e_plock -> e_ilock -> e_lock 1265 * 1266 * Interaction with Zones and shutting down: 1267 * 1268 * endpnt_type_ts are uniquely identified by the (e_zoneid, e_rdev, e_protofmly) 1269 * tuple, which means that a zone may not reuse another zone's idle endpoints 1270 * without first doing a t_kclose(). 1271 * 1272 * A zone's endpnt_type_ts are destroyed when a zone is shut down; e_async_cv 1273 * and e_async_count are used to keep track of the threads in endpnt_taskq 1274 * trying to reap endpnt_ts in the endpnt_type_t. 1275 */ 1276 1277 /* 1278 * Allocate and initialize an endpnt_type_t 1279 */ 1280 static struct endpnt_type * 1281 endpnt_type_create(struct knetconfig *config) 1282 { 1283 struct endpnt_type *etype; 1284 1285 /* 1286 * Allocate a new endpoint type to hang a list of 1287 * endpoints off of it. 
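 *
 * As an aside here, the lock ordering documented above
 * (endpnt_type_lock -> e_plock -> e_ilock -> e_lock) governs the routines
 * that follow.  An illustrative, uncompiled sketch of a traversal that
 * respects that ordering:
 *
 *	rw_enter(&endpnt_type_lock, RW_READER);
 *	for (np = endpnt_type_list; np != NULL; np = np->e_next) {
 *		mutex_enter(&np->e_plock);
 *		mutex_enter(&np->e_ilock);
 *		for (e = list_head(&np->e_ilist); e != NULL;
 *		    e = list_next(&np->e_ilist, e)) {
 *			mutex_enter(&e->e_lock);
 *			... examine e_flags, e_ref, e_itime ...
 *			mutex_exit(&e->e_lock);
 *		}
 *		mutex_exit(&np->e_ilock);
 *		mutex_exit(&np->e_plock);
 *	}
 *	rw_exit(&endpnt_type_lock);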
	 */
	etype = kmem_alloc(sizeof (struct endpnt_type), KM_SLEEP);
	etype->e_next = NULL;
	etype->e_pcurr = NULL;
	etype->e_itimer = 0;
	etype->e_cnt = 0;

	(void) strncpy(etype->e_protofmly, config->knc_protofmly, KNC_STRSIZE);
	mutex_init(&etype->e_plock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&etype->e_ilock, NULL, MUTEX_DEFAULT, NULL);
	etype->e_rdev = config->knc_rdev;
	etype->e_zoneid = rpc_zoneid();
	etype->e_async_count = 0;
	cv_init(&etype->e_async_cv, NULL, CV_DEFAULT, NULL);

	list_create(&etype->e_pool, sizeof (endpnt_t),
	    offsetof(endpnt_t, e_node));
	list_create(&etype->e_ilist, sizeof (endpnt_t),
	    offsetof(endpnt_t, e_idle));

	/*
	 * Check to see if we need to create a taskq for endpoint
	 * reaping
	 */
	mutex_enter(&endpnt_taskq_lock);
	if (taskq_created == FALSE) {
		taskq_created = TRUE;
		mutex_exit(&endpnt_taskq_lock);
		ASSERT(endpnt_taskq == NULL);
		endpnt_taskq = taskq_create("clts_endpnt_taskq", 1,
		    minclsyspri, 200, INT_MAX, 0);
	} else
		mutex_exit(&endpnt_taskq_lock);

	return (etype);
}

/*
 * Free an endpnt_type_t
 */
static void
endpnt_type_free(struct endpnt_type *etype)
{
	mutex_destroy(&etype->e_plock);
	mutex_destroy(&etype->e_ilock);
	list_destroy(&etype->e_pool);
	list_destroy(&etype->e_ilist);
	kmem_free(etype, sizeof (endpnt_type_t));
}

/*
 * Check the endpoint to ensure that it is suitable for use.
 *
 * Possible return values:
 *
 * return (1) - Endpoint is established, but needs to be re-opened.
 * return (0) && *newp == NULL - Endpoint is established, but unusable.
 * return (0) && *newp != NULL - Endpoint is established and usable.
 */
static int
check_endpnt(struct endpnt *endp, struct endpnt **newp)
{
	*newp = endp;

	mutex_enter(&endp->e_lock);
	ASSERT(endp->e_ref >= 1);

	/*
	 * The first condition we check for is if the endpoint has been
	 * allocated, but is unusable either because it has been closed or
	 * has been marked stale.  Only *one* thread will be allowed to
	 * execute the then clause.  This is enforced because the first
	 * thread to check this condition will clear the flags, so that
	 * subsequent thread(s) checking this endpoint will move on.
	 */
	if ((endp->e_flags & ENDPNT_ESTABLISHED) &&
	    (!(endp->e_flags & ENDPNT_BOUND) ||
	    (endp->e_flags & ENDPNT_STALE))) {
		/*
		 * Clear the flags here since they will be
		 * set again by this thread.  They need to be
		 * individually cleared because we want to maintain
		 * the state for ENDPNT_ONIDLE.
		 */
		endp->e_flags &= ~(ENDPNT_ESTABLISHED |
		    ENDPNT_WAITING | ENDPNT_BOUND | ENDPNT_STALE);
		mutex_exit(&endp->e_lock);
		return (1);
	}

	/*
	 * The second condition is meant for any thread that is waiting for
	 * an endpoint to become established.  It will cv_wait() until
	 * the condition for the endpoint has been changed to ENDPNT_BOUND or
	 * ENDPNT_STALE.
	 */
	while (!(endp->e_flags & ENDPNT_BOUND) &&
	    !(endp->e_flags & ENDPNT_STALE)) {
		endp->e_flags |= ENDPNT_WAITING;
		cv_wait(&endp->e_cv, &endp->e_lock);
	}

	ASSERT(endp->e_flags & ENDPNT_ESTABLISHED);

	/*
	 * The last case we check for is if the endpoint has been marked stale.
	 * If this is the case then set *newp to NULL and return, so that the
	 * caller is notified of the error and can take appropriate action.
	 */
	if (endp->e_flags & ENDPNT_STALE) {
		endp->e_ref--;
		*newp = NULL;
	}
	mutex_exit(&endp->e_lock);
	return (0);
}

#ifdef DEBUG
/*
 * Provide a fault injection setting to test error conditions.
 */
static int endpnt_get_return_null = 0;
#endif

/*
 * Returns a handle (struct endpnt *) to an open and bound endpoint
 * specified by the knetconfig passed in.  Returns NULL if no valid endpoint
 * can be obtained.
 */
static struct endpnt *
endpnt_get(struct knetconfig *config, int useresvport)
{
	struct endpnt_type	*n_etype = NULL;
	struct endpnt_type	*np = NULL;
	struct endpnt		*new = NULL;
	struct endpnt		*endp = NULL;
	struct endpnt		*next = NULL;
	TIUSER			*tiptr = NULL;
	int			rtries = BINDRESVPORT_RETRIES;
	int			i = 0;
	int			error;
	int			retval;
	zoneid_t		zoneid = rpc_zoneid();
	cred_t			*cr;

	RPCLOG(1, "endpnt_get: protofmly %s, ", config->knc_protofmly);
	RPCLOG(1, "rdev %ld\n", config->knc_rdev);

#ifdef DEBUG
	/*
	 * Inject fault if desired.  Pretend we have a stale endpoint
	 * and return NULL.
	 */
	if (endpnt_get_return_null > 0) {
		endpnt_get_return_null--;
		return (NULL);
	}
#endif
	rw_enter(&endpnt_type_lock, RW_READER);

top:
	for (np = endpnt_type_list; np != NULL; np = np->e_next)
		if ((np->e_zoneid == zoneid) &&
		    (np->e_rdev == config->knc_rdev) &&
		    (strcmp(np->e_protofmly,
		    config->knc_protofmly) == 0))
			break;

	if (np == NULL && n_etype != NULL) {
		ASSERT(rw_write_held(&endpnt_type_lock));

		/*
		 * Link the endpoint type onto the list
		 */
		n_etype->e_next = endpnt_type_list;
		endpnt_type_list = n_etype;
		np = n_etype;
		n_etype = NULL;
	}

	if (np == NULL) {
		/*
		 * The logic here is that we were unable to find an
		 * endpnt_type_t that matched our criteria, so we allocate a
		 * new one.  Because kmem_alloc() needs to be called with
		 * KM_SLEEP, we drop our locks so that we don't induce
		 * deadlock.  After allocating and initializing the
		 * endpnt_type_t, we reacquire the lock and go back to check
		 * if this entry needs to be added to the list.  Since we do
		 * some operations without any locking other threads may
		 * have been looking for the same endpnt_type_t and gone
		 * through this code path.  We check for this case and allow
		 * one thread to link its endpnt_type_t to the list and the
		 * other threads will simply free theirs.
		 */
		rw_exit(&endpnt_type_lock);
		n_etype = endpnt_type_create(config);

		/*
		 * We need to reacquire the lock with RW_WRITER here so that
		 * we can safely link the new endpoint type onto the list.
		 */
		rw_enter(&endpnt_type_lock, RW_WRITER);
		goto top;
	}

	rw_exit(&endpnt_type_lock);
	/*
	 * If n_etype is not NULL, then another thread was able to
	 * insert an endpnt_type_t of this type onto the list before
	 * we did.  Go ahead and free ours.
	 */
	if (n_etype != NULL)
		endpnt_type_free(n_etype);

	mutex_enter(&np->e_ilock);
	/*
	 * The algorithm to hand out endpoints is to first
	 * give out those that are idle if such endpoints
	 * exist.
Otherwise, create a new one if we haven't 1508 * reached the max threshold. Finally, we give out 1509 * endpoints in a pseudo LRU fashion (round-robin). 1510 * 1511 * Note: The idle list is merely a hint of those endpoints 1512 * that should be idle. There exists a window after the 1513 * endpoint is released and before it is linked back onto the 1514 * idle list where a thread could get a reference to it and 1515 * use it. This is okay, since the reference counts will 1516 * still be consistent. 1517 */ 1518 if ((endp = (endpnt_t *)list_head(&np->e_ilist)) != NULL) { 1519 timeout_id_t t_id = 0; 1520 1521 mutex_enter(&endp->e_lock); 1522 endp->e_ref++; 1523 endp->e_itime = 0; 1524 endp->e_flags &= ~ENDPNT_ONIDLE; 1525 mutex_exit(&endp->e_lock); 1526 1527 /* 1528 * Pop the endpoint off the idle list and hand it off 1529 */ 1530 list_remove(&np->e_ilist, endp); 1531 1532 if (np->e_itimer != 0) { 1533 t_id = np->e_itimer; 1534 np->e_itimer = 0; 1535 } 1536 mutex_exit(&np->e_ilock); 1537 /* 1538 * Reset the idle timer if it has been set 1539 */ 1540 if (t_id != (timeout_id_t)0) 1541 (void) untimeout(t_id); 1542 1543 if (check_endpnt(endp, &new) == 0) 1544 return (new); 1545 } else if (np->e_cnt >= clnt_clts_max_endpoints) { 1546 /* 1547 * There are no idle endpoints currently, so 1548 * create a new one if we have not reached the maximum or 1549 * hand one out in round-robin. 1550 */ 1551 mutex_exit(&np->e_ilock); 1552 mutex_enter(&np->e_plock); 1553 endp = np->e_pcurr; 1554 mutex_enter(&endp->e_lock); 1555 endp->e_ref++; 1556 mutex_exit(&endp->e_lock); 1557 1558 ASSERT(endp != NULL); 1559 /* 1560 * Advance the pointer to the next eligible endpoint, if 1561 * necessary. 1562 */ 1563 if (np->e_cnt > 1) { 1564 next = (endpnt_t *)list_next(&np->e_pool, np->e_pcurr); 1565 if (next == NULL) 1566 next = (endpnt_t *)list_head(&np->e_pool); 1567 np->e_pcurr = next; 1568 } 1569 1570 mutex_exit(&np->e_plock); 1571 1572 /* 1573 * We need to check to see if this endpoint is bound or 1574 * not. If it is in progress then just wait until 1575 * the set up is complete 1576 */ 1577 if (check_endpnt(endp, &new) == 0) 1578 return (new); 1579 } else { 1580 mutex_exit(&np->e_ilock); 1581 mutex_enter(&np->e_plock); 1582 1583 /* 1584 * Allocate a new endpoint to use. If we can't allocate any 1585 * more memory then use one that is already established if any 1586 * such endpoints exist. 1587 */ 1588 new = kmem_cache_alloc(endpnt_cache, KM_NOSLEEP); 1589 if (new == NULL) { 1590 RPCLOG0(1, "endpnt_get: kmem_cache_alloc failed\n"); 1591 /* 1592 * Try to recover by using an existing endpoint. 1593 */ 1594 if (np->e_cnt <= 0) { 1595 mutex_exit(&np->e_plock); 1596 return (NULL); 1597 } 1598 endp = np->e_pcurr; 1599 if ((next = list_next(&np->e_pool, np->e_pcurr)) != 1600 NULL) 1601 np->e_pcurr = next; 1602 ASSERT(endp != NULL); 1603 mutex_enter(&endp->e_lock); 1604 endp->e_ref++; 1605 mutex_exit(&endp->e_lock); 1606 mutex_exit(&np->e_plock); 1607 1608 if (check_endpnt(endp, &new) == 0) 1609 return (new); 1610 } else { 1611 /* 1612 * Partially init an endpoint structure and put 1613 * it on the list, so that other interested threads 1614 * know that one is being created 1615 */ 1616 bzero(new, sizeof (struct endpnt)); 1617 1618 cv_init(&new->e_cv, NULL, CV_DEFAULT, NULL); 1619 mutex_init(&new->e_lock, NULL, MUTEX_DEFAULT, NULL); 1620 new->e_ref = 1; 1621 new->e_type = np; 1622 1623 /* 1624 * Link the endpoint into the pool. 
1625 */ 1626 list_insert_head(&np->e_pool, new); 1627 np->e_cnt++; 1628 if (np->e_pcurr == NULL) 1629 np->e_pcurr = new; 1630 mutex_exit(&np->e_plock); 1631 } 1632 } 1633 1634 /* 1635 * The transport should be opened with sufficient privs 1636 */ 1637 cr = zone_kcred(); 1638 error = t_kopen(NULL, config->knc_rdev, FREAD|FWRITE|FNDELAY, &tiptr, 1639 cr); 1640 if (error) { 1641 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1642 goto bad; 1643 } 1644 1645 new->e_tiptr = tiptr; 1646 rpc_poptimod(tiptr->fp->f_vnode); 1647 1648 /* 1649 * Allow the kernel to push the module on behalf of the user. 1650 */ 1651 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0, 1652 K_TO_K, cr, &retval); 1653 if (error) { 1654 RPCLOG(1, "endpnt_get: kstr_push on rpcmod failed %d\n", error); 1655 goto bad; 1656 } 1657 1658 error = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K, 1659 cr, &retval); 1660 if (error) { 1661 RPCLOG(1, "endpnt_get: strioctl failed %d\n", error); 1662 goto bad; 1663 } 1664 1665 /* 1666 * Connectionless data flow should bypass the stream head. 1667 */ 1668 new->e_wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next; 1669 1670 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0, 1671 K_TO_K, cr, &retval); 1672 if (error) { 1673 RPCLOG(1, "endpnt_get: kstr_push on timod failed %d\n", error); 1674 goto bad; 1675 } 1676 1677 /* 1678 * Attempt to bind the endpoint. If we fail then propogate 1679 * error back to calling subsystem, so that it can be handled 1680 * appropriately. 1681 * If the caller has not specified reserved port usage then 1682 * take the system default. 1683 */ 1684 if (useresvport == -1) 1685 useresvport = clnt_clts_do_bindresvport; 1686 1687 if (useresvport && 1688 (strcmp(config->knc_protofmly, NC_INET) == 0 || 1689 strcmp(config->knc_protofmly, NC_INET6) == 0)) { 1690 1691 while ((error = 1692 bindresvport(new->e_tiptr, NULL, NULL, FALSE)) != 0) { 1693 RPCLOG(1, 1694 "endpnt_get: bindresvport error %d\n", error); 1695 if (error != EPROTO) { 1696 if (rtries-- <= 0) 1697 goto bad; 1698 1699 delay(hz << i++); 1700 continue; 1701 } 1702 1703 (void) t_kclose(new->e_tiptr, 1); 1704 /* 1705 * reopen with all privileges 1706 */ 1707 error = t_kopen(NULL, config->knc_rdev, 1708 FREAD|FWRITE|FNDELAY, 1709 &new->e_tiptr, cr); 1710 if (error) { 1711 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1712 new->e_tiptr = NULL; 1713 goto bad; 1714 } 1715 } 1716 } else if ((error = t_kbind(new->e_tiptr, NULL, NULL)) != 0) { 1717 RPCLOG(1, "endpnt_get: t_kbind failed: %d\n", error); 1718 goto bad; 1719 } 1720 1721 /* 1722 * Set the flags and notify and waiters that we have an established 1723 * endpoint. 1724 */ 1725 mutex_enter(&new->e_lock); 1726 new->e_flags |= ENDPNT_ESTABLISHED; 1727 new->e_flags |= ENDPNT_BOUND; 1728 if (new->e_flags & ENDPNT_WAITING) { 1729 cv_broadcast(&new->e_cv); 1730 new->e_flags &= ~ENDPNT_WAITING; 1731 } 1732 mutex_exit(&new->e_lock); 1733 1734 return (new); 1735 1736 bad: 1737 ASSERT(new != NULL); 1738 /* 1739 * mark this endpoint as stale and notify any threads waiting 1740 * on this endpoint that it will be going away. 
	 */
	mutex_enter(&new->e_lock);
	if (new->e_ref > 0) {
		new->e_flags |= ENDPNT_ESTABLISHED;
		new->e_flags |= ENDPNT_STALE;
		if (new->e_flags & ENDPNT_WAITING) {
			cv_broadcast(&new->e_cv);
			new->e_flags &= ~ENDPNT_WAITING;
		}
	}
	new->e_ref--;
	new->e_tiptr = NULL;
	mutex_exit(&new->e_lock);

	/*
	 * If there was a transport endpoint opened, then close it.
	 */
	if (tiptr != NULL)
		(void) t_kclose(tiptr, 1);

	return (NULL);
}

/*
 * Release a reference to the endpoint
 */
static void
endpnt_rele(struct endpnt *sp)
{
	mutex_enter(&sp->e_lock);
	ASSERT(sp->e_ref > 0);
	sp->e_ref--;
	/*
	 * If the ref count is zero, then start the idle timer and link
	 * the endpoint onto the idle list.
	 */
	if (sp->e_ref == 0) {
		sp->e_itime = gethrestime_sec();

		/*
		 * Check to see if the endpoint is already linked to the idle
		 * list, so that we don't try to reinsert it.
		 */
		if (sp->e_flags & ENDPNT_ONIDLE) {
			mutex_exit(&sp->e_lock);
			mutex_enter(&sp->e_type->e_ilock);
			endpnt_reap_settimer(sp->e_type);
			mutex_exit(&sp->e_type->e_ilock);
			return;
		}

		sp->e_flags |= ENDPNT_ONIDLE;
		mutex_exit(&sp->e_lock);
		mutex_enter(&sp->e_type->e_ilock);
		list_insert_tail(&sp->e_type->e_ilist, sp);
		endpnt_reap_settimer(sp->e_type);
		mutex_exit(&sp->e_type->e_ilock);
	} else
		mutex_exit(&sp->e_lock);
}

static void
endpnt_reap_settimer(endpnt_type_t *etp)
{
	if (etp->e_itimer == (timeout_id_t)0)
		etp->e_itimer = timeout(endpnt_reap_dispatch, (void *)etp,
		    clnt_clts_taskq_dispatch_interval);
}

static void
endpnt_reap_dispatch(void *a)
{
	endpnt_type_t *etp = a;

	/*
	 * The idle timer has fired, so dispatch the taskq to close the
	 * endpoint.
	 */
	if (taskq_dispatch(endpnt_taskq, (task_func_t *)endpnt_reap, etp,
	    TQ_NOSLEEP) == NULL)
		return;
	mutex_enter(&etp->e_ilock);
	etp->e_async_count++;
	mutex_exit(&etp->e_ilock);
}

/*
 * Traverse the idle list and close those endpoints that have reached their
 * timeout interval.
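 *
 * To recap the path that leads here: endpnt_rele() timestamps a newly
 * idle endpoint and calls endpnt_reap_settimer(), the timeout fires
 * endpnt_reap_dispatch(), and that dispatches this routine on
 * endpnt_taskq.  An endpoint's transport is closed once
 *
 *	e_itime + clnt_clts_endpoint_reap_interval < gethrestime_sec()
 *
 * i.e. once it has sat unreferenced on the idle list for longer than the
 * reap interval.  Only the transport (e_tiptr) is closed here; the
 * endpnt_t itself remains in the pool until endpnt_reclaim() runs.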
1830 */ 1831 static void 1832 endpnt_reap(endpnt_type_t *etp) 1833 { 1834 struct endpnt *e; 1835 struct endpnt *next_node = NULL; 1836 1837 mutex_enter(&etp->e_ilock); 1838 e = list_head(&etp->e_ilist); 1839 while (e != NULL) { 1840 next_node = list_next(&etp->e_ilist, e); 1841 1842 mutex_enter(&e->e_lock); 1843 if (e->e_ref > 0) { 1844 mutex_exit(&e->e_lock); 1845 e = next_node; 1846 continue; 1847 } 1848 1849 ASSERT(e->e_ref == 0); 1850 if (e->e_itime > 0 && 1851 (e->e_itime + clnt_clts_endpoint_reap_interval) < 1852 gethrestime_sec()) { 1853 e->e_flags &= ~ENDPNT_BOUND; 1854 (void) t_kclose(e->e_tiptr, 1); 1855 e->e_tiptr = NULL; 1856 e->e_itime = 0; 1857 } 1858 mutex_exit(&e->e_lock); 1859 e = next_node; 1860 } 1861 etp->e_itimer = 0; 1862 if (--etp->e_async_count == 0) 1863 cv_signal(&etp->e_async_cv); 1864 mutex_exit(&etp->e_ilock); 1865 } 1866 1867 static void 1868 endpnt_reclaim(zoneid_t zoneid) 1869 { 1870 struct endpnt_type *np; 1871 struct endpnt *e; 1872 struct endpnt *next_node = NULL; 1873 list_t free_list; 1874 int rcnt = 0; 1875 1876 list_create(&free_list, sizeof (endpnt_t), offsetof(endpnt_t, e_node)); 1877 1878 RPCLOG0(1, "endpnt_reclaim: reclaim callback started\n"); 1879 rw_enter(&endpnt_type_lock, RW_READER); 1880 for (np = endpnt_type_list; np != NULL; np = np->e_next) { 1881 if (zoneid != ALL_ZONES && zoneid != np->e_zoneid) 1882 continue; 1883 1884 mutex_enter(&np->e_plock); 1885 RPCLOG(1, "endpnt_reclaim: protofmly %s, ", 1886 np->e_protofmly); 1887 RPCLOG(1, "rdev %ld\n", np->e_rdev); 1888 RPCLOG(1, "endpnt_reclaim: found %d endpoint(s)\n", 1889 np->e_cnt); 1890 1891 if (np->e_cnt == 0) { 1892 mutex_exit(&np->e_plock); 1893 continue; 1894 } 1895 1896 /* 1897 * The nice thing about maintaining an idle list is that if 1898 * there are any endpoints to reclaim, they are going to be 1899 * on this list. Just go through and reap the one's that 1900 * have ref counts of zero. 1901 */ 1902 mutex_enter(&np->e_ilock); 1903 e = list_head(&np->e_ilist); 1904 while (e != NULL) { 1905 next_node = list_next(&np->e_ilist, e); 1906 mutex_enter(&e->e_lock); 1907 if (e->e_ref > 0) { 1908 mutex_exit(&e->e_lock); 1909 e = next_node; 1910 continue; 1911 } 1912 ASSERT(e->e_ref == 0); 1913 mutex_exit(&e->e_lock); 1914 1915 list_remove(&np->e_ilist, e); 1916 list_remove(&np->e_pool, e); 1917 list_insert_head(&free_list, e); 1918 1919 rcnt++; 1920 np->e_cnt--; 1921 e = next_node; 1922 } 1923 mutex_exit(&np->e_ilock); 1924 /* 1925 * Reset the current pointer to be safe 1926 */ 1927 if ((e = (struct endpnt *)list_head(&np->e_pool)) != NULL) 1928 np->e_pcurr = e; 1929 else { 1930 ASSERT(np->e_cnt == 0); 1931 np->e_pcurr = NULL; 1932 } 1933 1934 mutex_exit(&np->e_plock); 1935 } 1936 rw_exit(&endpnt_type_lock); 1937 1938 while ((e = list_head(&free_list)) != NULL) { 1939 list_remove(&free_list, e); 1940 if (e->e_tiptr != NULL) 1941 (void) t_kclose(e->e_tiptr, 1); 1942 1943 cv_destroy(&e->e_cv); 1944 mutex_destroy(&e->e_lock); 1945 kmem_cache_free(endpnt_cache, e); 1946 } 1947 list_destroy(&free_list); 1948 RPCLOG(1, "endpnt_reclaim: reclaimed %d endpoint(s)\n", rcnt); 1949 } 1950 1951 /* 1952 * Endpoint reclaim zones destructor callback routine. 1953 * 1954 * After reclaiming any cached entries, we basically go through the endpnt_type 1955 * list, canceling outstanding timeouts and free'ing data structures. 
/*
 * Endpoint reclaim zone destructor callback routine.
 *
 * After reclaiming any cached entries, we basically go through the endpnt_type
 * list, canceling outstanding timeouts and freeing data structures.
 */
/* ARGSUSED */
static void
endpnt_destructor(zoneid_t zoneid, void *a)
{
	struct endpnt_type **npp;
	struct endpnt_type *np;
	struct endpnt_type *free_list = NULL;
	timeout_id_t t_id = 0;
	extern void clcleanup_zone(zoneid_t);
	extern void clcleanup4_zone(zoneid_t);

	/* Make sure NFS client handles are released. */
	clcleanup_zone(zoneid);
	clcleanup4_zone(zoneid);

	endpnt_reclaim(zoneid);
	/*
	 * We don't need to be holding on to any locks across the call to
	 * endpnt_reclaim() and the code below; we know that no one can
	 * be holding open connections for this zone (all processes and kernel
	 * threads are gone), so nothing could be adding anything to the list.
	 */
	rw_enter(&endpnt_type_lock, RW_WRITER);
	npp = &endpnt_type_list;
	while ((np = *npp) != NULL) {
		if (np->e_zoneid != zoneid) {
			npp = &np->e_next;
			continue;
		}
		mutex_enter(&np->e_plock);
		mutex_enter(&np->e_ilock);
		if (np->e_itimer != 0) {
			t_id = np->e_itimer;
			np->e_itimer = 0;
		}
		ASSERT(np->e_cnt == 0);
		ASSERT(list_head(&np->e_pool) == NULL);
		ASSERT(list_head(&np->e_ilist) == NULL);

		mutex_exit(&np->e_ilock);
		mutex_exit(&np->e_plock);

		/*
		 * untimeout() any outstanding timers that have not yet fired.
		 */
		if (t_id != (timeout_id_t)0)
			(void) untimeout(t_id);
		*npp = np->e_next;
		np->e_next = free_list;
		free_list = np;
	}
	rw_exit(&endpnt_type_lock);

	while (free_list != NULL) {
		np = free_list;
		free_list = free_list->e_next;
		/*
		 * Wait for threads in endpnt_taskq trying to reap endpnt_t's
		 * in the endpnt_type_t.
		 */
		mutex_enter(&np->e_ilock);
		while (np->e_async_count > 0)
			cv_wait(&np->e_async_cv, &np->e_ilock);
		cv_destroy(&np->e_async_cv);
		mutex_destroy(&np->e_plock);
		mutex_destroy(&np->e_ilock);
		list_destroy(&np->e_pool);
		list_destroy(&np->e_ilist);
		kmem_free(np, sizeof (endpnt_type_t));
	}
}

/*
 * Endpoint reclaim kmem callback routine.
 */
/* ARGSUSED */
static void
endpnt_repossess(void *a)
{
	/*
	 * Reclaim idle endpnt's from all zones.
	 */
	if (endpnt_taskq != NULL)
		(void) taskq_dispatch(endpnt_taskq,
		    (task_func_t *)endpnt_reclaim, (void *)ALL_ZONES,
		    TQ_NOSLEEP);
}
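/*
 * Added layout note for the request path below (commentary only): the
 * message clnt_clts_dispatch_send() puts downstream is an M_PROTO mblk
 * holding a struct T_unitdata_req with the destination address copied in
 * at DEST_offset, and the caller's serialized RPC request linked behind it
 * with linkb():
 *
 *	[ T_unitdata_req | dest address ] --b_cont--> [ RPC header + args ]
 *	          (M_PROTO mblk)                        (caller's mblk)
 *
 * The calllist_t is hashed on its xid and entered into clts_call_ht before
 * the message is sent, so clnt_clts_dispatch_notify() can match the reply
 * no matter how quickly it comes back.
 */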
/*
 * RPC request dispatch routine.  Constructs a datagram message and wraps it
 * around the RPC request to pass downstream.
 */
static int
clnt_clts_dispatch_send(queue_t *q, mblk_t *mp, struct netbuf *addr,
	calllist_t *cp, uint_t xid)
{
	mblk_t *bp;
	int msgsz;
	struct T_unitdata_req *udreq;

	/*
	 * Set up the call record.
	 */
	cp->call_wq = q;
	cp->call_xid = xid;
	cp->call_status = RPC_TIMEDOUT;
	cp->call_notified = FALSE;
	RPCLOG(64,
	    "clnt_clts_dispatch_send: putting xid 0x%x on "
	    "dispatch list\n", xid);
	cp->call_hash = call_hash(xid, clnt_clts_hash_size);
	cp->call_bucket = &clts_call_ht[cp->call_hash];
	call_table_enter(cp);

	/*
	 * Construct the datagram
	 */
	msgsz = (int)TUNITDATAREQSZ;
	while (!(bp = allocb(msgsz + addr->len, BPRI_LO))) {
		if (strwaitbuf(msgsz + addr->len, BPRI_LO))
			return (ENOSR);
	}

	udreq = (struct T_unitdata_req *)bp->b_wptr;
	udreq->PRIM_type = T_UNITDATA_REQ;
	udreq->DEST_length = addr->len;

	if (addr->len) {
		bcopy(addr->buf, bp->b_wptr + msgsz, addr->len);
		udreq->DEST_offset = (t_scalar_t)msgsz;
		msgsz += addr->len;
	} else
		udreq->DEST_offset = 0;
	udreq->OPT_length = 0;
	udreq->OPT_offset = 0;

	bp->b_datap->db_type = M_PROTO;
	bp->b_wptr += msgsz;

	/*
	 * Link the datagram header with the actual data
	 */
	linkb(bp, mp);

	/*
	 * Send downstream.
	 */
	if (canput(cp->call_wq)) {
		put(cp->call_wq, bp);
		return (0);
	}

	return (EIO);
}

/*
 * RPC response delivery routine.  Deliver the response to the waiting
 * thread by matching the xid.
 */
void
clnt_clts_dispatch_notify(mblk_t *mp, int resp_off, zoneid_t zoneid)
{
	calllist_t *e = NULL;
	call_table_t *chtp;
	uint32_t xid;
	uint_t hash;
	unsigned char *hdr_offset;
	mblk_t *resp;

	/*
	 * If the RPC response is not contained in the same mblk as the
	 * datagram header, then move to the next mblk.
	 */
	hdr_offset = mp->b_rptr;
	resp = mp;
	if ((mp->b_wptr - (mp->b_rptr + resp_off)) == 0)
		resp = mp->b_cont;
	else
		resp->b_rptr += resp_off;

	ASSERT(resp != NULL);

	if ((IS_P2ALIGNED(resp->b_rptr, sizeof (uint32_t))) &&
	    (resp->b_wptr - resp->b_rptr) >= sizeof (xid))
		xid = *((uint32_t *)resp->b_rptr);
	else {
		int i = 0;
		unsigned char *p = (unsigned char *)&xid;
		unsigned char *rptr;
		mblk_t *tmp = resp;

		/*
		 * Copy the xid, byte-by-byte into xid.
		 */
		while (tmp) {
			rptr = tmp->b_rptr;
			while (rptr < tmp->b_wptr) {
				*p++ = *rptr++;
				if (++i >= sizeof (xid))
					goto done_xid_copy;
			}
			tmp = tmp->b_cont;
		}

		/*
		 * If we got here, we ran out of mblk space before the
		 * xid could be copied.
		 */
		ASSERT(tmp == NULL && i < sizeof (xid));

		RPCLOG0(1,
		    "clnt_dispatch_notify(clts): message less than "
		    "size of xid\n");

		freemsg(mp);
		return;
	}

done_xid_copy:

	/*
	 * Reset the read pointer back to the beginning of the protocol
	 * header if we moved it.
	 */
	if (mp->b_rptr != hdr_offset)
		mp->b_rptr = hdr_offset;

	hash = call_hash(xid, clnt_clts_hash_size);
	chtp = &clts_call_ht[hash];
	/* call_table_find returns with the hash bucket locked */
	call_table_find(chtp, xid, e);

	if (e != NULL) {
		mutex_enter(&e->call_lock);

		/*
		 * verify that the reply is coming in on
		 * the same zone that it was sent from.
		 */
		if (e->call_zoneid != zoneid) {
			mutex_exit(&e->call_lock);
			mutex_exit(&chtp->ct_lock);
			freemsg(mp);
			return;
		}

		/*
		 * found thread waiting for this reply.
		 */
		if (e->call_reply) {
			RPCLOG(8,
			    "clnt_dispatch_notify (clts): discarding old "
			    "reply for xid 0x%x\n",
			    xid);
			freemsg(e->call_reply);
		}
		e->call_notified = TRUE;
		e->call_reply = mp;
		e->call_status = RPC_SUCCESS;
		cv_signal(&e->call_cv);
		mutex_exit(&e->call_lock);
		mutex_exit(&chtp->ct_lock);
	} else {
		zone_t *zone;
		struct rpcstat *rpcstat;

		mutex_exit(&chtp->ct_lock);
		RPCLOG(8, "clnt_dispatch_notify (clts): no caller for reply "
		    "0x%x\n", xid);
		freemsg(mp);
		/*
		 * This is unfortunate, but we need to look up the zone so we
		 * can increment its "rcbadxids" counter.
		 */
		zone = zone_find_by_id(zoneid);
		if (zone == NULL) {
			/*
			 * The zone went away...
			 */
			return;
		}
		rpcstat = zone_getspecific(rpcstat_zone_key, zone);
		if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
			/*
			 * Not interested
			 */
			zone_rele(zone);
			return;
		}
		RCSTAT_INCR(rpcstat->rpc_clts_client, rcbadxids);
		zone_rele(zone);
	}
}

/*
 * Init routine.  Called when rpcmod is loaded.
 */
void
clnt_clts_init(void)
{
	endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache",
	    sizeof (struct endpnt), 0, NULL, NULL, endpnt_repossess, NULL,
	    NULL, 0);

	rw_init(&endpnt_type_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Perform simple bounds checking to make sure that the setting is
	 * reasonable.
	 */
	if (clnt_clts_max_endpoints <= 0) {
		if (clnt_clts_do_bindresvport)
			clnt_clts_max_endpoints = RESERVED_PORTSPACE;
		else
			clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;
	}

	if (clnt_clts_do_bindresvport &&
	    clnt_clts_max_endpoints > RESERVED_PORTSPACE)
		clnt_clts_max_endpoints = RESERVED_PORTSPACE;
	else if (clnt_clts_max_endpoints > NONRESERVED_PORTSPACE)
		clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;

	if (clnt_clts_hash_size < DEFAULT_MIN_HASH_SIZE)
		clnt_clts_hash_size = DEFAULT_MIN_HASH_SIZE;

	/*
	 * Defer creating the taskq until rpcmod gets pushed.  If we are
	 * in diskless boot mode, rpcmod will get loaded early even before
	 * thread_create() is available.
	 */
	endpnt_taskq = NULL;
	taskq_created = FALSE;
	mutex_init(&endpnt_taskq_lock, NULL, MUTEX_DEFAULT, NULL);

	if (clnt_clts_endpoint_reap_interval < DEFAULT_ENDPOINT_REAP_INTERVAL)
		clnt_clts_endpoint_reap_interval =
		    DEFAULT_ENDPOINT_REAP_INTERVAL;

	/*
	 * Dispatch the taskq at an interval which is offset from the
	 * interval that the endpoints should be reaped.
	 */
	clnt_clts_taskq_dispatch_interval =
	    (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) * hz;

	/*
	 * Initialize the completion queue.
	 */
	clts_call_ht = call_table_init(clnt_clts_hash_size);
	/*
	 * Initialize the zone destructor callback.
	 */
	zone_key_create(&endpnt_destructor_key, NULL, NULL, endpnt_destructor);
}

void
clnt_clts_fini(void)
{
	(void) zone_key_delete(endpnt_destructor_key);
}
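/*
 * Added usage sketch (commentary only, not compiled code): how the dispatch
 * routines above are expected to pair up from the caller's side.  The real
 * caller is clnt_clts_kcallit(); its wait, retransmit and error handling is
 * considerably more involved than this outline, and the names in angle
 * brackets are placeholders.
 *
 *	calllist_t call;
 *
 *	<initialize call, pick an xid, serialize the request into mp>
 *	error = clnt_clts_dispatch_send(<wq>, mp, <server addr>, &call, xid);
 *
 *	mutex_enter(&call.call_lock);
 *	while (!call.call_notified)
 *		<wait on call.call_cv, bounded by the retransmit timeout>;
 *	mutex_exit(&call.call_lock);
 *
 *	<on success, call.call_reply holds the reply mblk matched by
 *	 clnt_clts_dispatch_notify(); the caller then removes the entry
 *	 from clts_call_ht and decodes the results>
 */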