1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T 28 * All Rights Reserved 29 */ 30 31 /* 32 * Portions of this source code were derived from Berkeley 4.3 BSD 33 * under license from the Regents of the University of California. 34 */ 35 36 #pragma ident "%Z%%M% %I% %E% SMI" 37 38 /* 39 * Implements a kernel based, client side RPC. 40 */ 41 42 #include <sys/param.h> 43 #include <sys/types.h> 44 #include <sys/systm.h> 45 #include <sys/sysmacros.h> 46 #include <sys/stream.h> 47 #include <sys/strsubr.h> 48 #include <sys/ddi.h> 49 #include <sys/tiuser.h> 50 #include <sys/tihdr.h> 51 #include <sys/t_kuser.h> 52 #include <sys/errno.h> 53 #include <sys/kmem.h> 54 #include <sys/debug.h> 55 #include <sys/kstat.h> 56 #include <sys/t_lock.h> 57 #include <sys/cmn_err.h> 58 #include <sys/conf.h> 59 #include <sys/disp.h> 60 #include <sys/taskq.h> 61 #include <sys/list.h> 62 #include <sys/atomic.h> 63 #include <sys/zone.h> 64 #include <netinet/in.h> 65 #include <rpc/types.h> 66 #include <rpc/xdr.h> 67 #include <rpc/auth.h> 68 #include <rpc/clnt.h> 69 #include <rpc/rpc_msg.h> 70 71 static enum clnt_stat clnt_clts_kcallit(CLIENT *, rpcproc_t, xdrproc_t, 72 caddr_t, xdrproc_t, caddr_t, struct timeval); 73 static void clnt_clts_kabort(CLIENT *); 74 static void clnt_clts_kerror(CLIENT *, struct rpc_err *); 75 static bool_t clnt_clts_kfreeres(CLIENT *, xdrproc_t, caddr_t); 76 static bool_t clnt_clts_kcontrol(CLIENT *, int, char *); 77 static void clnt_clts_kdestroy(CLIENT *); 78 static int clnt_clts_ksettimers(CLIENT *, struct rpc_timers *, 79 struct rpc_timers *, int, void (*)(), caddr_t, uint32_t); 80 81 /* 82 * Operations vector for CLTS based RPC 83 */ 84 static struct clnt_ops clts_ops = { 85 clnt_clts_kcallit, /* do rpc call */ 86 clnt_clts_kabort, /* abort call */ 87 clnt_clts_kerror, /* return error status */ 88 clnt_clts_kfreeres, /* free results */ 89 clnt_clts_kdestroy, /* destroy rpc handle */ 90 clnt_clts_kcontrol, /* the ioctl() of rpc */ 91 clnt_clts_ksettimers /* set retry timers */ 92 }; 93 94 /* 95 * Endpoint for CLTS (INET, INET6, loopback, etc.) 
96 */ 97 typedef struct endpnt_type { 98 struct endpnt_type *e_next; /* pointer to next endpoint type */ 99 list_t e_pool; /* list of available endpoints */ 100 list_t e_ilist; /* list of idle endpints */ 101 struct endpnt *e_pcurr; /* pointer to current endpoint */ 102 char e_protofmly[KNC_STRSIZE]; /* protocol family */ 103 dev_t e_rdev; /* device */ 104 kmutex_t e_plock; /* pool lock */ 105 kmutex_t e_ilock; /* idle list lock */ 106 timeout_id_t e_itimer; /* timer to dispatch the taskq */ 107 uint_t e_cnt; /* number of endpoints in the pool */ 108 zoneid_t e_zoneid; /* zoneid of endpoint type */ 109 kcondvar_t e_async_cv; /* cv for asynchronous reap threads */ 110 uint_t e_async_count; /* count of asynchronous reap threads */ 111 } endpnt_type_t; 112 113 typedef struct endpnt { 114 list_node_t e_node; /* link to the pool */ 115 list_node_t e_idle; /* link to the idle list */ 116 endpnt_type_t *e_type; /* back pointer to endpoint type */ 117 TIUSER *e_tiptr; /* pointer to transport endpoint */ 118 queue_t *e_wq; /* write queue */ 119 uint_t e_flags; /* endpoint flags */ 120 uint_t e_ref; /* ref count on endpoint */ 121 kcondvar_t e_cv; /* condition variable */ 122 kmutex_t e_lock; /* protects cv and flags */ 123 time_t e_itime; /* time when rele'd */ 124 } endpnt_t; 125 126 #define ENDPNT_ESTABLISHED 0x1 /* endpoint is established */ 127 #define ENDPNT_WAITING 0x2 /* thread waiting for endpoint */ 128 #define ENDPNT_BOUND 0x4 /* endpoint is bound */ 129 #define ENDPNT_STALE 0x8 /* endpoint is dead */ 130 #define ENDPNT_ONIDLE 0x10 /* endpoint is on the idle list */ 131 132 static krwlock_t endpnt_type_lock; /* protects endpnt_type_list */ 133 static endpnt_type_t *endpnt_type_list = NULL; /* list of CLTS endpoints */ 134 static struct kmem_cache *endpnt_cache; /* cache of endpnt_t's */ 135 static taskq_t *endpnt_taskq; /* endpnt_t reaper thread */ 136 static bool_t taskq_created; /* flag for endpnt_taskq */ 137 static kmutex_t endpnt_taskq_lock; /* taskq lock */ 138 static zone_key_t endpnt_destructor_key; 139 140 #define DEFAULT_ENDPOINT_REAP_INTERVAL 60 /* 1 minute */ 141 #define DEFAULT_INTERVAL_SHIFT 30 /* 30 seconds */ 142 143 /* 144 * Endpoint tunables 145 */ 146 static int clnt_clts_max_endpoints = -1; 147 static int clnt_clts_hash_size = DEFAULT_HASH_SIZE; 148 static time_t clnt_clts_endpoint_reap_interval = -1; 149 static clock_t clnt_clts_taskq_dispatch_interval; 150 151 /* 152 * Response completion hash queue 153 */ 154 static call_table_t *clts_call_ht; 155 156 /* 157 * Routines for the endpoint manager 158 */ 159 static struct endpnt_type *endpnt_type_create(struct knetconfig *); 160 static void endpnt_type_free(struct endpnt_type *); 161 static int check_endpnt(struct endpnt *, struct endpnt **); 162 static struct endpnt *endpnt_get(struct knetconfig *, int); 163 static void endpnt_rele(struct endpnt *); 164 static void endpnt_reap_settimer(endpnt_type_t *); 165 static void endpnt_reap(endpnt_type_t *); 166 static void endpnt_reap_dispatch(void *); 167 static void endpnt_reclaim(zoneid_t); 168 169 170 /* 171 * Request dipatching function. 172 */ 173 static int clnt_clts_dispatch_send(queue_t *q, mblk_t *, struct netbuf *addr, 174 calllist_t *, uint_t); 175 176 /* 177 * The size of the preserialized RPC header information. 178 */ 179 #define CKU_HDRSIZE 20 180 /* 181 * The initial allocation size. It is small to reduce space requirements. 182 */ 183 #define CKU_INITSIZE 2048 184 /* 185 * The size of additional allocations, if required. 
It is larger to 186 * reduce the number of actual allocations. 187 */ 188 #define CKU_ALLOCSIZE 8192 189 190 /* 191 * Private data per rpc handle. This structure is allocated by 192 * clnt_clts_kcreate, and freed by clnt_clts_kdestroy. 193 */ 194 struct cku_private { 195 CLIENT cku_client; /* client handle */ 196 int cku_retrys; /* request retrys */ 197 calllist_t cku_call; 198 struct endpnt *cku_endpnt; /* open end point */ 199 struct knetconfig cku_config; 200 struct netbuf cku_addr; /* remote address */ 201 struct rpc_err cku_err; /* error status */ 202 XDR cku_outxdr; /* xdr stream for output */ 203 XDR cku_inxdr; /* xdr stream for input */ 204 char cku_rpchdr[CKU_HDRSIZE + 4]; /* rpc header */ 205 struct cred *cku_cred; /* credentials */ 206 struct rpc_timers *cku_timers; /* for estimating RTT */ 207 struct rpc_timers *cku_timeall; /* for estimating RTT */ 208 void (*cku_feedback)(int, int, caddr_t); 209 /* ptr to feedback rtn */ 210 caddr_t cku_feedarg; /* argument for feedback func */ 211 uint32_t cku_xid; /* current XID */ 212 bool_t cku_bcast; /* RPC broadcast hint */ 213 int cku_useresvport; /* Use reserved port */ 214 struct rpc_clts_client *cku_stats; /* counters for the zone */ 215 }; 216 217 static const struct rpc_clts_client { 218 kstat_named_t rccalls; 219 kstat_named_t rcbadcalls; 220 kstat_named_t rcretrans; 221 kstat_named_t rcbadxids; 222 kstat_named_t rctimeouts; 223 kstat_named_t rcnewcreds; 224 kstat_named_t rcbadverfs; 225 kstat_named_t rctimers; 226 kstat_named_t rcnomem; 227 kstat_named_t rccantsend; 228 } clts_rcstat_tmpl = { 229 { "calls", KSTAT_DATA_UINT64 }, 230 { "badcalls", KSTAT_DATA_UINT64 }, 231 { "retrans", KSTAT_DATA_UINT64 }, 232 { "badxids", KSTAT_DATA_UINT64 }, 233 { "timeouts", KSTAT_DATA_UINT64 }, 234 { "newcreds", KSTAT_DATA_UINT64 }, 235 { "badverfs", KSTAT_DATA_UINT64 }, 236 { "timers", KSTAT_DATA_UINT64 }, 237 { "nomem", KSTAT_DATA_UINT64 }, 238 { "cantsend", KSTAT_DATA_UINT64 }, 239 }; 240 241 static uint_t clts_rcstat_ndata = 242 sizeof (clts_rcstat_tmpl) / sizeof (kstat_named_t); 243 244 #define RCSTAT_INCR(s, x) \ 245 atomic_add_64(&(s)->x.value.ui64, 1) 246 247 #define ptoh(p) (&((p)->cku_client)) 248 #define htop(h) ((struct cku_private *)((h)->cl_private)) 249 250 /* 251 * Times to retry 252 */ 253 #define SNDTRIES 4 254 #define REFRESHES 2 /* authentication refreshes */ 255 256 /* 257 * The following is used to determine the global default behavior for 258 * CLTS when binding to a local port. 259 * 260 * If the value is set to 1 the default will be to select a reserved 261 * (aka privileged) port, if the value is zero the default will be to 262 * use non-reserved ports. Users of kRPC may override this by using 263 * CLNT_CONTROL() and CLSET_BINDRESVPORT. 
264 */ 265 static int clnt_clts_do_bindresvport = 1; 266 267 #define BINDRESVPORT_RETRIES 5 268 269 void 270 clnt_clts_stats_init(zoneid_t zoneid, struct rpc_clts_client **statsp) 271 { 272 kstat_t *ksp; 273 kstat_named_t *knp; 274 275 knp = rpcstat_zone_init_common(zoneid, "unix", "rpc_clts_client", 276 (const kstat_named_t *)&clts_rcstat_tmpl, 277 sizeof (clts_rcstat_tmpl)); 278 /* 279 * Backwards compatibility for old kstat clients 280 */ 281 ksp = kstat_create_zone("unix", 0, "rpc_client", "rpc", 282 KSTAT_TYPE_NAMED, clts_rcstat_ndata, 283 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid); 284 if (ksp) { 285 ksp->ks_data = knp; 286 kstat_install(ksp); 287 } 288 *statsp = (struct rpc_clts_client *)knp; 289 } 290 291 void 292 clnt_clts_stats_fini(zoneid_t zoneid, struct rpc_clts_client **statsp) 293 { 294 rpcstat_zone_fini_common(zoneid, "unix", "rpc_clts_client"); 295 kstat_delete_byname_zone("unix", 0, "rpc_client", zoneid); 296 kmem_free(*statsp, sizeof (clts_rcstat_tmpl)); 297 } 298 299 /* 300 * Create an rpc handle for a clts rpc connection. 301 * Allocates space for the handle structure and the private data. 302 */ 303 /* ARGSUSED */ 304 int 305 clnt_clts_kcreate(struct knetconfig *config, struct netbuf *addr, 306 rpcprog_t pgm, rpcvers_t vers, int retrys, struct cred *cred, 307 CLIENT **cl) 308 { 309 CLIENT *h; 310 struct cku_private *p; 311 struct rpc_msg call_msg; 312 int error; 313 int plen; 314 315 if (cl == NULL) 316 return (EINVAL); 317 318 *cl = NULL; 319 error = 0; 320 321 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 322 323 h = ptoh(p); 324 325 /* handle */ 326 h->cl_ops = &clts_ops; 327 h->cl_private = (caddr_t)p; 328 h->cl_auth = authkern_create(); 329 330 /* call message, just used to pre-serialize below */ 331 call_msg.rm_xid = 0; 332 call_msg.rm_direction = CALL; 333 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 334 call_msg.rm_call.cb_prog = pgm; 335 call_msg.rm_call.cb_vers = vers; 336 337 /* private */ 338 clnt_clts_kinit(h, addr, retrys, cred); 339 340 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE); 341 342 /* pre-serialize call message header */ 343 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 344 error = EINVAL; /* XXX */ 345 goto bad; 346 } 347 348 p->cku_config.knc_rdev = config->knc_rdev; 349 p->cku_config.knc_semantics = config->knc_semantics; 350 plen = strlen(config->knc_protofmly) + 1; 351 p->cku_config.knc_protofmly = kmem_alloc(plen, KM_SLEEP); 352 bcopy(config->knc_protofmly, p->cku_config.knc_protofmly, plen); 353 p->cku_useresvport = -1; /* value is has not been set */ 354 355 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL); 356 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL); 357 358 *cl = h; 359 return (0); 360 361 bad: 362 auth_destroy(h->cl_auth); 363 kmem_free(p->cku_addr.buf, addr->maxlen); 364 kmem_free(p, sizeof (struct cku_private)); 365 366 return (error); 367 } 368 369 void 370 clnt_clts_kinit(CLIENT *h, struct netbuf *addr, int retrys, cred_t *cred) 371 { 372 /* LINTED pointer alignment */ 373 struct cku_private *p = htop(h); 374 struct rpcstat *rsp; 375 376 rsp = zone_getspecific(rpcstat_zone_key, rpc_zone()); 377 ASSERT(rsp != NULL); 378 379 p->cku_retrys = retrys; 380 381 if (p->cku_addr.maxlen < addr->len) { 382 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL) 383 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 384 385 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 386 p->cku_addr.maxlen = addr->maxlen; 387 } 388 389 p->cku_addr.len = addr->len; 390 bcopy(addr->buf, 
p->cku_addr.buf, addr->len); 391 392 p->cku_cred = cred; 393 p->cku_xid = 0; 394 p->cku_timers = NULL; 395 p->cku_timeall = NULL; 396 p->cku_feedback = NULL; 397 p->cku_bcast = FALSE; 398 p->cku_call.call_xid = 0; 399 p->cku_call.call_hash = 0; 400 p->cku_call.call_notified = FALSE; 401 p->cku_call.call_next = NULL; 402 p->cku_call.call_prev = NULL; 403 p->cku_call.call_reply = NULL; 404 p->cku_call.call_wq = NULL; 405 p->cku_stats = rsp->rpc_clts_client; 406 } 407 408 /* 409 * set the timers. Return current retransmission timeout. 410 */ 411 static int 412 clnt_clts_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all, 413 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, 414 uint32_t xid) 415 { 416 /* LINTED pointer alignment */ 417 struct cku_private *p = htop(h); 418 int value; 419 420 p->cku_feedback = feedback; 421 p->cku_feedarg = arg; 422 p->cku_timers = t; 423 p->cku_timeall = all; 424 if (xid) 425 p->cku_xid = xid; 426 value = all->rt_rtxcur; 427 value += t->rt_rtxcur; 428 if (value < minimum) 429 return (minimum); 430 RCSTAT_INCR(p->cku_stats, rctimers); 431 return (value); 432 } 433 434 /* 435 * Time out back off function. tim is in HZ 436 */ 437 #define MAXTIMO (20 * hz) 438 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 439 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 440 441 #define RETRY_POLL_TIMO 30 442 443 /* 444 * Call remote procedure. 445 * Most of the work of rpc is done here. We serialize what is left 446 * of the header (some was pre-serialized in the handle), serialize 447 * the arguments, and send it off. We wait for a reply or a time out. 448 * Timeout causes an immediate return, other packet problems may cause 449 * a retry on the receive. When a good packet is received we deserialize 450 * it, and check verification. A bad reply code will cause one retry 451 * with full (longhand) credentials. 
452 */ 453 enum clnt_stat 454 clnt_clts_kcallit_addr(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 455 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 456 struct timeval wait, struct netbuf *sin) 457 { 458 /* LINTED pointer alignment */ 459 struct cku_private *p = htop(h); 460 XDR *xdrs; 461 int stries = p->cku_retrys; 462 int refreshes = REFRESHES; /* number of times to refresh cred */ 463 int round_trip; /* time the RPC */ 464 int error; 465 int hdrsz; 466 mblk_t *mp; 467 mblk_t *mpdup; 468 mblk_t *resp = NULL; 469 mblk_t *tmp; 470 calllist_t *call = &p->cku_call; 471 clock_t timout = 0; 472 bool_t interrupted; 473 enum clnt_stat status; 474 struct rpc_msg reply_msg; 475 enum clnt_stat re_status; 476 endpnt_t *endpt; 477 478 RCSTAT_INCR(p->cku_stats, rccalls); 479 480 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_sec: %ld\n", wait.tv_sec); 481 RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_usec: %ld\n", wait.tv_usec); 482 483 timout = TIMEVAL_TO_TICK(&wait); 484 485 if (p->cku_xid == 0) { 486 p->cku_xid = alloc_xid(); 487 if (p->cku_endpnt != NULL) 488 endpnt_rele(p->cku_endpnt); 489 p->cku_endpnt = NULL; 490 } 491 call->call_zoneid = rpc_zoneid(); 492 493 mpdup = NULL; 494 call_again: 495 496 if (mpdup == NULL) { 497 498 while ((mp = allocb(CKU_INITSIZE, BPRI_LO)) == NULL) { 499 if (strwaitbuf(CKU_INITSIZE, BPRI_LO)) { 500 p->cku_err.re_status = RPC_SYSTEMERROR; 501 p->cku_err.re_errno = ENOSR; 502 goto done; 503 } 504 } 505 506 xdrs = &p->cku_outxdr; 507 xdrmblk_init(xdrs, mp, XDR_ENCODE, CKU_ALLOCSIZE); 508 509 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 510 /* 511 * Copy in the preserialized RPC header 512 * information. 513 */ 514 bcopy(p->cku_rpchdr, mp->b_rptr, CKU_HDRSIZE); 515 516 /* 517 * transaction id is the 1st thing in the output 518 * buffer. 519 */ 520 /* LINTED pointer alignment */ 521 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid; 522 523 /* Skip the preserialized stuff. */ 524 XDR_SETPOS(xdrs, CKU_HDRSIZE); 525 526 /* Serialize dynamic stuff into the output buffer. */ 527 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 528 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 529 (!(*xdr_args)(xdrs, argsp))) { 530 freemsg(mp); 531 p->cku_err.re_status = RPC_CANTENCODEARGS; 532 p->cku_err.re_errno = EIO; 533 goto done; 534 } 535 } else { 536 uint32_t *uproc = (uint32_t *) 537 &p->cku_rpchdr[CKU_HDRSIZE]; 538 IXDR_PUT_U_INT32(uproc, procnum); 539 540 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 541 XDR_SETPOS(xdrs, 0); 542 543 /* Serialize the procedure number and the arguments. */ 544 if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr, 545 CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) { 546 freemsg(mp); 547 p->cku_err.re_status = RPC_CANTENCODEARGS; 548 p->cku_err.re_errno = EIO; 549 goto done; 550 } 551 } 552 } else 553 mp = mpdup; 554 555 mpdup = dupmsg(mp); 556 if (mpdup == NULL) { 557 freemsg(mp); 558 p->cku_err.re_status = RPC_SYSTEMERROR; 559 p->cku_err.re_errno = ENOSR; 560 goto done; 561 } 562 563 /* 564 * Grab an endpnt only if the endpoint is NULL. We could be retrying 565 * the request and in this case we want to go through the same 566 * source port, so that the duplicate request cache may detect a 567 * retry. 
568 */ 569 570 if (p->cku_endpnt == NULL) 571 p->cku_endpnt = endpnt_get(&p->cku_config, p->cku_useresvport); 572 573 if (p->cku_endpnt == NULL) { 574 freemsg(mp); 575 p->cku_err.re_status = RPC_SYSTEMERROR; 576 p->cku_err.re_errno = ENOSR; 577 goto done; 578 } 579 580 round_trip = lbolt; 581 582 error = clnt_clts_dispatch_send(p->cku_endpnt->e_wq, mp, 583 &p->cku_addr, call, p->cku_xid); 584 585 if (error != 0) { 586 freemsg(mp); 587 p->cku_err.re_status = RPC_CANTSEND; 588 p->cku_err.re_errno = error; 589 RCSTAT_INCR(p->cku_stats, rccantsend); 590 goto done1; 591 } 592 593 RPCLOG(64, "clnt_clts_kcallit_addr: sent call for xid 0x%x\n", 594 p->cku_xid); 595 596 /* 597 * There are two reasons for which we go back to to tryread. 598 * 599 * a) In case the status is RPC_PROCUNAVAIL and we sent out a 600 * broadcast we should not get any invalid messages with the 601 * RPC_PROCUNAVAIL error back. Some broken RPC implementations 602 * send them and for this we have to ignore them ( as we would 603 * have never received them ) and look for another message 604 * which might contain the valid response because we don't know 605 * how many broken implementations are in the network. So we are 606 * going to loop until 607 * - we received a valid response 608 * - we have processed all invalid responses and 609 * got a time out when we try to receive again a 610 * message. 611 * 612 * b) We will jump back to tryread also in case we failed 613 * within the AUTH_VALIDATE. In this case we should move 614 * on and loop until we received a valid response or we 615 * have processed all responses with broken authentication 616 * and we got a time out when we try to receive a message. 617 */ 618 tryread: 619 mutex_enter(&call->call_lock); 620 interrupted = FALSE; 621 if (call->call_notified == FALSE) { 622 klwp_t *lwp = ttolwp(curthread); 623 clock_t cv_wait_ret = 1; /* init to > 0 */ 624 clock_t cv_timout = timout; 625 626 if (lwp != NULL) 627 lwp->lwp_nostop++; 628 629 cv_timout += lbolt; 630 631 if (h->cl_nosignal) 632 while ((cv_wait_ret = 633 cv_timedwait(&call->call_cv, 634 &call->call_lock, cv_timout)) > 0 && 635 call->call_notified == FALSE) 636 ; 637 else 638 while ((cv_wait_ret = 639 cv_timedwait_sig(&call->call_cv, 640 &call->call_lock, cv_timout)) > 0 && 641 call->call_notified == FALSE) 642 ; 643 644 if (cv_wait_ret == 0) 645 interrupted = TRUE; 646 647 if (lwp != NULL) 648 lwp->lwp_nostop--; 649 } 650 resp = call->call_reply; 651 call->call_reply = NULL; 652 status = call->call_status; 653 /* 654 * We have to reset the call_notified here. In case we have 655 * to do a retry ( e.g. in case we got a RPC_PROCUNAVAIL 656 * error ) we need to set this to false to ensure that 657 * we will wait for the next message. When the next message 658 * is going to arrive the function clnt_clts_dispatch_notify 659 * will set this to true again. 660 */ 661 call->call_notified = FALSE; 662 mutex_exit(&call->call_lock); 663 664 if (status == RPC_TIMEDOUT) { 665 if (interrupted) { 666 /* 667 * We got interrupted, bail out 668 */ 669 p->cku_err.re_status = RPC_INTR; 670 p->cku_err.re_errno = EINTR; 671 goto done1; 672 } else { 673 /* 674 * It's possible that our response arrived 675 * right after we timed out. Check to see 676 * if it has arrived before we remove the 677 * calllist from the dispatch queue. 
678 */ 679 mutex_enter(&call->call_lock); 680 if (call->call_notified == TRUE) { 681 resp = call->call_reply; 682 call->call_reply = NULL; 683 mutex_exit(&call->call_lock); 684 RPCLOG(8, "clnt_clts_kcallit_addr: " 685 "response received for request " 686 "w/xid 0x%x after timeout\n", 687 p->cku_xid); 688 goto getresponse; 689 } 690 mutex_exit(&call->call_lock); 691 692 RPCLOG(8, "clnt_clts_kcallit_addr: " 693 "request w/xid 0x%x timedout " 694 "waiting for reply\n", p->cku_xid); 695 #if 0 /* XXX not yet */ 696 /* 697 * Timeout may be due to a dead gateway. Send 698 * an ioctl downstream advising deletion of 699 * route when we reach the half-way point to 700 * timing out. 701 */ 702 if (stries == p->cku_retrys/2) { 703 t_kadvise(p->cku_endpnt->e_tiptr, 704 (uchar_t *)p->cku_addr.buf, 705 p->cku_addr.len); 706 } 707 #endif /* not yet */ 708 p->cku_err.re_status = RPC_TIMEDOUT; 709 p->cku_err.re_errno = ETIMEDOUT; 710 RCSTAT_INCR(p->cku_stats, rctimeouts); 711 goto done1; 712 } 713 } 714 715 getresponse: 716 /* 717 * Check to see if a response arrived. If it one is 718 * present then proceed to process the reponse. Otherwise 719 * fall through to retry or retransmit the request. This 720 * is probably not the optimal thing to do, but since we 721 * are most likely dealing with a unrealiable transport it 722 * is the safe thing to so. 723 */ 724 if (resp == NULL) { 725 p->cku_err.re_status = RPC_CANTRECV; 726 p->cku_err.re_errno = EIO; 727 goto done1; 728 } 729 730 /* 731 * Prepare the message for further processing. We need to remove 732 * the datagram header and copy the source address if necessary. No 733 * need to verify the header since rpcmod took care of that. 734 */ 735 /* 736 * Copy the source address if the caller has supplied a netbuf. 737 */ 738 if (sin != NULL) { 739 union T_primitives *pptr; 740 741 pptr = (union T_primitives *)resp->b_rptr; 742 bcopy(resp->b_rptr + pptr->unitdata_ind.SRC_offset, sin->buf, 743 pptr->unitdata_ind.SRC_length); 744 sin->len = pptr->unitdata_ind.SRC_length; 745 } 746 747 /* 748 * Pop off the datagram header. 749 */ 750 hdrsz = resp->b_wptr - resp->b_rptr; 751 if ((resp->b_wptr - (resp->b_rptr + hdrsz)) == 0) { 752 tmp = resp; 753 resp = resp->b_cont; 754 tmp->b_cont = NULL; 755 freeb(tmp); 756 } else { 757 unsigned char *ud_off = resp->b_rptr; 758 resp->b_rptr += hdrsz; 759 tmp = dupb(resp); 760 if (tmp == NULL) { 761 p->cku_err.re_status = RPC_SYSTEMERROR; 762 p->cku_err.re_errno = ENOSR; 763 freemsg(resp); 764 goto done1; 765 } 766 tmp->b_cont = resp->b_cont; 767 resp->b_rptr = ud_off; 768 freeb(resp); 769 resp = tmp; 770 } 771 772 round_trip = lbolt - round_trip; 773 /* 774 * Van Jacobson timer algorithm here, only if NOT a retransmission. 
775 */ 776 if (p->cku_timers != NULL && stries == p->cku_retrys) { 777 int rt; 778 779 rt = round_trip; 780 rt -= (p->cku_timers->rt_srtt >> 3); 781 p->cku_timers->rt_srtt += rt; 782 if (rt < 0) 783 rt = - rt; 784 rt -= (p->cku_timers->rt_deviate >> 2); 785 p->cku_timers->rt_deviate += rt; 786 p->cku_timers->rt_rtxcur = 787 (clock_t)((p->cku_timers->rt_srtt >> 2) + 788 p->cku_timers->rt_deviate) >> 1; 789 790 rt = round_trip; 791 rt -= (p->cku_timeall->rt_srtt >> 3); 792 p->cku_timeall->rt_srtt += rt; 793 if (rt < 0) 794 rt = - rt; 795 rt -= (p->cku_timeall->rt_deviate >> 2); 796 p->cku_timeall->rt_deviate += rt; 797 p->cku_timeall->rt_rtxcur = 798 (clock_t)((p->cku_timeall->rt_srtt >> 2) + 799 p->cku_timeall->rt_deviate) >> 1; 800 if (p->cku_feedback != NULL) { 801 (*p->cku_feedback)(FEEDBACK_OK, procnum, 802 p->cku_feedarg); 803 } 804 } 805 806 /* 807 * Process reply 808 */ 809 xdrs = &(p->cku_inxdr); 810 xdrmblk_init(xdrs, resp, XDR_DECODE, 0); 811 812 reply_msg.rm_direction = REPLY; 813 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; 814 reply_msg.acpted_rply.ar_stat = SUCCESS; 815 reply_msg.acpted_rply.ar_verf = _null_auth; 816 /* 817 * xdr_results will be done in AUTH_UNWRAP. 818 */ 819 reply_msg.acpted_rply.ar_results.where = NULL; 820 reply_msg.acpted_rply.ar_results.proc = xdr_void; 821 822 /* 823 * Decode and validate the response. 824 */ 825 if (!xdr_replymsg(xdrs, &reply_msg)) { 826 p->cku_err.re_status = RPC_CANTDECODERES; 827 p->cku_err.re_errno = EIO; 828 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 829 goto done1; 830 } 831 832 _seterr_reply(&reply_msg, &(p->cku_err)); 833 834 re_status = p->cku_err.re_status; 835 if (re_status == RPC_SUCCESS) { 836 /* 837 * Reply is good, check auth. 838 */ 839 if (!AUTH_VALIDATE(h->cl_auth, 840 &reply_msg.acpted_rply.ar_verf)) { 841 p->cku_err.re_status = RPC_AUTHERROR; 842 p->cku_err.re_why = AUTH_INVALIDRESP; 843 RCSTAT_INCR(p->cku_stats, rcbadverfs); 844 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 845 goto tryread; 846 } 847 if (!AUTH_UNWRAP(h->cl_auth, xdrs, xdr_results, resultsp)) { 848 p->cku_err.re_status = RPC_CANTDECODERES; 849 p->cku_err.re_errno = EIO; 850 } 851 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 852 goto done1; 853 } 854 /* set errno in case we can't recover */ 855 if (re_status != RPC_VERSMISMATCH && 856 re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH) 857 p->cku_err.re_errno = EIO; 858 /* 859 * Determine whether or not we're doing an RPC 860 * broadcast. Some server implementations don't 861 * follow RFC 1050, section 7.4.2 in that they 862 * don't remain silent when they see a proc 863 * they don't support. Therefore we keep trying 864 * to receive on RPC_PROCUNAVAIL, hoping to get 865 * a valid response from a compliant server. 866 */ 867 if (re_status == RPC_PROCUNAVAIL && p->cku_bcast) { 868 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 869 goto tryread; 870 } 871 if (re_status == RPC_AUTHERROR) { 872 /* 873 * Maybe our credential need to be refreshed 874 */ 875 if (refreshes > 0 && 876 AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) { 877 /* 878 * The credential is refreshed. Try the request again. 879 * Even if stries == 0, we still retry as long as 880 * refreshes > 0. This prevents a soft authentication 881 * error turning into a hard one at an upper level. 
882 */ 883 refreshes--; 884 RCSTAT_INCR(p->cku_stats, rcbadcalls); 885 RCSTAT_INCR(p->cku_stats, rcnewcreds); 886 887 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 888 freemsg(mpdup); 889 call_table_remove(call); 890 mutex_enter(&call->call_lock); 891 if (call->call_reply != NULL) { 892 freemsg(call->call_reply); 893 call->call_reply = NULL; 894 } 895 mutex_exit(&call->call_lock); 896 897 freemsg(resp); 898 mpdup = NULL; 899 goto call_again; 900 } 901 /* 902 * We have used the client handle to do an AUTH_REFRESH 903 * and the RPC status may be set to RPC_SUCCESS; 904 * Let's make sure to set it to RPC_AUTHERROR. 905 */ 906 p->cku_err.re_status = RPC_CANTDECODERES; 907 908 /* 909 * Map recoverable and unrecoverable 910 * authentication errors to appropriate errno 911 */ 912 switch (p->cku_err.re_why) { 913 case AUTH_TOOWEAK: 914 /* 915 * Could be an nfsportmon failure, set 916 * useresvport and try again. 917 */ 918 if (p->cku_useresvport != 1) { 919 p->cku_useresvport = 1; 920 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 921 freemsg(mpdup); 922 923 call_table_remove(call); 924 mutex_enter(&call->call_lock); 925 if (call->call_reply != NULL) { 926 freemsg(call->call_reply); 927 call->call_reply = NULL; 928 } 929 mutex_exit(&call->call_lock); 930 931 freemsg(resp); 932 mpdup = NULL; 933 endpt = p->cku_endpnt; 934 if (endpt->e_tiptr != NULL) { 935 mutex_enter(&endpt->e_lock); 936 endpt->e_flags &= ~ENDPNT_BOUND; 937 (void) t_kclose(endpt->e_tiptr, 1); 938 endpt->e_tiptr = NULL; 939 mutex_exit(&endpt->e_lock); 940 941 } 942 943 p->cku_xid = alloc_xid(); 944 endpnt_rele(p->cku_endpnt); 945 p->cku_endpnt = NULL; 946 goto call_again; 947 } 948 /* FALLTHRU */ 949 case AUTH_BADCRED: 950 case AUTH_BADVERF: 951 case AUTH_INVALIDRESP: 952 case AUTH_FAILED: 953 case RPCSEC_GSS_NOCRED: 954 case RPCSEC_GSS_FAILED: 955 p->cku_err.re_errno = EACCES; 956 break; 957 case AUTH_REJECTEDCRED: 958 case AUTH_REJECTEDVERF: 959 default: 960 p->cku_err.re_errno = EIO; 961 break; 962 } 963 RPCLOG(1, "clnt_clts_kcallit : authentication failed " 964 "with RPC_AUTHERROR of type %d\n", 965 p->cku_err.re_why); 966 } 967 968 (void) xdr_rpc_free_verifier(xdrs, &reply_msg); 969 970 done1: 971 call_table_remove(call); 972 mutex_enter(&call->call_lock); 973 if (call->call_reply != NULL) { 974 freemsg(call->call_reply); 975 call->call_reply = NULL; 976 } 977 mutex_exit(&call->call_lock); 978 RPCLOG(64, "clnt_clts_kcallit_addr: xid 0x%x taken off dispatch list", 979 p->cku_xid); 980 981 done: 982 if (resp != NULL) { 983 freemsg(resp); 984 resp = NULL; 985 } 986 987 if ((p->cku_err.re_status != RPC_SUCCESS) && 988 (p->cku_err.re_status != RPC_INTR) && 989 (p->cku_err.re_status != RPC_UDERROR) && 990 !IS_UNRECOVERABLE_RPC(p->cku_err.re_status)) { 991 if (p->cku_feedback != NULL && stries == p->cku_retrys) { 992 (*p->cku_feedback)(FEEDBACK_REXMIT1, procnum, 993 p->cku_feedarg); 994 } 995 996 timout = backoff(timout); 997 if (p->cku_timeall != (struct rpc_timers *)0) 998 p->cku_timeall->rt_rtxcur = timout; 999 1000 if (p->cku_err.re_status == RPC_SYSTEMERROR || 1001 p->cku_err.re_status == RPC_CANTSEND) { 1002 /* 1003 * Errors due to lack of resources, wait a bit 1004 * and try again. 
1005 */ 1006 (void) delay(hz/10); 1007 /* (void) sleep((caddr_t)&lbolt, PZERO-4); */ 1008 } 1009 if (stries-- > 0) { 1010 RCSTAT_INCR(p->cku_stats, rcretrans); 1011 goto call_again; 1012 } 1013 } 1014 1015 if (mpdup != NULL) 1016 freemsg(mpdup); 1017 1018 if (p->cku_err.re_status != RPC_SUCCESS) { 1019 RCSTAT_INCR(p->cku_stats, rcbadcalls); 1020 } 1021 1022 /* 1023 * Allow the endpoint to be held by the client handle in case this 1024 * RPC was not successful. A retry may occur at a higher level and 1025 * in this case we may want to send the request over the same 1026 * source port. 1027 */ 1028 if (p->cku_err.re_status == RPC_SUCCESS && p->cku_endpnt != NULL) { 1029 endpnt_rele(p->cku_endpnt); 1030 p->cku_endpnt = NULL; 1031 } 1032 1033 return (p->cku_err.re_status); 1034 } 1035 1036 static enum clnt_stat 1037 clnt_clts_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 1038 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, 1039 struct timeval wait) 1040 { 1041 return (clnt_clts_kcallit_addr(h, procnum, xdr_args, argsp, 1042 xdr_results, resultsp, wait, NULL)); 1043 } 1044 1045 /* 1046 * Return error info on this handle. 1047 */ 1048 static void 1049 clnt_clts_kerror(CLIENT *h, struct rpc_err *err) 1050 { 1051 /* LINTED pointer alignment */ 1052 struct cku_private *p = htop(h); 1053 1054 *err = p->cku_err; 1055 } 1056 1057 static bool_t 1058 clnt_clts_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 1059 { 1060 /* LINTED pointer alignment */ 1061 struct cku_private *p = htop(h); 1062 XDR *xdrs; 1063 1064 xdrs = &(p->cku_outxdr); 1065 xdrs->x_op = XDR_FREE; 1066 return ((*xdr_res)(xdrs, res_ptr)); 1067 } 1068 1069 /*ARGSUSED*/ 1070 static void 1071 clnt_clts_kabort(CLIENT *h) 1072 { 1073 } 1074 1075 static bool_t 1076 clnt_clts_kcontrol(CLIENT *h, int cmd, char *arg) 1077 { 1078 /* LINTED pointer alignment */ 1079 struct cku_private *p = htop(h); 1080 1081 switch (cmd) { 1082 case CLSET_XID: 1083 p->cku_xid = *((uint32_t *)arg); 1084 return (TRUE); 1085 1086 case CLGET_XID: 1087 *((uint32_t *)arg) = p->cku_xid; 1088 return (TRUE); 1089 1090 case CLSET_BCAST: 1091 p->cku_bcast = *((uint32_t *)arg); 1092 return (TRUE); 1093 1094 case CLGET_BCAST: 1095 *((uint32_t *)arg) = p->cku_bcast; 1096 return (TRUE); 1097 case CLSET_BINDRESVPORT: 1098 if (arg == NULL) 1099 return (FALSE); 1100 1101 if (*(int *)arg != 1 && *(int *)arg != 0) 1102 return (FALSE); 1103 1104 p->cku_useresvport = *(int *)arg; 1105 1106 return (TRUE); 1107 1108 case CLGET_BINDRESVPORT: 1109 if (arg == NULL) 1110 return (FALSE); 1111 1112 *(int *)arg = p->cku_useresvport; 1113 1114 return (TRUE); 1115 1116 default: 1117 return (FALSE); 1118 } 1119 } 1120 1121 /* 1122 * Destroy rpc handle. 1123 * Frees the space used for output buffer, private data, and handle 1124 * structure, and the file pointer/TLI data on last reference. 
1125 */ 1126 static void 1127 clnt_clts_kdestroy(CLIENT *h) 1128 { 1129 /* LINTED pointer alignment */ 1130 struct cku_private *p = htop(h); 1131 calllist_t *call = &p->cku_call; 1132 1133 int plen; 1134 1135 RPCLOG(8, "clnt_clts_kdestroy h: %p\n", (void *)h); 1136 RPCLOG(8, "clnt_clts_kdestroy h: xid=0x%x\n", p->cku_xid); 1137 1138 if (p->cku_endpnt != NULL) 1139 endpnt_rele(p->cku_endpnt); 1140 1141 cv_destroy(&call->call_cv); 1142 mutex_destroy(&call->call_lock); 1143 1144 plen = strlen(p->cku_config.knc_protofmly) + 1; 1145 kmem_free(p->cku_config.knc_protofmly, plen); 1146 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 1147 kmem_free(p, sizeof (*p)); 1148 } 1149 1150 /* 1151 * The connectionless (CLTS) kRPC endpoint management subsystem. 1152 * 1153 * Because endpoints are potentially shared among threads making RPC calls, 1154 * they are managed in a pool according to type (endpnt_type_t). Each 1155 * endpnt_type_t points to a list of usable endpoints through the e_pool 1156 * field, which is of type list_t. list_t is a doubly-linked list. 1157 * The number of endpoints in the pool is stored in the e_cnt field of 1158 * endpnt_type_t and the endpoints are reference counted using the e_ref field 1159 * in the endpnt_t structure. 1160 * 1161 * As an optimization, endpoints that have no references are also linked 1162 * to an idle list via e_ilist which is also of type list_t. When a thread 1163 * calls endpnt_get() to obtain a transport endpoint, the idle list is first 1164 * consulted and if such an endpoint exists, it is removed from the idle list 1165 * and returned to the caller. 1166 * 1167 * If the idle list is empty, then a check is made to see if more endpoints 1168 * can be created. If so, we proceed and create a new endpoint which is added 1169 * to the pool and returned to the caller. If we have reached the limit and 1170 * cannot make a new endpoint then one is returned to the caller via round- 1171 * robin policy. 1172 * 1173 * When an endpoint is placed on the idle list by a thread calling 1174 * endpnt_rele(), it is timestamped and then a reaper taskq is scheduled to 1175 * be dispatched if one hasn't already been. When the timer fires, the 1176 * taskq traverses the idle list and checks to see which endpoints are 1177 * eligible to be closed. It determines this by checking if the timestamp 1178 * when the endpoint was released has exceeded the the threshold for how long 1179 * it should stay alive. 1180 * 1181 * endpnt_t structures remain persistent until the memory reclaim callback, 1182 * endpnt_reclaim(), is invoked. 1183 * 1184 * Here is an example of how the data structures would be laid out by the 1185 * subsystem: 1186 * 1187 * endpnt_type_t 1188 * 1189 * loopback inet 1190 * _______________ ______________ 1191 * | e_next |----------------------->| e_next |---->> 1192 * | e_pool |<---+ | e_pool |<----+ 1193 * | e_ilist |<---+--+ | e_ilist |<----+--+ 1194 * +->| e_pcurr |----+--+--+ +->| e_pcurr |-----+--+--+ 1195 * | | ... | | | | | | ... | | | | 1196 * | | e_itimer (90) | | | | | | e_itimer (0) | | | | 1197 * | | e_cnt (1) | | | | | | e_cnt (3) | | | | 1198 * | +---------------+ | | | | +--------------+ | | | 1199 * | | | | | | | | 1200 * | endpnt_t | | | | | | | 1201 * | ____________ | | | | ____________ | | | 1202 * | | e_node |<------+ | | | | e_node |<------+ | | 1203 * | | e_idle |<---------+ | | | e_idle | | | | 1204 * +--| e_type |<------------+ +--| e_type | | | | 1205 * | e_tiptr | | | e_tiptr | | | | 1206 * | ... | | | ... 
| | | | 1207 * | e_lock | | | e_lock | | | | 1208 * | ... | | | ... | | | | 1209 * | e_ref (0) | | | e_ref (2) | | | | 1210 * | e_itime | | | e_itime | | | | 1211 * +------------+ | +------------+ | | | 1212 * | | | | 1213 * | | | | 1214 * | ____________ | | | 1215 * | | e_node |<------+ | | 1216 * | | e_idle |<------+--+ | 1217 * +--| e_type | | | 1218 * | | e_tiptr | | | 1219 * | | ... | | | 1220 * | | e_lock | | | 1221 * | | ... | | | 1222 * | | e_ref (0) | | | 1223 * | | e_itime | | | 1224 * | +------------+ | | 1225 * | | | 1226 * | | | 1227 * | ____________ | | 1228 * | | e_node |<------+ | 1229 * | | e_idle | | 1230 * +--| e_type |<------------+ 1231 * | e_tiptr | 1232 * | ... | 1233 * | e_lock | 1234 * | ... | 1235 * | e_ref (1) | 1236 * | e_itime | 1237 * +------------+ 1238 * 1239 * Endpoint locking strategy: 1240 * 1241 * The following functions manipulate lists which hold the endpoint and the 1242 * endpoints themselves: 1243 * 1244 * endpnt_get()/check_endpnt()/endpnt_rele()/endpnt_reap()/do_endpnt_reclaim() 1245 * 1246 * Lock description follows: 1247 * 1248 * endpnt_type_lock: Global reader/writer lock which protects accesses to the 1249 * endpnt_type_list. 1250 * 1251 * e_plock: Lock defined in the endpnt_type_t. It is intended to 1252 * protect accesses to the pool of endopints (e_pool) for a given 1253 * endpnt_type_t. 1254 * 1255 * e_ilock: Lock defined in endpnt_type_t. It is intended to protect accesses 1256 * to the idle list (e_ilist) of available endpoints for a given 1257 * endpnt_type_t. It also protects access to the e_itimer, e_async_cv, 1258 * and e_async_count fields in endpnt_type_t. 1259 * 1260 * e_lock: Lock defined in the endpnt structure. It is intended to protect 1261 * flags, cv, and ref count. 1262 * 1263 * The order goes as follows so as not to induce deadlock. 1264 * 1265 * endpnt_type_lock -> e_plock -> e_ilock -> e_lock 1266 * 1267 * Interaction with Zones and shutting down: 1268 * 1269 * endpnt_type_ts are uniquely identified by the (e_zoneid, e_rdev, e_protofmly) 1270 * tuple, which means that a zone may not reuse another zone's idle endpoints 1271 * without first doing a t_kclose(). 1272 * 1273 * A zone's endpnt_type_ts are destroyed when a zone is shut down; e_async_cv 1274 * and e_async_count are used to keep track of the threads in endpnt_taskq 1275 * trying to reap endpnt_ts in the endpnt_type_t. 1276 */ 1277 1278 /* 1279 * Allocate and initialize an endpnt_type_t 1280 */ 1281 static struct endpnt_type * 1282 endpnt_type_create(struct knetconfig *config) 1283 { 1284 struct endpnt_type *etype; 1285 1286 /* 1287 * Allocate a new endpoint type to hang a list of 1288 * endpoints off of it. 
1289 */ 1290 etype = kmem_alloc(sizeof (struct endpnt_type), KM_SLEEP); 1291 etype->e_next = NULL; 1292 etype->e_pcurr = NULL; 1293 etype->e_itimer = 0; 1294 etype->e_cnt = 0; 1295 1296 (void) strncpy(etype->e_protofmly, config->knc_protofmly, KNC_STRSIZE); 1297 mutex_init(&etype->e_plock, NULL, MUTEX_DEFAULT, NULL); 1298 mutex_init(&etype->e_ilock, NULL, MUTEX_DEFAULT, NULL); 1299 etype->e_rdev = config->knc_rdev; 1300 etype->e_zoneid = rpc_zoneid(); 1301 etype->e_async_count = 0; 1302 cv_init(&etype->e_async_cv, NULL, CV_DEFAULT, NULL); 1303 1304 list_create(&etype->e_pool, sizeof (endpnt_t), 1305 offsetof(endpnt_t, e_node)); 1306 list_create(&etype->e_ilist, sizeof (endpnt_t), 1307 offsetof(endpnt_t, e_idle)); 1308 1309 /* 1310 * Check to see if we need to create a taskq for endpoint 1311 * reaping 1312 */ 1313 mutex_enter(&endpnt_taskq_lock); 1314 if (taskq_created == FALSE) { 1315 taskq_created = TRUE; 1316 mutex_exit(&endpnt_taskq_lock); 1317 ASSERT(endpnt_taskq == NULL); 1318 endpnt_taskq = taskq_create("clts_endpnt_taskq", 1, 1319 minclsyspri, 200, INT_MAX, 0); 1320 } else 1321 mutex_exit(&endpnt_taskq_lock); 1322 1323 return (etype); 1324 } 1325 1326 /* 1327 * Free an endpnt_type_t 1328 */ 1329 static void 1330 endpnt_type_free(struct endpnt_type *etype) 1331 { 1332 mutex_destroy(&etype->e_plock); 1333 mutex_destroy(&etype->e_ilock); 1334 list_destroy(&etype->e_pool); 1335 list_destroy(&etype->e_ilist); 1336 kmem_free(etype, sizeof (endpnt_type_t)); 1337 } 1338 1339 /* 1340 * Check the endpoint to ensure that it is suitable for use. 1341 * 1342 * Possible return values: 1343 * 1344 * return (1) - Endpoint is established, but needs to be re-opened. 1345 * return (0) && *newp == NULL - Endpoint is established, but unusable. 1346 * return (0) && *newp != NULL - Endpoint is established and usable. 1347 */ 1348 static int 1349 check_endpnt(struct endpnt *endp, struct endpnt **newp) 1350 { 1351 *newp = endp; 1352 1353 mutex_enter(&endp->e_lock); 1354 ASSERT(endp->e_ref >= 1); 1355 1356 /* 1357 * The first condition we check for is if the endpoint has been 1358 * allocated, but is unusable either because it has been closed or 1359 * has been marked stale. Only *one* thread will be allowed to 1360 * execute the then clause. This is enforced becuase the first thread 1361 * to check this condition will clear the flags, so that subsequent 1362 * thread(s) checking this endpoint will move on. 1363 */ 1364 if ((endp->e_flags & ENDPNT_ESTABLISHED) && 1365 (!(endp->e_flags & ENDPNT_BOUND) || 1366 (endp->e_flags & ENDPNT_STALE))) { 1367 /* 1368 * Clear the flags here since they will be 1369 * set again by this thread. They need to be 1370 * individually cleared because we want to maintain 1371 * the state for ENDPNT_ONIDLE. 1372 */ 1373 endp->e_flags &= ~(ENDPNT_ESTABLISHED | 1374 ENDPNT_WAITING | ENDPNT_BOUND | ENDPNT_STALE); 1375 mutex_exit(&endp->e_lock); 1376 return (1); 1377 } 1378 1379 /* 1380 * The second condition is meant for any thread that is waiting for 1381 * an endpoint to become established. It will cv_wait() until 1382 * the condition for the endpoint has been changed to ENDPNT_BOUND or 1383 * ENDPNT_STALE. 1384 */ 1385 while (!(endp->e_flags & ENDPNT_BOUND) && 1386 !(endp->e_flags & ENDPNT_STALE)) { 1387 endp->e_flags |= ENDPNT_WAITING; 1388 cv_wait(&endp->e_cv, &endp->e_lock); 1389 } 1390 1391 ASSERT(endp->e_flags & ENDPNT_ESTABLISHED); 1392 1393 /* 1394 * The last case we check for is if the endpoint has been marked stale. 
1395 * If this is the case then set *newp to NULL and return, so that the 1396 * caller is notified of the error and can take appropriate action. 1397 */ 1398 if (endp->e_flags & ENDPNT_STALE) { 1399 endp->e_ref--; 1400 *newp = NULL; 1401 } 1402 mutex_exit(&endp->e_lock); 1403 return (0); 1404 } 1405 1406 #ifdef DEBUG 1407 /* 1408 * Provide a fault injection setting to test error conditions. 1409 */ 1410 static int endpnt_get_return_null = 0; 1411 #endif 1412 1413 /* 1414 * Returns a handle (struct endpnt *) to an open and bound endpoint 1415 * specified by the knetconfig passed in. Returns NULL if no valid endpoint 1416 * can be obtained. 1417 */ 1418 static struct endpnt * 1419 endpnt_get(struct knetconfig *config, int useresvport) 1420 { 1421 struct endpnt_type *n_etype = NULL; 1422 struct endpnt_type *np = NULL; 1423 struct endpnt *new = NULL; 1424 struct endpnt *endp = NULL; 1425 struct endpnt *next = NULL; 1426 TIUSER *tiptr = NULL; 1427 int rtries = BINDRESVPORT_RETRIES; 1428 int i = 0; 1429 int error; 1430 int retval; 1431 zoneid_t zoneid = rpc_zoneid(); 1432 cred_t *cr; 1433 1434 RPCLOG(1, "endpnt_get: protofmly %s, ", config->knc_protofmly); 1435 RPCLOG(1, "rdev %ld\n", config->knc_rdev); 1436 1437 #ifdef DEBUG 1438 /* 1439 * Inject fault if desired. Pretend we have a stale endpoint 1440 * and return NULL. 1441 */ 1442 if (endpnt_get_return_null > 0) { 1443 endpnt_get_return_null--; 1444 return (NULL); 1445 } 1446 #endif 1447 rw_enter(&endpnt_type_lock, RW_READER); 1448 1449 top: 1450 for (np = endpnt_type_list; np != NULL; np = np->e_next) 1451 if ((np->e_zoneid == zoneid) && 1452 (np->e_rdev == config->knc_rdev) && 1453 (strcmp(np->e_protofmly, 1454 config->knc_protofmly) == 0)) 1455 break; 1456 1457 if (np == NULL && n_etype != NULL) { 1458 ASSERT(rw_write_held(&endpnt_type_lock)); 1459 1460 /* 1461 * Link the endpoint type onto the list 1462 */ 1463 n_etype->e_next = endpnt_type_list; 1464 endpnt_type_list = n_etype; 1465 np = n_etype; 1466 n_etype = NULL; 1467 } 1468 1469 if (np == NULL) { 1470 /* 1471 * The logic here is that we were unable to find an 1472 * endpnt_type_t that matched our criteria, so we allocate a 1473 * new one. Because kmem_alloc() needs to be called with 1474 * KM_SLEEP, we drop our locks so that we don't induce 1475 * deadlock. After allocating and initializing the 1476 * endpnt_type_t, we reaquire the lock and go back to check 1477 * if this entry needs to be added to the list. Since we do 1478 * some operations without any locking other threads may 1479 * have been looking for the same endpnt_type_t and gone 1480 * through this code path. We check for this case and allow 1481 * one thread to link its endpnt_type_t to the list and the 1482 * other threads will simply free theirs. 1483 */ 1484 rw_exit(&endpnt_type_lock); 1485 n_etype = endpnt_type_create(config); 1486 1487 /* 1488 * We need to reaquire the lock with RW_WRITER here so that 1489 * we can safely link the new endpoint type onto the list. 1490 */ 1491 rw_enter(&endpnt_type_lock, RW_WRITER); 1492 goto top; 1493 } 1494 1495 rw_exit(&endpnt_type_lock); 1496 /* 1497 * If n_etype is not NULL, then another thread was able to 1498 * insert an endpnt_type_t of this type onto the list before 1499 * we did. Go ahead and free ours. 1500 */ 1501 if (n_etype != NULL) 1502 endpnt_type_free(n_etype); 1503 1504 mutex_enter(&np->e_ilock); 1505 /* 1506 * The algorithm to hand out endpoints is to first 1507 * give out those that are idle if such endpoints 1508 * exist. 
Otherwise, create a new one if we haven't 1509 * reached the max threshold. Finally, we give out 1510 * endpoints in a pseudo LRU fashion (round-robin). 1511 * 1512 * Note: The idle list is merely a hint of those endpoints 1513 * that should be idle. There exists a window after the 1514 * endpoint is released and before it is linked back onto the 1515 * idle list where a thread could get a reference to it and 1516 * use it. This is okay, since the reference counts will 1517 * still be consistent. 1518 */ 1519 if ((endp = (endpnt_t *)list_head(&np->e_ilist)) != NULL) { 1520 timeout_id_t t_id = 0; 1521 1522 mutex_enter(&endp->e_lock); 1523 endp->e_ref++; 1524 endp->e_itime = 0; 1525 endp->e_flags &= ~ENDPNT_ONIDLE; 1526 mutex_exit(&endp->e_lock); 1527 1528 /* 1529 * Pop the endpoint off the idle list and hand it off 1530 */ 1531 list_remove(&np->e_ilist, endp); 1532 1533 if (np->e_itimer != 0) { 1534 t_id = np->e_itimer; 1535 np->e_itimer = 0; 1536 } 1537 mutex_exit(&np->e_ilock); 1538 /* 1539 * Reset the idle timer if it has been set 1540 */ 1541 if (t_id != (timeout_id_t)0) 1542 (void) untimeout(t_id); 1543 1544 if (check_endpnt(endp, &new) == 0) 1545 return (new); 1546 } else if (np->e_cnt >= clnt_clts_max_endpoints) { 1547 /* 1548 * There are no idle endpoints currently, so 1549 * create a new one if we have not reached the maximum or 1550 * hand one out in round-robin. 1551 */ 1552 mutex_exit(&np->e_ilock); 1553 mutex_enter(&np->e_plock); 1554 endp = np->e_pcurr; 1555 mutex_enter(&endp->e_lock); 1556 endp->e_ref++; 1557 mutex_exit(&endp->e_lock); 1558 1559 ASSERT(endp != NULL); 1560 /* 1561 * Advance the pointer to the next eligible endpoint, if 1562 * necessary. 1563 */ 1564 if (np->e_cnt > 1) { 1565 next = (endpnt_t *)list_next(&np->e_pool, np->e_pcurr); 1566 if (next == NULL) 1567 next = (endpnt_t *)list_head(&np->e_pool); 1568 np->e_pcurr = next; 1569 } 1570 1571 mutex_exit(&np->e_plock); 1572 1573 /* 1574 * We need to check to see if this endpoint is bound or 1575 * not. If it is in progress then just wait until 1576 * the set up is complete 1577 */ 1578 if (check_endpnt(endp, &new) == 0) 1579 return (new); 1580 } else { 1581 mutex_exit(&np->e_ilock); 1582 mutex_enter(&np->e_plock); 1583 1584 /* 1585 * Allocate a new endpoint to use. If we can't allocate any 1586 * more memory then use one that is already established if any 1587 * such endpoints exist. 1588 */ 1589 new = kmem_cache_alloc(endpnt_cache, KM_NOSLEEP); 1590 if (new == NULL) { 1591 RPCLOG0(1, "endpnt_get: kmem_cache_alloc failed\n"); 1592 /* 1593 * Try to recover by using an existing endpoint. 1594 */ 1595 if (np->e_cnt <= 0) { 1596 mutex_exit(&np->e_plock); 1597 return (NULL); 1598 } 1599 endp = np->e_pcurr; 1600 if ((next = list_next(&np->e_pool, np->e_pcurr)) != 1601 NULL) 1602 np->e_pcurr = next; 1603 ASSERT(endp != NULL); 1604 mutex_enter(&endp->e_lock); 1605 endp->e_ref++; 1606 mutex_exit(&endp->e_lock); 1607 mutex_exit(&np->e_plock); 1608 1609 if (check_endpnt(endp, &new) == 0) 1610 return (new); 1611 } else { 1612 /* 1613 * Partially init an endpoint structure and put 1614 * it on the list, so that other interested threads 1615 * know that one is being created 1616 */ 1617 bzero(new, sizeof (struct endpnt)); 1618 1619 cv_init(&new->e_cv, NULL, CV_DEFAULT, NULL); 1620 mutex_init(&new->e_lock, NULL, MUTEX_DEFAULT, NULL); 1621 new->e_ref = 1; 1622 new->e_type = np; 1623 1624 /* 1625 * Link the endpoint into the pool. 
1626 */ 1627 list_insert_head(&np->e_pool, new); 1628 np->e_cnt++; 1629 if (np->e_pcurr == NULL) 1630 np->e_pcurr = new; 1631 mutex_exit(&np->e_plock); 1632 } 1633 } 1634 1635 /* 1636 * The transport should be opened with sufficient privs 1637 */ 1638 cr = zone_kcred(); 1639 error = t_kopen(NULL, config->knc_rdev, FREAD|FWRITE|FNDELAY, &tiptr, 1640 cr); 1641 if (error) { 1642 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1643 goto bad; 1644 } 1645 1646 new->e_tiptr = tiptr; 1647 rpc_poptimod(tiptr->fp->f_vnode); 1648 1649 /* 1650 * Allow the kernel to push the module on behalf of the user. 1651 */ 1652 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0, 1653 K_TO_K, cr, &retval); 1654 if (error) { 1655 RPCLOG(1, "endpnt_get: kstr_push on rpcmod failed %d\n", error); 1656 goto bad; 1657 } 1658 1659 error = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K, 1660 cr, &retval); 1661 if (error) { 1662 RPCLOG(1, "endpnt_get: strioctl failed %d\n", error); 1663 goto bad; 1664 } 1665 1666 /* 1667 * Connectionless data flow should bypass the stream head. 1668 */ 1669 new->e_wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next; 1670 1671 error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0, 1672 K_TO_K, cr, &retval); 1673 if (error) { 1674 RPCLOG(1, "endpnt_get: kstr_push on timod failed %d\n", error); 1675 goto bad; 1676 } 1677 1678 /* 1679 * Attempt to bind the endpoint. If we fail then propogate 1680 * error back to calling subsystem, so that it can be handled 1681 * appropriately. 1682 * If the caller has not specified reserved port usage then 1683 * take the system default. 1684 */ 1685 if (useresvport == -1) 1686 useresvport = clnt_clts_do_bindresvport; 1687 1688 if (useresvport && 1689 (strcmp(config->knc_protofmly, NC_INET) == 0 || 1690 strcmp(config->knc_protofmly, NC_INET6) == 0)) { 1691 1692 while ((error = 1693 bindresvport(new->e_tiptr, NULL, NULL, FALSE)) != 0) { 1694 RPCLOG(1, 1695 "endpnt_get: bindresvport error %d\n", error); 1696 if (error != EPROTO) { 1697 if (rtries-- <= 0) 1698 goto bad; 1699 1700 delay(hz << i++); 1701 continue; 1702 } 1703 1704 (void) t_kclose(new->e_tiptr, 1); 1705 /* 1706 * reopen with all privileges 1707 */ 1708 error = t_kopen(NULL, config->knc_rdev, 1709 FREAD|FWRITE|FNDELAY, 1710 &new->e_tiptr, cr); 1711 if (error) { 1712 RPCLOG(1, "endpnt_get: t_kopen: %d\n", error); 1713 new->e_tiptr = NULL; 1714 goto bad; 1715 } 1716 } 1717 } else if ((error = t_kbind(new->e_tiptr, NULL, NULL)) != 0) { 1718 RPCLOG(1, "endpnt_get: t_kbind failed: %d\n", error); 1719 goto bad; 1720 } 1721 1722 /* 1723 * Set the flags and notify and waiters that we have an established 1724 * endpoint. 1725 */ 1726 mutex_enter(&new->e_lock); 1727 new->e_flags |= ENDPNT_ESTABLISHED; 1728 new->e_flags |= ENDPNT_BOUND; 1729 if (new->e_flags & ENDPNT_WAITING) { 1730 cv_broadcast(&new->e_cv); 1731 new->e_flags &= ~ENDPNT_WAITING; 1732 } 1733 mutex_exit(&new->e_lock); 1734 1735 return (new); 1736 1737 bad: 1738 ASSERT(new != NULL); 1739 /* 1740 * mark this endpoint as stale and notify any threads waiting 1741 * on this endpoint that it will be going away. 
1742 */ 1743 mutex_enter(&new->e_lock); 1744 if (new->e_ref > 0) { 1745 new->e_flags |= ENDPNT_ESTABLISHED; 1746 new->e_flags |= ENDPNT_STALE; 1747 if (new->e_flags & ENDPNT_WAITING) { 1748 cv_broadcast(&new->e_cv); 1749 new->e_flags &= ~ENDPNT_WAITING; 1750 } 1751 } 1752 new->e_ref--; 1753 new->e_tiptr = NULL; 1754 mutex_exit(&new->e_lock); 1755 1756 /* 1757 * If there was a transport endopoint opened, then close it. 1758 */ 1759 if (tiptr != NULL) 1760 (void) t_kclose(tiptr, 1); 1761 1762 return (NULL); 1763 } 1764 1765 /* 1766 * Release a referece to the endpoint 1767 */ 1768 static void 1769 endpnt_rele(struct endpnt *sp) 1770 { 1771 mutex_enter(&sp->e_lock); 1772 ASSERT(sp->e_ref > 0); 1773 sp->e_ref--; 1774 /* 1775 * If the ref count is zero, then start the idle timer and link 1776 * the endpoint onto the idle list. 1777 */ 1778 if (sp->e_ref == 0) { 1779 sp->e_itime = gethrestime_sec(); 1780 1781 /* 1782 * Check to see if the endpoint is already linked to the idle 1783 * list, so that we don't try to reinsert it. 1784 */ 1785 if (sp->e_flags & ENDPNT_ONIDLE) { 1786 mutex_exit(&sp->e_lock); 1787 mutex_enter(&sp->e_type->e_ilock); 1788 endpnt_reap_settimer(sp->e_type); 1789 mutex_exit(&sp->e_type->e_ilock); 1790 return; 1791 } 1792 1793 sp->e_flags |= ENDPNT_ONIDLE; 1794 mutex_exit(&sp->e_lock); 1795 mutex_enter(&sp->e_type->e_ilock); 1796 list_insert_tail(&sp->e_type->e_ilist, sp); 1797 endpnt_reap_settimer(sp->e_type); 1798 mutex_exit(&sp->e_type->e_ilock); 1799 } else 1800 mutex_exit(&sp->e_lock); 1801 } 1802 1803 static void 1804 endpnt_reap_settimer(endpnt_type_t *etp) 1805 { 1806 if (etp->e_itimer == (timeout_id_t)0) 1807 etp->e_itimer = timeout(endpnt_reap_dispatch, (void *)etp, 1808 clnt_clts_taskq_dispatch_interval); 1809 } 1810 1811 static void 1812 endpnt_reap_dispatch(void *a) 1813 { 1814 endpnt_type_t *etp = a; 1815 1816 /* 1817 * The idle timer has fired, so dispatch the taskq to close the 1818 * endpoint. 1819 */ 1820 if (taskq_dispatch(endpnt_taskq, (task_func_t *)endpnt_reap, etp, 1821 TQ_NOSLEEP) == NULL) 1822 return; 1823 mutex_enter(&etp->e_ilock); 1824 etp->e_async_count++; 1825 mutex_exit(&etp->e_ilock); 1826 } 1827 1828 /* 1829 * Traverse the idle list and close those endpoints that have reached their 1830 * timeout interval. 
1831 */ 1832 static void 1833 endpnt_reap(endpnt_type_t *etp) 1834 { 1835 struct endpnt *e; 1836 struct endpnt *next_node = NULL; 1837 1838 mutex_enter(&etp->e_ilock); 1839 e = list_head(&etp->e_ilist); 1840 while (e != NULL) { 1841 next_node = list_next(&etp->e_ilist, e); 1842 1843 mutex_enter(&e->e_lock); 1844 if (e->e_ref > 0) { 1845 mutex_exit(&e->e_lock); 1846 e = next_node; 1847 continue; 1848 } 1849 1850 ASSERT(e->e_ref == 0); 1851 if (e->e_itime > 0 && 1852 (e->e_itime + clnt_clts_endpoint_reap_interval) < 1853 gethrestime_sec()) { 1854 e->e_flags &= ~ENDPNT_BOUND; 1855 (void) t_kclose(e->e_tiptr, 1); 1856 e->e_tiptr = NULL; 1857 e->e_itime = 0; 1858 } 1859 mutex_exit(&e->e_lock); 1860 e = next_node; 1861 } 1862 etp->e_itimer = 0; 1863 if (--etp->e_async_count == 0) 1864 cv_signal(&etp->e_async_cv); 1865 mutex_exit(&etp->e_ilock); 1866 } 1867 1868 static void 1869 endpnt_reclaim(zoneid_t zoneid) 1870 { 1871 struct endpnt_type *np; 1872 struct endpnt *e; 1873 struct endpnt *next_node = NULL; 1874 list_t free_list; 1875 int rcnt = 0; 1876 1877 list_create(&free_list, sizeof (endpnt_t), offsetof(endpnt_t, e_node)); 1878 1879 RPCLOG0(1, "endpnt_reclaim: reclaim callback started\n"); 1880 rw_enter(&endpnt_type_lock, RW_READER); 1881 for (np = endpnt_type_list; np != NULL; np = np->e_next) { 1882 if (zoneid != ALL_ZONES && zoneid != np->e_zoneid) 1883 continue; 1884 1885 mutex_enter(&np->e_plock); 1886 RPCLOG(1, "endpnt_reclaim: protofmly %s, ", 1887 np->e_protofmly); 1888 RPCLOG(1, "rdev %ld\n", np->e_rdev); 1889 RPCLOG(1, "endpnt_reclaim: found %d endpoint(s)\n", 1890 np->e_cnt); 1891 1892 if (np->e_cnt == 0) { 1893 mutex_exit(&np->e_plock); 1894 continue; 1895 } 1896 1897 /* 1898 * The nice thing about maintaining an idle list is that if 1899 * there are any endpoints to reclaim, they are going to be 1900 * on this list. Just go through and reap the one's that 1901 * have ref counts of zero. 1902 */ 1903 mutex_enter(&np->e_ilock); 1904 e = list_head(&np->e_ilist); 1905 while (e != NULL) { 1906 next_node = list_next(&np->e_ilist, e); 1907 mutex_enter(&e->e_lock); 1908 if (e->e_ref > 0) { 1909 mutex_exit(&e->e_lock); 1910 e = next_node; 1911 continue; 1912 } 1913 ASSERT(e->e_ref == 0); 1914 mutex_exit(&e->e_lock); 1915 1916 list_remove(&np->e_ilist, e); 1917 list_remove(&np->e_pool, e); 1918 list_insert_head(&free_list, e); 1919 1920 rcnt++; 1921 np->e_cnt--; 1922 e = next_node; 1923 } 1924 mutex_exit(&np->e_ilock); 1925 /* 1926 * Reset the current pointer to be safe 1927 */ 1928 if ((e = (struct endpnt *)list_head(&np->e_pool)) != NULL) 1929 np->e_pcurr = e; 1930 else { 1931 ASSERT(np->e_cnt == 0); 1932 np->e_pcurr = NULL; 1933 } 1934 1935 mutex_exit(&np->e_plock); 1936 } 1937 rw_exit(&endpnt_type_lock); 1938 1939 while ((e = list_head(&free_list)) != NULL) { 1940 list_remove(&free_list, e); 1941 if (e->e_tiptr != NULL) 1942 (void) t_kclose(e->e_tiptr, 1); 1943 1944 cv_destroy(&e->e_cv); 1945 mutex_destroy(&e->e_lock); 1946 kmem_cache_free(endpnt_cache, e); 1947 } 1948 list_destroy(&free_list); 1949 RPCLOG(1, "endpnt_reclaim: reclaimed %d endpoint(s)\n", rcnt); 1950 } 1951 1952 /* 1953 * Endpoint reclaim zones destructor callback routine. 1954 * 1955 * After reclaiming any cached entries, we basically go through the endpnt_type 1956 * list, canceling outstanding timeouts and free'ing data structures. 

/*
 * Endpoint reclaim zones destructor callback routine.
 *
 * After reclaiming any cached entries, we basically go through the endpnt_type
 * list, canceling outstanding timeouts and freeing data structures.
 */
/* ARGSUSED */
static void
endpnt_destructor(zoneid_t zoneid, void *a)
{
	struct endpnt_type **npp;
	struct endpnt_type *np;
	struct endpnt_type *free_list = NULL;
	timeout_id_t t_id = 0;
	extern void clcleanup_zone(zoneid_t);
	extern void clcleanup4_zone(zoneid_t);

	/* Make sure NFS client handles are released. */
	clcleanup_zone(zoneid);
	clcleanup4_zone(zoneid);

	endpnt_reclaim(zoneid);
	/*
	 * We don't need to be holding on to any locks across the call to
	 * endpnt_reclaim() and the code below; we know that no-one can
	 * be holding open connections for this zone (all processes and kernel
	 * threads are gone), so nothing could be adding anything to the list.
	 */
	rw_enter(&endpnt_type_lock, RW_WRITER);
	npp = &endpnt_type_list;
	while ((np = *npp) != NULL) {
		if (np->e_zoneid != zoneid) {
			npp = &np->e_next;
			continue;
		}
		mutex_enter(&np->e_plock);
		mutex_enter(&np->e_ilock);
		if (np->e_itimer != 0) {
			t_id = np->e_itimer;
			np->e_itimer = 0;
		}
		ASSERT(np->e_cnt == 0);
		ASSERT(list_head(&np->e_pool) == NULL);
		ASSERT(list_head(&np->e_ilist) == NULL);

		mutex_exit(&np->e_ilock);
		mutex_exit(&np->e_plock);

		/*
		 * untimeout() any outstanding timers that have not yet fired.
		 */
		if (t_id != (timeout_id_t)0)
			(void) untimeout(t_id);
		*npp = np->e_next;
		np->e_next = free_list;
		free_list = np;
	}
	rw_exit(&endpnt_type_lock);

	while (free_list != NULL) {
		np = free_list;
		free_list = free_list->e_next;
		/*
		 * Wait for threads in endpnt_taskq trying to reap endpnt_ts in
		 * the endpnt_type_t.
		 */
		mutex_enter(&np->e_ilock);
		while (np->e_async_count > 0)
			cv_wait(&np->e_async_cv, &np->e_ilock);
		cv_destroy(&np->e_async_cv);
		mutex_destroy(&np->e_plock);
		mutex_destroy(&np->e_ilock);
		list_destroy(&np->e_pool);
		list_destroy(&np->e_ilist);
		kmem_free(np, sizeof (endpnt_type_t));
	}
}

/*
 * Endpoint reclaim kmem callback routine.
 */
/* ARGSUSED */
static void
endpnt_repossess(void *a)
{
	/*
	 * Reclaim idle endpnt's from all zones.
	 */
	if (endpnt_taskq != NULL)
		(void) taskq_dispatch(endpnt_taskq,
		    (task_func_t *)endpnt_reclaim, (void *)ALL_ZONES,
		    TQ_NOSLEEP);
}
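
/*
 * The unitdata message built by clnt_clts_dispatch_send() below has the
 * following shape (the RPC call in "mp" has already been serialized by
 * the caller):
 *
 *	bp: M_PROTO mblk                                 mp: serialized call
 *	+------------------------+---------------------+   +----------------+
 *	| struct T_unitdata_req  | destination address |-->| RPC header and |
 *	| (TUNITDATAREQSZ bytes) | (addr->len bytes)    |   | arguments      |
 *	+------------------------+---------------------+   +----------------+
 *	  DEST_offset = TUNITDATAREQSZ, OPT_length = 0      linked via linkb()
 */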

/*
 * RPC request dispatch routine.  Constructs a datagram message and wraps it
 * around the RPC request to pass downstream.
 */
static int
clnt_clts_dispatch_send(queue_t *q, mblk_t *mp, struct netbuf *addr,
	calllist_t *cp, uint_t xid)
{
	mblk_t *bp;
	int msgsz;
	struct T_unitdata_req *udreq;

	/*
	 * Set up the call record.
	 */
	cp->call_wq = q;
	cp->call_xid = xid;
	cp->call_status = RPC_TIMEDOUT;
	cp->call_notified = FALSE;
	RPCLOG(64,
	    "clnt_clts_dispatch_send: putting xid 0x%x on "
	    "dispatch list\n", xid);
	cp->call_hash = call_hash(xid, clnt_clts_hash_size);
	cp->call_bucket = &clts_call_ht[cp->call_hash];
	call_table_enter(cp);

	/*
	 * Construct the datagram
	 */
	msgsz = (int)TUNITDATAREQSZ;
	while (!(bp = allocb(msgsz + addr->len, BPRI_LO))) {
		if (strwaitbuf(msgsz + addr->len, BPRI_LO))
			return (ENOSR);
	}

	udreq = (struct T_unitdata_req *)bp->b_wptr;
	udreq->PRIM_type = T_UNITDATA_REQ;
	udreq->DEST_length = addr->len;

	if (addr->len) {
		bcopy(addr->buf, bp->b_wptr + msgsz, addr->len);
		udreq->DEST_offset = (t_scalar_t)msgsz;
		msgsz += addr->len;
	} else
		udreq->DEST_offset = 0;
	udreq->OPT_length = 0;
	udreq->OPT_offset = 0;

	bp->b_datap->db_type = M_PROTO;
	bp->b_wptr += msgsz;

	/*
	 * Link the datagram header with the actual data
	 */
	linkb(bp, mp);

	/*
	 * Send downstream.
	 */
	put(cp->call_wq, bp);

	return (0);
}
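
/*
 * Replies are matched to waiting callers through clts_call_ht.  The send
 * side (clnt_clts_dispatch_send() above) hashes the xid with call_hash()
 * and enters the calllist_t into the corresponding bucket.  The receive
 * side below extracts the xid from the reply, coping with an unaligned
 * read pointer or an xid split across mblks, looks the bucket up with
 * call_table_find() (which returns with ct_lock held), and if a caller is
 * found attaches the reply and signals call_cv.  Replies that match no
 * caller are freed and counted in the zone's "rcbadxids" statistic.
 */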

/*
 * RPC response delivery routine.  Deliver the response to the waiting
 * thread by matching the xid.
 */
void
clnt_clts_dispatch_notify(mblk_t *mp, int resp_off, zoneid_t zoneid)
{
	calllist_t *e = NULL;
	call_table_t *chtp;
	uint32_t xid;
	uint_t hash;
	unsigned char *hdr_offset;
	mblk_t *resp;

	/*
	 * If the RPC response is not contained in the same mblk as the
	 * datagram header, then move to the next mblk.
	 */
	hdr_offset = mp->b_rptr;
	resp = mp;
	if ((mp->b_wptr - (mp->b_rptr + resp_off)) == 0)
		resp = mp->b_cont;
	else
		resp->b_rptr += resp_off;

	ASSERT(resp != NULL);

	if ((IS_P2ALIGNED(resp->b_rptr, sizeof (uint32_t))) &&
	    (resp->b_wptr - resp->b_rptr) >= sizeof (xid))
		xid = *((uint32_t *)resp->b_rptr);
	else {
		int i = 0;
		unsigned char *p = (unsigned char *)&xid;
		unsigned char *rptr;
		mblk_t *tmp = resp;

		/*
		 * Copy the xid, byte-by-byte into xid.
		 */
		while (tmp) {
			rptr = tmp->b_rptr;
			while (rptr < tmp->b_wptr) {
				*p++ = *rptr++;
				if (++i >= sizeof (xid))
					goto done_xid_copy;
			}
			tmp = tmp->b_cont;
		}

		/*
		 * If we got here, we ran out of mblk space before the
		 * xid could be copied.
		 */
		ASSERT(tmp == NULL && i < sizeof (xid));

		RPCLOG0(1,
		    "clnt_dispatch_notify(clts): message less than "
		    "size of xid\n");

		freemsg(mp);
		return;
	}

done_xid_copy:

	/*
	 * Reset the read pointer back to the beginning of the protocol
	 * header if we moved it.
	 */
	if (mp->b_rptr != hdr_offset)
		mp->b_rptr = hdr_offset;

	hash = call_hash(xid, clnt_clts_hash_size);
	chtp = &clts_call_ht[hash];
	/* call_table_find returns with the hash bucket locked */
	call_table_find(chtp, xid, e);

	if (e != NULL) {
		mutex_enter(&e->call_lock);

		/*
		 * Verify that the reply is coming in on
		 * the same zone that it was sent from.
		 */
		if (e->call_zoneid != zoneid) {
			mutex_exit(&e->call_lock);
			mutex_exit(&chtp->ct_lock);
			freemsg(mp);
			return;
		}

		/*
		 * Found the thread waiting for this reply.
		 */
		if (e->call_reply) {
			RPCLOG(8,
			    "clnt_dispatch_notify (clts): discarding old "
			    "reply for xid 0x%x\n",
			    xid);
			freemsg(e->call_reply);
		}
		e->call_notified = TRUE;
		e->call_reply = mp;
		e->call_status = RPC_SUCCESS;
		cv_signal(&e->call_cv);
		mutex_exit(&e->call_lock);
		mutex_exit(&chtp->ct_lock);
	} else {
		zone_t *zone;
		struct rpcstat *rpcstat;

		mutex_exit(&chtp->ct_lock);
		RPCLOG(8, "clnt_dispatch_notify (clts): no caller for reply "
		    "0x%x\n", xid);
		freemsg(mp);
		/*
		 * This is unfortunate, but we need to look up the zone so we
		 * can increment its "rcbadxids" counter.
		 */
		zone = zone_find_by_id(zoneid);
		if (zone == NULL) {
			/*
			 * The zone went away...
			 */
			return;
		}
		rpcstat = zone_getspecific(rpcstat_zone_key, zone);
		if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
			/*
			 * Not interested
			 */
			zone_rele(zone);
			return;
		}
		RCSTAT_INCR(rpcstat->rpc_clts_client, rcbadxids);
		zone_rele(zone);
	}
}

/*
 * Init routine.  Called when rpcmod is loaded.
 */
void
clnt_clts_init(void)
{
	endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache",
	    sizeof (struct endpnt), 0, NULL, NULL, endpnt_repossess, NULL,
	    NULL, 0);

	rw_init(&endpnt_type_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Perform simple bounds checking to make sure that the setting is
	 * reasonable.
	 */
	if (clnt_clts_max_endpoints <= 0) {
		if (clnt_clts_do_bindresvport)
			clnt_clts_max_endpoints = RESERVED_PORTSPACE;
		else
			clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;
	}

	if (clnt_clts_do_bindresvport &&
	    clnt_clts_max_endpoints > RESERVED_PORTSPACE)
		clnt_clts_max_endpoints = RESERVED_PORTSPACE;
	else if (clnt_clts_max_endpoints > NONRESERVED_PORTSPACE)
		clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;

	if (clnt_clts_hash_size < DEFAULT_MIN_HASH_SIZE)
		clnt_clts_hash_size = DEFAULT_MIN_HASH_SIZE;

	/*
	 * Defer creating the taskq until rpcmod gets pushed.  If we are
	 * in diskless boot mode, rpcmod will get loaded early even before
	 * thread_create() is available.
	 */
	endpnt_taskq = NULL;
	taskq_created = FALSE;
	mutex_init(&endpnt_taskq_lock, NULL, MUTEX_DEFAULT, NULL);

	if (clnt_clts_endpoint_reap_interval < DEFAULT_ENDPOINT_REAP_INTERVAL)
		clnt_clts_endpoint_reap_interval =
		    DEFAULT_ENDPOINT_REAP_INTERVAL;

	/*
	 * Dispatch the taskq at an interval which is offset from the
	 * interval that the endpoints should be reaped.
	 */
	clnt_clts_taskq_dispatch_interval =
	    (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) * hz;

	/*
	 * Initialize the completion queue.
	 */
	clts_call_ht = call_table_init(clnt_clts_hash_size);
	/*
	 * Initialize the zone destructor callback.
	 */
	zone_key_create(&endpnt_destructor_key, NULL, NULL, endpnt_destructor);
}

void
clnt_clts_fini(void)
{
	(void) zone_key_delete(endpnt_destructor_key);
}
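
/*
 * A note on the timing set up in clnt_clts_init(): with the default
 * tunables, clnt_clts_endpoint_reap_interval is 60 seconds and
 * DEFAULT_INTERVAL_SHIFT is 30 seconds, so the reap timer armed by
 * endpnt_reap_settimer() fires (60 + 30) * hz ticks (90 seconds) later.
 * The extra shift means that the endpoint which armed the timer has been
 * idle longer than the reap interval by the time endpnt_reap() compares
 * e_itime against clnt_clts_endpoint_reap_interval.  clnt_clts_fini()
 * only deletes the zone key; per-zone state is torn down by
 * endpnt_destructor() as each zone shuts down.
 */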