/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
 * All Rights Reserved
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */


/*
 * Implements a kernel-based, client-side RPC.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/ddi.h>
#include <sys/tiuser.h>
#include <sys/tihdr.h>
#include <sys/t_kuser.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/list.h>
#include <sys/atomic.h>
#include <sys/zone.h>
#include <netinet/in.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>

#include <sys/sdt.h>

static enum clnt_stat clnt_clts_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
    caddr_t, xdrproc_t, caddr_t, struct timeval);
static void clnt_clts_kabort(CLIENT *);
static void clnt_clts_kerror(CLIENT *, struct rpc_err *);
static bool_t clnt_clts_kfreeres(CLIENT *, xdrproc_t, caddr_t);
static bool_t clnt_clts_kcontrol(CLIENT *, int, char *);
static void clnt_clts_kdestroy(CLIENT *);
static int clnt_clts_ksettimers(CLIENT *, struct rpc_timers *,
    struct rpc_timers *, int, void (*)(), caddr_t, uint32_t);

/*
 * Operations vector for CLTS based RPC
 */
static struct clnt_ops clts_ops = {
	clnt_clts_kcallit,	/* do rpc call */
	clnt_clts_kabort,	/* abort call */
	clnt_clts_kerror,	/* return error status */
	clnt_clts_kfreeres,	/* free results */
	clnt_clts_kdestroy,	/* destroy rpc handle */
	clnt_clts_kcontrol,	/* the ioctl() of rpc */
	clnt_clts_ksettimers	/* set retry timers */
};
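/*
 * Callers do not invoke these operations directly; they reach them through
 * the generic CLNT_* macros in <rpc/clnt.h>, which indirect through cl_ops.
 * A hedged usage sketch (the procedure number and XDR routines below are
 * placeholders for whatever the consuming program actually uses, and error
 * handling is elided):
 *
 *	enum clnt_stat stat;
 *	struct timeval wait = { 15, 0 };
 *
 *	stat = CLNT_CALL(client, procnum, xdr_args, (caddr_t)&args,
 *	    xdr_results, (caddr_t)&results, wait);
 *	if (stat != RPC_SUCCESS)
 *		CLNT_GETERR(client, &rpcerr);
 */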
/*
 * Endpoint for CLTS (INET, INET6, loopback, etc.)
 */
typedef struct endpnt_type {
	struct endpnt_type	*e_next;	/* pointer to next endpoint type */
	list_t		e_pool;		/* list of available endpoints */
	list_t		e_ilist;	/* list of idle endpoints */
	struct endpnt	*e_pcurr;	/* pointer to current endpoint */
	char		e_protofmly[KNC_STRSIZE];	/* protocol family */
	dev_t		e_rdev;		/* device */
	kmutex_t	e_plock;	/* pool lock */
	kmutex_t	e_ilock;	/* idle list lock */
	timeout_id_t	e_itimer;	/* timer to dispatch the taskq */
	uint_t		e_cnt;		/* number of endpoints in the pool */
	zoneid_t	e_zoneid;	/* zoneid of endpoint type */
	kcondvar_t	e_async_cv;	/* cv for asynchronous reap threads */
	uint_t		e_async_count;	/* count of asynchronous reap threads */
} endpnt_type_t;

typedef struct endpnt {
	list_node_t	e_node;		/* link to the pool */
	list_node_t	e_idle;		/* link to the idle list */
	endpnt_type_t	*e_type;	/* back pointer to endpoint type */
	TIUSER		*e_tiptr;	/* pointer to transport endpoint */
	queue_t		*e_wq;		/* write queue */
	uint_t		e_flags;	/* endpoint flags */
	uint_t		e_ref;		/* ref count on endpoint */
	kcondvar_t	e_cv;		/* condition variable */
	kmutex_t	e_lock;		/* protects cv and flags */
	time_t		e_itime;	/* time when rele'd */
} endpnt_t;

#define	ENDPNT_ESTABLISHED	0x1	/* endpoint is established */
#define	ENDPNT_WAITING		0x2	/* thread waiting for endpoint */
#define	ENDPNT_BOUND		0x4	/* endpoint is bound */
#define	ENDPNT_STALE		0x8	/* endpoint is dead */
#define	ENDPNT_ONIDLE		0x10	/* endpoint is on the idle list */

static krwlock_t	endpnt_type_lock;	/* protects endpnt_type_list */
static endpnt_type_t	*endpnt_type_list = NULL; /* list of CLTS endpoints */
static struct kmem_cache	*endpnt_cache;	/* cache of endpnt_t's */
static taskq_t		*endpnt_taskq;		/* endpnt_t reaper thread */
static bool_t		taskq_created;		/* flag for endpnt_taskq */
static kmutex_t		endpnt_taskq_lock;	/* taskq lock */
static zone_key_t	endpnt_destructor_key;

#define	DEFAULT_ENDPOINT_REAP_INTERVAL	60	/* 1 minute */
#define	DEFAULT_INTERVAL_SHIFT		30	/* 30 seconds */

/*
 * Endpoint tunables
 */
static int	clnt_clts_max_endpoints = -1;
static int	clnt_clts_hash_size = DEFAULT_HASH_SIZE;
static time_t	clnt_clts_endpoint_reap_interval = -1;
static clock_t	clnt_clts_taskq_dispatch_interval;

/*
 * Response completion hash queue
 */
static call_table_t *clts_call_ht;

/*
 * Routines for the endpoint manager
 */
static struct endpnt_type *endpnt_type_create(struct knetconfig *);
static void endpnt_type_free(struct endpnt_type *);
static int check_endpnt(struct endpnt *, struct endpnt **);
static struct endpnt *endpnt_get(struct knetconfig *, int);
static void endpnt_rele(struct endpnt *);
static void endpnt_reap_settimer(endpnt_type_t *);
static void endpnt_reap(endpnt_type_t *);
static void endpnt_reap_dispatch(void *);
static void endpnt_reclaim(zoneid_t);


/*
 * Request dispatching function.
 */
static int clnt_clts_dispatch_send(queue_t *q, mblk_t *, struct netbuf *addr,
    calllist_t *, uint_t, cred_t *);
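/*
 * Rough lifecycle of the e_flags bits, summarized from the code in
 * endpnt_get(), check_endpnt(), endpnt_rele() and endpnt_reap() below
 * (a reading aid, not a contract):
 *
 *	new endpoint opened/bound	-> ENDPNT_ESTABLISHED | ENDPNT_BOUND
 *	open or bind fails		-> ENDPNT_ESTABLISHED | ENDPNT_STALE
 *	another thread blocks on setup	-> ENDPNT_WAITING until bound/stale
 *	last reference released		-> ENDPNT_ONIDLE, e_itime stamped
 *	idle long enough, reaped	-> ENDPNT_BOUND cleared, transport closed
 */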
/*
 * The size of the preserialized RPC header information.
 */
#define	CKU_HDRSIZE	20
/*
 * The initial allocation size.  It is small to reduce space requirements.
 */
#define	CKU_INITSIZE	2048
/*
 * The size of additional allocations, if required.  It is larger to
 * reduce the number of actual allocations.
 */
#define	CKU_ALLOCSIZE	8192

/*
 * Private data per rpc handle.  This structure is allocated by
 * clnt_clts_kcreate, and freed by clnt_clts_kdestroy.
 */
struct cku_private {
	CLIENT			cku_client;	/* client handle */
	int			cku_retrys;	/* request retrys */
	calllist_t		cku_call;
	struct endpnt		*cku_endpnt;	/* open end point */
	struct knetconfig	cku_config;
	struct netbuf		cku_addr;	/* remote address */
	struct rpc_err		cku_err;	/* error status */
	XDR			cku_outxdr;	/* xdr stream for output */
	XDR			cku_inxdr;	/* xdr stream for input */
	char			cku_rpchdr[CKU_HDRSIZE + 4]; /* rpc header */
	struct cred		*cku_cred;	/* credentials */
	struct rpc_timers	*cku_timers;	/* for estimating RTT */
	struct rpc_timers	*cku_timeall;	/* for estimating RTT */
	void			(*cku_feedback)(int, int, caddr_t);
						/* ptr to feedback rtn */
	caddr_t			cku_feedarg;	/* argument for feedback func */
	uint32_t		cku_xid;	/* current XID */
	bool_t			cku_bcast;	/* RPC broadcast hint */
	int			cku_useresvport; /* Use reserved port */
	struct rpc_clts_client	*cku_stats;	/* counters for the zone */
};

static const struct rpc_clts_client {
	kstat_named_t	rccalls;
	kstat_named_t	rcbadcalls;
	kstat_named_t	rcretrans;
	kstat_named_t	rcbadxids;
	kstat_named_t	rctimeouts;
	kstat_named_t	rcnewcreds;
	kstat_named_t	rcbadverfs;
	kstat_named_t	rctimers;
	kstat_named_t	rcnomem;
	kstat_named_t	rccantsend;
} clts_rcstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "retrans",	KSTAT_DATA_UINT64 },
	{ "badxids",	KSTAT_DATA_UINT64 },
	{ "timeouts",	KSTAT_DATA_UINT64 },
	{ "newcreds",	KSTAT_DATA_UINT64 },
	{ "badverfs",	KSTAT_DATA_UINT64 },
	{ "timers",	KSTAT_DATA_UINT64 },
	{ "nomem",	KSTAT_DATA_UINT64 },
	{ "cantsend",	KSTAT_DATA_UINT64 },
};

static uint_t clts_rcstat_ndata =
	sizeof (clts_rcstat_tmpl) / sizeof (kstat_named_t);

#define	RCSTAT_INCR(s, x)	\
	atomic_inc_64(&(s)->x.value.ui64)

#define	ptoh(p)		(&((p)->cku_client))
#define	htop(h)		((struct cku_private *)((h)->cl_private))

/*
 * Times to retry
 */
#define	SNDTRIES	4
#define	REFRESHES	2	/* authentication refreshes */
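/*
 * The rpc_clts_client counters defined above are exported per zone through
 * the kstat framework by clnt_clts_stats_init() below (plus a legacy
 * "rpc_client" alias kept for old consumers).  From userland, something
 * along these lines should display them, assuming the standard kstat(1M)
 * utility; the exact output layout is illustrative only:
 *
 *	$ kstat -m unix -n rpc_clts_client
 *	name:   rpc_clts_client
 *		calls		12345
 *		retrans		17
 *		timeouts	4
 *		...
 */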
/*
 * The following is used to determine the global default behavior for
 * CLTS when binding to a local port.
 *
 * If the value is set to 1 the default will be to select a reserved
 * (aka privileged) port, if the value is zero the default will be to
 * use non-reserved ports.  Users of kRPC may override this by using
 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
 */
static int clnt_clts_do_bindresvport = 1;

#define	BINDRESVPORT_RETRIES 5

void
clnt_clts_stats_init(zoneid_t zoneid, struct rpc_clts_client **statsp)
{
	kstat_t *ksp;
	kstat_named_t *knp;

	knp = rpcstat_zone_init_common(zoneid, "unix", "rpc_clts_client",
	    (const kstat_named_t *)&clts_rcstat_tmpl,
	    sizeof (clts_rcstat_tmpl));
	/*
	 * Backwards compatibility for old kstat clients
	 */
	ksp = kstat_create_zone("unix", 0, "rpc_client", "rpc",
	    KSTAT_TYPE_NAMED, clts_rcstat_ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid);
	if (ksp) {
		ksp->ks_data = knp;
		kstat_install(ksp);
	}
	*statsp = (struct rpc_clts_client *)knp;
}

void
clnt_clts_stats_fini(zoneid_t zoneid, struct rpc_clts_client **statsp)
{
	rpcstat_zone_fini_common(zoneid, "unix", "rpc_clts_client");
	kstat_delete_byname_zone("unix", 0, "rpc_client", zoneid);
	kmem_free(*statsp, sizeof (clts_rcstat_tmpl));
}

/*
 * Create an rpc handle for a clts rpc connection.
 * Allocates space for the handle structure and the private data.
 */
/* ARGSUSED */
int
clnt_clts_kcreate(struct knetconfig *config, struct netbuf *addr,
    rpcprog_t pgm, rpcvers_t vers, int retrys, struct cred *cred,
    CLIENT **cl)
{
	CLIENT *h;
	struct cku_private *p;
	struct rpc_msg call_msg;
	int error;
	int plen;

	if (cl == NULL)
		return (EINVAL);

	*cl = NULL;
	error = 0;

	p = kmem_zalloc(sizeof (*p), KM_SLEEP);

	h = ptoh(p);

	/* handle */
	h->cl_ops = &clts_ops;
	h->cl_private = (caddr_t)p;
	h->cl_auth = authkern_create();

	/* call message, just used to pre-serialize below */
	call_msg.rm_xid = 0;
	call_msg.rm_direction = CALL;
	call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
	call_msg.rm_call.cb_prog = pgm;
	call_msg.rm_call.cb_vers = vers;

	/* private */
	clnt_clts_kinit(h, addr, retrys, cred);

	xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE);

	/* pre-serialize call message header */
	if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
		XDR_DESTROY(&p->cku_outxdr);
		error = EINVAL;		/* XXX */
		goto bad;
	}
	XDR_DESTROY(&p->cku_outxdr);

	p->cku_config.knc_rdev = config->knc_rdev;
	p->cku_config.knc_semantics = config->knc_semantics;
	plen = strlen(config->knc_protofmly) + 1;
	p->cku_config.knc_protofmly = kmem_alloc(plen, KM_SLEEP);
	bcopy(config->knc_protofmly, p->cku_config.knc_protofmly, plen);
	p->cku_useresvport = -1;	/* value has not been set */

	cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);

	*cl = h;
	return (0);

bad:
	auth_destroy(h->cl_auth);
	kmem_free(p->cku_addr.buf, addr->maxlen);
	kmem_free(p, sizeof (struct cku_private));

	return (error);
}
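/*
 * A hedged sketch of how a kernel consumer would typically drive the
 * routine above (real callers normally arrive through the transport-generic
 * clnt_tli_kcreate() path; the program, version and retry values here are
 * placeholders, and error handling is elided):
 *
 *	CLIENT *client;
 *	int error;
 *
 *	error = clnt_clts_kcreate(&knconf, &srvaddr, prog, vers,
 *	    retries, CRED(), &client);
 *	if (error == 0) {
 *		... CLNT_CALL(client, ...) ...
 *		CLNT_DESTROY(client);
 *	}
 */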
void
clnt_clts_kinit(CLIENT *h, struct netbuf *addr, int retrys, cred_t *cred)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	struct rpcstat *rsp;

	rsp = zone_getspecific(rpcstat_zone_key, rpc_zone());
	ASSERT(rsp != NULL);

	p->cku_retrys = retrys;

	if (p->cku_addr.maxlen < addr->len) {
		if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
			kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);

		p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
		p->cku_addr.maxlen = addr->maxlen;
	}

	p->cku_addr.len = addr->len;
	bcopy(addr->buf, p->cku_addr.buf, addr->len);

	p->cku_cred = cred;
	p->cku_xid = 0;
	p->cku_timers = NULL;
	p->cku_timeall = NULL;
	p->cku_feedback = NULL;
	p->cku_bcast = FALSE;
	p->cku_call.call_xid = 0;
	p->cku_call.call_hash = 0;
	p->cku_call.call_notified = FALSE;
	p->cku_call.call_next = NULL;
	p->cku_call.call_prev = NULL;
	p->cku_call.call_reply = NULL;
	p->cku_call.call_wq = NULL;
	p->cku_stats = rsp->rpc_clts_client;
}

/*
 * Set the timers.  Return current retransmission timeout.
 */
static int
clnt_clts_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
    int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg,
    uint32_t xid)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	int value;

	p->cku_feedback = feedback;
	p->cku_feedarg = arg;
	p->cku_timers = t;
	p->cku_timeall = all;
	if (xid)
		p->cku_xid = xid;
	value = all->rt_rtxcur;
	value += t->rt_rtxcur;
	if (value < minimum)
		return (minimum);
	RCSTAT_INCR(p->cku_stats, rctimers);
	return (value);
}

/*
 * Time out back off function.  tim is in HZ
 */
#define	MAXTIMO	(20 * hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

#define	RETRY_POLL_TIMO	30
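/*
 * Worked example of the backoff above, assuming hz == 100 (so MAXTIMO is
 * 2000 ticks, i.e. 20 seconds) and an initial timeout of 300 ticks:
 * successive retransmissions wait 600, 1200 and then 2000 ticks, and every
 * retry after that stays clamped at MAXTIMO.
 */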
/*
 * Call remote procedure.
 * Most of the work of rpc is done here.  We serialize what is left
 * of the header (some was pre-serialized in the handle), serialize
 * the arguments, and send it off.  We wait for a reply or a time out.
 * Timeout causes an immediate return, other packet problems may cause
 * a retry on the receive.  When a good packet is received we deserialize
 * it, and check verification.  A bad reply code will cause one retry
 * with full (longhand) credentials.
 */
enum clnt_stat
clnt_clts_kcallit_addr(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
    struct timeval wait, struct netbuf *sin)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	XDR *xdrs;
	int stries = p->cku_retrys;
	int refreshes = REFRESHES;	/* number of times to refresh cred */
	int round_trip;			/* time the RPC */
	int error;
	mblk_t *mp;
	mblk_t *mpdup;
	mblk_t *resp = NULL;
	mblk_t *tmp;
	calllist_t *call = &p->cku_call;
	clock_t ori_timout, timout;
	bool_t interrupted;
	enum clnt_stat status;
	struct rpc_msg reply_msg;
	enum clnt_stat re_status;
	endpnt_t *endpt;

	RCSTAT_INCR(p->cku_stats, rccalls);

	RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_sec: %ld\n", wait.tv_sec);
	RPCLOG(2, "clnt_clts_kcallit_addr: wait.tv_usec: %ld\n", wait.tv_usec);

	timout = TIMEVAL_TO_TICK(&wait);
	ori_timout = timout;

	if (p->cku_xid == 0) {
		p->cku_xid = alloc_xid();
		if (p->cku_endpnt != NULL)
			endpnt_rele(p->cku_endpnt);
		p->cku_endpnt = NULL;
	}
	call->call_zoneid = rpc_zoneid();

	mpdup = NULL;
call_again:

	if (mpdup == NULL) {

		while ((mp = allocb(CKU_INITSIZE, BPRI_LO)) == NULL) {
			if (strwaitbuf(CKU_INITSIZE, BPRI_LO)) {
				p->cku_err.re_status = RPC_SYSTEMERROR;
				p->cku_err.re_errno = ENOSR;
				goto done;
			}
		}

		xdrs = &p->cku_outxdr;
		xdrmblk_init(xdrs, mp, XDR_ENCODE, CKU_ALLOCSIZE);

		if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
			/*
			 * Copy in the preserialized RPC header
			 * information.
			 */
			bcopy(p->cku_rpchdr, mp->b_rptr, CKU_HDRSIZE);

			/*
			 * transaction id is the 1st thing in the output
			 * buffer.
			 */
			/* LINTED pointer alignment */
			(*(uint32_t *)(mp->b_rptr)) = p->cku_xid;

			/* Skip the preserialized stuff. */
			XDR_SETPOS(xdrs, CKU_HDRSIZE);

			/* Serialize dynamic stuff into the output buffer. */
			if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
			    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
			    (!(*xdr_args)(xdrs, argsp))) {
				XDR_DESTROY(xdrs);
				freemsg(mp);
				p->cku_err.re_status = RPC_CANTENCODEARGS;
				p->cku_err.re_errno = EIO;
				goto done;
			}
		} else {
			uint32_t *uproc = (uint32_t *)
			    &p->cku_rpchdr[CKU_HDRSIZE];
			IXDR_PUT_U_INT32(uproc, procnum);

			(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
			XDR_SETPOS(xdrs, 0);

			/* Serialize the procedure number and the arguments. */
			if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
			    CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
				XDR_DESTROY(xdrs);
				freemsg(mp);
				p->cku_err.re_status = RPC_CANTENCODEARGS;
				p->cku_err.re_errno = EIO;
				goto done;
			}
		}

		XDR_DESTROY(xdrs);
	} else
		mp = mpdup;

	mpdup = dupmsg(mp);
	if (mpdup == NULL) {
		freemsg(mp);
		p->cku_err.re_status = RPC_SYSTEMERROR;
		p->cku_err.re_errno = ENOSR;
		goto done;
	}

	/*
	 * Grab an endpnt only if the endpoint is NULL.  We could be retrying
	 * the request and in this case we want to go through the same
	 * source port, so that the duplicate request cache may detect a
	 * retry.
	 */

	if (p->cku_endpnt == NULL)
		p->cku_endpnt = endpnt_get(&p->cku_config, p->cku_useresvport);

	if (p->cku_endpnt == NULL) {
		freemsg(mp);
		p->cku_err.re_status = RPC_SYSTEMERROR;
		p->cku_err.re_errno = ENOSR;
		goto done;
	}

	round_trip = ddi_get_lbolt();

	error = clnt_clts_dispatch_send(p->cku_endpnt->e_wq, mp,
	    &p->cku_addr, call, p->cku_xid, p->cku_cred);

	if (error != 0) {
		freemsg(mp);
		p->cku_err.re_status = RPC_CANTSEND;
		p->cku_err.re_errno = error;
		RCSTAT_INCR(p->cku_stats, rccantsend);
		goto done1;
	}

	RPCLOG(64, "clnt_clts_kcallit_addr: sent call for xid 0x%x\n",
	    p->cku_xid);

	/*
	 * There are two reasons for which we go back to tryread.
	 *
	 * a) In case the status is RPC_PROCUNAVAIL and we sent out a
	 *    broadcast we should not get any invalid messages with the
	 *    RPC_PROCUNAVAIL error back.  Some broken RPC implementations
	 *    send them and for this we have to ignore them (as we would
	 *    have never received them) and look for another message
	 *    which might contain the valid response because we don't know
	 *    how many broken implementations are in the network.  So we are
	 *    going to loop until
	 *    - we received a valid response
	 *    - we have processed all invalid responses and
	 *      got a time out when we try to receive again a
	 *      message.
	 *
	 * b) We will jump back to tryread also in case we failed
	 *    within the AUTH_VALIDATE.  In this case we should move
	 *    on and loop until we received a valid response or we
	 *    have processed all responses with broken authentication
	 *    and we got a time out when we try to receive a message.
	 */
tryread:
	mutex_enter(&call->call_lock);
	interrupted = FALSE;
	if (call->call_notified == FALSE) {
		klwp_t *lwp = ttolwp(curthread);
		clock_t cv_wait_ret = 1; /* init to > 0 */
		clock_t cv_timout = timout;

		if (lwp != NULL)
			lwp->lwp_nostop++;

		cv_timout += ddi_get_lbolt();

		if (h->cl_nosignal)
			while ((cv_wait_ret =
			    cv_timedwait(&call->call_cv,
			    &call->call_lock, cv_timout)) > 0 &&
			    call->call_notified == FALSE)
				;
		else
			while ((cv_wait_ret =
			    cv_timedwait_sig(&call->call_cv,
			    &call->call_lock, cv_timout)) > 0 &&
			    call->call_notified == FALSE)
				;

		if (cv_wait_ret == 0)
			interrupted = TRUE;

		if (lwp != NULL)
			lwp->lwp_nostop--;
	}
	resp = call->call_reply;
	call->call_reply = NULL;
	status = call->call_status;
	/*
	 * We have to reset the call_notified here.  In case we have
	 * to do a retry (e.g. in case we got a RPC_PROCUNAVAIL
	 * error) we need to set this to false to ensure that
	 * we will wait for the next message.  When the next message
	 * is going to arrive the function clnt_clts_dispatch_notify
	 * will set this to true again.
	 */
	call->call_notified = FALSE;
	call->call_status = RPC_TIMEDOUT;
	mutex_exit(&call->call_lock);

	if (status == RPC_TIMEDOUT) {
		if (interrupted) {
			/*
			 * We got interrupted, bail out
			 */
			p->cku_err.re_status = RPC_INTR;
			p->cku_err.re_errno = EINTR;
			goto done1;
		} else {
			RPCLOG(8, "clnt_clts_kcallit_addr: "
			    "request w/xid 0x%x timedout "
			    "waiting for reply\n", p->cku_xid);
#if 0 /* XXX not yet */
			/*
			 * Timeout may be due to a dead gateway.  Send
			 * an ioctl downstream advising deletion of
			 * route when we reach the half-way point to
			 * timing out.
			 */
			if (stries == p->cku_retrys/2) {
				t_kadvise(p->cku_endpnt->e_tiptr,
				    (uchar_t *)p->cku_addr.buf,
				    p->cku_addr.len);
			}
#endif /* not yet */
			p->cku_err.re_status = RPC_TIMEDOUT;
			p->cku_err.re_errno = ETIMEDOUT;
			RCSTAT_INCR(p->cku_stats, rctimeouts);
			goto done1;
		}
	}

	ASSERT(resp != NULL);

	/*
	 * Prepare the message for further processing.  We need to remove
	 * the datagram header and copy the source address if necessary.  No
	 * need to verify the header since rpcmod took care of that.
	 */
	/*
	 * Copy the source address if the caller has supplied a netbuf.
	 */
	if (sin != NULL) {
		union T_primitives *pptr;

		pptr = (union T_primitives *)resp->b_rptr;
		bcopy(resp->b_rptr + pptr->unitdata_ind.SRC_offset, sin->buf,
		    pptr->unitdata_ind.SRC_length);
		sin->len = pptr->unitdata_ind.SRC_length;
	}

	/*
	 * Pop off the datagram header.
	 * It was retained in rpcmodrput().
	 */
	tmp = resp;
	resp = resp->b_cont;
	tmp->b_cont = NULL;
	freeb(tmp);

	round_trip = ddi_get_lbolt() - round_trip;
	/*
	 * Van Jacobson timer algorithm here, only if NOT a retransmission.
	 */
	if (p->cku_timers != NULL && stries == p->cku_retrys) {
		int rt;

		rt = round_trip;
		rt -= (p->cku_timers->rt_srtt >> 3);
		p->cku_timers->rt_srtt += rt;
		if (rt < 0)
			rt = - rt;
		rt -= (p->cku_timers->rt_deviate >> 2);
		p->cku_timers->rt_deviate += rt;
		p->cku_timers->rt_rtxcur =
		    (clock_t)((p->cku_timers->rt_srtt >> 2) +
		    p->cku_timers->rt_deviate) >> 1;

		rt = round_trip;
		rt -= (p->cku_timeall->rt_srtt >> 3);
		p->cku_timeall->rt_srtt += rt;
		if (rt < 0)
			rt = - rt;
		rt -= (p->cku_timeall->rt_deviate >> 2);
		p->cku_timeall->rt_deviate += rt;
		p->cku_timeall->rt_rtxcur =
		    (clock_t)((p->cku_timeall->rt_srtt >> 2) +
		    p->cku_timeall->rt_deviate) >> 1;
		if (p->cku_feedback != NULL) {
			(*p->cku_feedback)(FEEDBACK_OK, procnum,
			    p->cku_feedarg);
		}
	}
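	/*
	 * In the update above rt_srtt is kept scaled by 8 and rt_deviate by
	 * 4 (the usual Van Jacobson fixed-point trick), so the retransmit
	 * timeout works out to roughly
	 *
	 *	rt_rtxcur ~= smoothed-RTT + 2 * mean-deviation
	 *
	 * e.g. a steady 10-tick round trip with no variance converges on
	 * rt_srtt == 80, rt_deviate == 0 and rt_rtxcur == 10.
	 */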
	/*
	 * Process reply
	 */
	xdrs = &(p->cku_inxdr);
	xdrmblk_init(xdrs, resp, XDR_DECODE, 0);

	reply_msg.rm_direction = REPLY;
	reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
	reply_msg.acpted_rply.ar_stat = SUCCESS;
	reply_msg.acpted_rply.ar_verf = _null_auth;
	/*
	 * xdr_results will be done in AUTH_UNWRAP.
	 */
	reply_msg.acpted_rply.ar_results.where = NULL;
	reply_msg.acpted_rply.ar_results.proc = xdr_void;

	/*
	 * Decode and validate the response.
	 */
	if (!xdr_replymsg(xdrs, &reply_msg)) {
		p->cku_err.re_status = RPC_CANTDECODERES;
		p->cku_err.re_errno = EIO;
		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		goto done1;
	}

	_seterr_reply(&reply_msg, &(p->cku_err));

	re_status = p->cku_err.re_status;
	if (re_status == RPC_SUCCESS) {
		/*
		 * Reply is good, check auth.
		 */
		if (!AUTH_VALIDATE(h->cl_auth,
		    &reply_msg.acpted_rply.ar_verf)) {
			p->cku_err.re_status = RPC_AUTHERROR;
			p->cku_err.re_why = AUTH_INVALIDRESP;
			RCSTAT_INCR(p->cku_stats, rcbadverfs);
			(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
			XDR_DESTROY(xdrs);
			goto tryread;
		}
		if (!AUTH_UNWRAP(h->cl_auth, xdrs, xdr_results, resultsp)) {
			p->cku_err.re_status = RPC_CANTDECODERES;
			p->cku_err.re_errno = EIO;
		}
		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		goto done1;
	}
	/* set errno in case we can't recover */
	if (re_status != RPC_VERSMISMATCH &&
	    re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH)
		p->cku_err.re_errno = EIO;
	/*
	 * Determine whether or not we're doing an RPC
	 * broadcast.  Some server implementations don't
	 * follow RFC 1050, section 7.4.2 in that they
	 * don't remain silent when they see a proc
	 * they don't support.  Therefore we keep trying
	 * to receive on RPC_PROCUNAVAIL, hoping to get
	 * a valid response from a compliant server.
	 */
	if (re_status == RPC_PROCUNAVAIL && p->cku_bcast) {
		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		goto tryread;
	}
	if (re_status == RPC_AUTHERROR) {

		(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
		XDR_DESTROY(xdrs);
		call_table_remove(call);
		if (call->call_reply != NULL) {
			freemsg(call->call_reply);
			call->call_reply = NULL;
		}

		/*
		 * Maybe our credential needs to be refreshed
		 */
		if (refreshes > 0 &&
		    AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) {
			/*
			 * The credential is refreshed.  Try the request again.
			 * Even if stries == 0, we still retry as long as
			 * refreshes > 0.  This prevents a soft authentication
			 * error turning into a hard one at an upper level.
			 */
			refreshes--;
			RCSTAT_INCR(p->cku_stats, rcbadcalls);
			RCSTAT_INCR(p->cku_stats, rcnewcreds);

			freemsg(mpdup);
			mpdup = NULL;
			freemsg(resp);
			resp = NULL;
			goto call_again;
		}
		/*
		 * We have used the client handle to do an AUTH_REFRESH
		 * and the RPC status may be set to RPC_SUCCESS;
		 * Let's make sure to set it to RPC_AUTHERROR.
		 */
		p->cku_err.re_status = RPC_CANTDECODERES;

		/*
		 * Map recoverable and unrecoverable
		 * authentication errors to appropriate errno
		 */
		switch (p->cku_err.re_why) {
		case AUTH_TOOWEAK:
			/*
			 * Could be an nfsportmon failure, set
			 * useresvport and try again.
			 */
			if (p->cku_useresvport != 1) {
				p->cku_useresvport = 1;

				freemsg(mpdup);
				mpdup = NULL;
				freemsg(resp);
				resp = NULL;

				endpt = p->cku_endpnt;
				if (endpt->e_tiptr != NULL) {
					mutex_enter(&endpt->e_lock);
					endpt->e_flags &= ~ENDPNT_BOUND;
					(void) t_kclose(endpt->e_tiptr, 1);
					endpt->e_tiptr = NULL;
					mutex_exit(&endpt->e_lock);

				}

				p->cku_xid = alloc_xid();
				endpnt_rele(p->cku_endpnt);
				p->cku_endpnt = NULL;
				goto call_again;
			}
			/* FALLTHRU */
		case AUTH_BADCRED:
		case AUTH_BADVERF:
		case AUTH_INVALIDRESP:
		case AUTH_FAILED:
		case RPCSEC_GSS_NOCRED:
		case RPCSEC_GSS_FAILED:
			p->cku_err.re_errno = EACCES;
			break;
		case AUTH_REJECTEDCRED:
		case AUTH_REJECTEDVERF:
		default:
			p->cku_err.re_errno = EIO;
			break;
		}
		RPCLOG(1, "clnt_clts_kcallit : authentication failed "
		    "with RPC_AUTHERROR of type %d\n",
		    p->cku_err.re_why);
		goto done;
	}

	(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
	XDR_DESTROY(xdrs);

done1:
	call_table_remove(call);
	if (call->call_reply != NULL) {
		freemsg(call->call_reply);
		call->call_reply = NULL;
	}
	RPCLOG(64, "clnt_clts_kcallit_addr: xid 0x%x taken off dispatch list",
	    p->cku_xid);

done:
	if (resp != NULL) {
		freemsg(resp);
		resp = NULL;
	}

	if ((p->cku_err.re_status != RPC_SUCCESS) &&
	    (p->cku_err.re_status != RPC_INTR) &&
	    (p->cku_err.re_status != RPC_UDERROR) &&
	    !IS_UNRECOVERABLE_RPC(p->cku_err.re_status)) {
		if (p->cku_feedback != NULL && stries == p->cku_retrys) {
			(*p->cku_feedback)(FEEDBACK_REXMIT1, procnum,
			    p->cku_feedarg);
		}

		timout = backoff(timout);
		if (p->cku_timeall != (struct rpc_timers *)0)
			p->cku_timeall->rt_rtxcur = timout;

		if (p->cku_err.re_status == RPC_SYSTEMERROR ||
		    p->cku_err.re_status == RPC_CANTSEND) {
			/*
			 * Errors due to lack of resources, wait a bit
			 * and try again.
			 */
			(void) delay(hz/10);
		}
		if (stries-- > 0) {
			RCSTAT_INCR(p->cku_stats, rcretrans);
			goto call_again;
		}
	}

	if (mpdup != NULL)
		freemsg(mpdup);

	if (p->cku_err.re_status != RPC_SUCCESS) {
		RCSTAT_INCR(p->cku_stats, rcbadcalls);
	}

	/*
	 * Allow the endpoint to be held by the client handle in case this
	 * RPC was not successful.  A retry may occur at a higher level and
	 * in this case we may want to send the request over the same
	 * source port.
	 * The endpoint is also released for a one-way RPC: no reply or
	 * retransmission is expected.
	 */
	if ((p->cku_err.re_status == RPC_SUCCESS ||
	    (p->cku_err.re_status == RPC_TIMEDOUT && ori_timout == 0)) &&
	    p->cku_endpnt != NULL) {
		endpnt_rele(p->cku_endpnt);
		p->cku_endpnt = NULL;
	} else {
		DTRACE_PROBE2(clnt_clts_kcallit_done, int, p->cku_err.re_status,
		    struct endpnt *, p->cku_endpnt);
	}

	return (p->cku_err.re_status);
}

static enum clnt_stat
clnt_clts_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
    struct timeval wait)
{
	return (clnt_clts_kcallit_addr(h, procnum, xdr_args, argsp,
	    xdr_results, resultsp, wait, NULL));
}

/*
 * Return error info on this handle.
 */
static void
clnt_clts_kerror(CLIENT *h, struct rpc_err *err)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);

	*err = p->cku_err;
}

/*ARGSUSED*/
static bool_t
clnt_clts_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
{
	xdr_free(xdr_res, res_ptr);

	return (TRUE);
}

/*ARGSUSED*/
static void
clnt_clts_kabort(CLIENT *h)
{
}

static bool_t
clnt_clts_kcontrol(CLIENT *h, int cmd, char *arg)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);

	switch (cmd) {
	case CLSET_XID:
		p->cku_xid = *((uint32_t *)arg);
		return (TRUE);

	case CLGET_XID:
		*((uint32_t *)arg) = p->cku_xid;
		return (TRUE);

	case CLSET_NODELAYONERR:
		/*
		 * CLTS does not have this functionality, but
		 * we return TRUE here to avoid error messages.
		 */
		return (TRUE);

	case CLGET_NODELAYONERR:
		/* CLTS does not support this functionality. */
		return (FALSE);

	case CLSET_BCAST:
		p->cku_bcast = *((uint32_t *)arg);
		return (TRUE);

	case CLGET_BCAST:
		*((uint32_t *)arg) = p->cku_bcast;
		return (TRUE);
	case CLSET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		if (*(int *)arg != 1 && *(int *)arg != 0)
			return (FALSE);

		p->cku_useresvport = *(int *)arg;

		return (TRUE);

	case CLGET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		*(int *)arg = p->cku_useresvport;

		return (TRUE);

	default:
		return (FALSE);
	}
}

/*
 * Destroy rpc handle.
 * Frees the space used for output buffer, private data, and handle
 * structure, and the file pointer/TLI data on last reference.
 */
static void
clnt_clts_kdestroy(CLIENT *h)
{
	/* LINTED pointer alignment */
	struct cku_private *p = htop(h);
	calllist_t *call = &p->cku_call;

	int plen;

	RPCLOG(8, "clnt_clts_kdestroy h: %p\n", (void *)h);
	RPCLOG(8, "clnt_clts_kdestroy h: xid=0x%x\n", p->cku_xid);

	if (p->cku_endpnt != NULL)
		endpnt_rele(p->cku_endpnt);

	cv_destroy(&call->call_cv);
	mutex_destroy(&call->call_lock);

	plen = strlen(p->cku_config.knc_protofmly) + 1;
	kmem_free(p->cku_config.knc_protofmly, plen);
	kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
	kmem_free(p, sizeof (*p));
}
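/*
 * The control points handled by clnt_clts_kcontrol() above are reached via
 * CLNT_CONTROL().  For example, a caller that wants a fixed transaction id
 * and a reserved source port might do something like this (an illustrative
 * sketch only):
 *
 *	uint32_t xid = 42;
 *	int resv = 1;
 *
 *	(void) CLNT_CONTROL(client, CLSET_XID, (char *)&xid);
 *	(void) CLNT_CONTROL(client, CLSET_BINDRESVPORT, (char *)&resv);
 */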
/*
 * The connectionless (CLTS) kRPC endpoint management subsystem.
 *
 * Because endpoints are potentially shared among threads making RPC calls,
 * they are managed in a pool according to type (endpnt_type_t).  Each
 * endpnt_type_t points to a list of usable endpoints through the e_pool
 * field, which is of type list_t.  list_t is a doubly-linked list.
 * The number of endpoints in the pool is stored in the e_cnt field of
 * endpnt_type_t and the endpoints are reference counted using the e_ref field
 * in the endpnt_t structure.
 *
 * As an optimization, endpoints that have no references are also linked
 * to an idle list via e_ilist which is also of type list_t.  When a thread
 * calls endpnt_get() to obtain a transport endpoint, the idle list is first
 * consulted and if such an endpoint exists, it is removed from the idle list
 * and returned to the caller.
 *
 * If the idle list is empty, then a check is made to see if more endpoints
 * can be created.  If so, we proceed and create a new endpoint which is added
 * to the pool and returned to the caller.  If we have reached the limit and
 * cannot make a new endpoint then one is returned to the caller via round-
 * robin policy.
 *
 * When an endpoint is placed on the idle list by a thread calling
 * endpnt_rele(), it is timestamped and then a reaper taskq is scheduled to
 * be dispatched if one hasn't already been.  When the timer fires, the
 * taskq traverses the idle list and checks to see which endpoints are
 * eligible to be closed.  It determines this by checking if the timestamp
 * when the endpoint was released has exceeded the threshold for how long
 * it should stay alive.
 *
 * endpnt_t structures remain persistent until the memory reclaim callback,
 * endpnt_reclaim(), is invoked.
 *
 * Here is an example of how the data structures would be laid out by the
 * subsystem:
 *
 *	endpnt_type_t
 *
 *	 loopback                               inet
 *	 _______________                        ______________
 *	| e_next        |--------------------->| e_next       |---->>
 *	| e_pool        |<---+                 | e_pool       |<----+
 *	| e_ilist       |<---+--+              | e_ilist      |<----+--+
 *   +->| e_pcurr       |----+--+--+       +-->| e_pcurr      |-----+--+--+
 *   |  | ...           |    |  |  |       |   | ...          |     |  |  |
 *   |  | e_itimer (90) |    |  |  |       |   | e_itimer (0) |     |  |  |
 *   |  | e_cnt (1)     |    |  |  |       |   | e_cnt (3)    |     |  |  |
 *   |  +---------------+    |  |  |       |   +--------------+     |  |  |
 *   |                       |  |  |       |                        |  |  |
 *   |  endpnt_t             |  |  |       |                        |  |  |
 *   |   ____________        |  |  |       |    ____________        |  |  |
 *   |  | e_node     |<------+  |  |       |   | e_node     |<------+  |  |
 *   |  | e_idle     |<---------+  |       |   | e_idle     |          |  |
 *   +--| e_type     |<------------+       +---| e_type     |          |  |
 *      | e_tiptr    |                         | e_tiptr    |          |  |
 *      | ...        |                         | ...        |          |  |
 *      | e_lock     |                         | e_lock     |          |  |
 *      | ...        |                         | ...        |          |  |
 *      | e_ref (0)  |                         | e_ref (2)  |          |  |
 *      | e_itime    |                         | e_itime    |          |  |
 *      +------------+                         +------------+          |  |
 *                                                                     |  |
 *                                                                     |  |
 *                                              ____________           |  |
 *                                             | e_node     |<---------+  |
 *                                             | e_idle     |<---------+--+
 *                                         +---| e_type     |          |
 *                                         |   | e_tiptr    |          |
 *                                         |   | ...        |          |
 *                                         |   | e_lock     |          |
 *                                         |   | ...        |          |
 *                                         |   | e_ref (0)  |          |
 *                                         |   | e_itime    |          |
 *                                         |   +------------+          |
 *                                         |                           |
 *                                         |                           |
 *                                         |    ____________           |
 *                                         |   | e_node     |<---------+
 *                                         |   | e_idle     |
 *                                         +---| e_type     |
 *                                             | e_tiptr    |
 *                                             | ...        |
 *                                             | e_lock     |
 *                                             | ...        |
 *                                             | e_ref (1)  |
 *                                             | e_itime    |
 *                                             +------------+
 *
 * Endpoint locking strategy:
 *
 * The following functions manipulate lists which hold the endpoint and the
 * endpoints themselves:
 *
 * endpnt_get()/check_endpnt()/endpnt_rele()/endpnt_reap()/do_endpnt_reclaim()
 *
 * Lock description follows:
 *
 * endpnt_type_lock:  Global reader/writer lock which protects accesses to the
 *	endpnt_type_list.
 *
 * e_plock:  Lock defined in the endpnt_type_t.  It is intended to
 *	protect accesses to the pool of endpoints (e_pool) for a given
 *	endpnt_type_t.
 *
 * e_ilock:  Lock defined in endpnt_type_t.  It is intended to protect accesses
 *	to the idle list (e_ilist) of available endpoints for a given
 *	endpnt_type_t.  It also protects access to the e_itimer, e_async_cv,
 *	and e_async_count fields in endpnt_type_t.
 *
 * e_lock:  Lock defined in the endpnt structure.  It is intended to protect
 *	flags, cv, and ref count.
 *
 * The order goes as follows so as not to induce deadlock.
 *
 * endpnt_type_lock -> e_plock -> e_ilock -> e_lock
 *
 * Interaction with Zones and shutting down:
 *
 * endpnt_type_ts are uniquely identified by the (e_zoneid, e_rdev, e_protofmly)
 * tuple, which means that a zone may not reuse another zone's idle endpoints
 * without first doing a t_kclose().
 *
 * A zone's endpnt_type_ts are destroyed when a zone is shut down; e_async_cv
 * and e_async_count are used to keep track of the threads in endpnt_taskq
 * trying to reap endpnt_ts in the endpnt_type_t.
 */
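/*
 * To make the lock ordering above concrete, a thread that needed to walk
 * from the type list down to an individual endpoint would nest its locks
 * roughly like this (a sketch only; the real paths below usually drop the
 * earlier locks before taking the later ones):
 *
 *	rw_enter(&endpnt_type_lock, RW_READER);
 *	mutex_enter(&np->e_plock);
 *	mutex_enter(&np->e_ilock);
 *	mutex_enter(&endp->e_lock);
 *	...
 *	mutex_exit(&endp->e_lock);
 *	mutex_exit(&np->e_ilock);
 *	mutex_exit(&np->e_plock);
 *	rw_exit(&endpnt_type_lock);
 */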
/*
 * Allocate and initialize an endpnt_type_t
 */
static struct endpnt_type *
endpnt_type_create(struct knetconfig *config)
{
	struct endpnt_type *etype;

	/*
	 * Allocate a new endpoint type to hang a list of
	 * endpoints off of it.
	 */
	etype = kmem_alloc(sizeof (struct endpnt_type), KM_SLEEP);
	etype->e_next = NULL;
	etype->e_pcurr = NULL;
	etype->e_itimer = 0;
	etype->e_cnt = 0;

	(void) strncpy(etype->e_protofmly, config->knc_protofmly, KNC_STRSIZE);
	mutex_init(&etype->e_plock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&etype->e_ilock, NULL, MUTEX_DEFAULT, NULL);
	etype->e_rdev = config->knc_rdev;
	etype->e_zoneid = rpc_zoneid();
	etype->e_async_count = 0;
	cv_init(&etype->e_async_cv, NULL, CV_DEFAULT, NULL);

	list_create(&etype->e_pool, sizeof (endpnt_t),
	    offsetof(endpnt_t, e_node));
	list_create(&etype->e_ilist, sizeof (endpnt_t),
	    offsetof(endpnt_t, e_idle));

	/*
	 * Check to see if we need to create a taskq for endpoint
	 * reaping
	 */
	mutex_enter(&endpnt_taskq_lock);
	if (taskq_created == FALSE) {
		taskq_created = TRUE;
		mutex_exit(&endpnt_taskq_lock);
		ASSERT(endpnt_taskq == NULL);
		endpnt_taskq = taskq_create("clts_endpnt_taskq", 1,
		    minclsyspri, 200, INT_MAX, 0);
	} else
		mutex_exit(&endpnt_taskq_lock);

	return (etype);
}

/*
 * Free an endpnt_type_t
 */
static void
endpnt_type_free(struct endpnt_type *etype)
{
	mutex_destroy(&etype->e_plock);
	mutex_destroy(&etype->e_ilock);
	list_destroy(&etype->e_pool);
	list_destroy(&etype->e_ilist);
	kmem_free(etype, sizeof (endpnt_type_t));
}

/*
 * Check the endpoint to ensure that it is suitable for use.
 *
 * Possible return values:
 *
 * return (1) - Endpoint is established, but needs to be re-opened.
 * return (0) && *newp == NULL - Endpoint is established, but unusable.
 * return (0) && *newp != NULL - Endpoint is established and usable.
 */
static int
check_endpnt(struct endpnt *endp, struct endpnt **newp)
{
	*newp = endp;

	mutex_enter(&endp->e_lock);
	ASSERT(endp->e_ref >= 1);

	/*
	 * The first condition we check for is if the endpoint has been
	 * allocated, but is unusable either because it has been closed or
	 * has been marked stale.  Only *one* thread will be allowed to
	 * execute the then clause.  This is enforced because the first thread
	 * to check this condition will clear the flags, so that subsequent
	 * thread(s) checking this endpoint will move on.
	 */
	if ((endp->e_flags & ENDPNT_ESTABLISHED) &&
	    (!(endp->e_flags & ENDPNT_BOUND) ||
	    (endp->e_flags & ENDPNT_STALE))) {
		/*
		 * Clear the flags here since they will be
		 * set again by this thread.  They need to be
		 * individually cleared because we want to maintain
		 * the state for ENDPNT_ONIDLE.
		 */
		endp->e_flags &= ~(ENDPNT_ESTABLISHED |
		    ENDPNT_WAITING | ENDPNT_BOUND | ENDPNT_STALE);
		mutex_exit(&endp->e_lock);
		return (1);
	}

	/*
	 * The second condition is meant for any thread that is waiting for
	 * an endpoint to become established.  It will cv_wait() until
	 * the condition for the endpoint has been changed to ENDPNT_BOUND or
	 * ENDPNT_STALE.
	 */
	while (!(endp->e_flags & ENDPNT_BOUND) &&
	    !(endp->e_flags & ENDPNT_STALE)) {
		endp->e_flags |= ENDPNT_WAITING;
		cv_wait(&endp->e_cv, &endp->e_lock);
	}

	ASSERT(endp->e_flags & ENDPNT_ESTABLISHED);

	/*
	 * The last case we check for is if the endpoint has been marked stale.
	 * If this is the case then set *newp to NULL and return, so that the
	 * caller is notified of the error and can take appropriate action.
	 */
	if (endp->e_flags & ENDPNT_STALE) {
		endp->e_ref--;
		*newp = NULL;
	}
	mutex_exit(&endp->e_lock);
	return (0);
}

#ifdef DEBUG
/*
 * Provide a fault injection setting to test error conditions.
 */
static int endpnt_get_return_null = 0;
#endif

/*
 * Returns a handle (struct endpnt *) to an open and bound endpoint
 * specified by the knetconfig passed in.  Returns NULL if no valid endpoint
 * can be obtained.
 */
static struct endpnt *
endpnt_get(struct knetconfig *config, int useresvport)
{
	struct endpnt_type *n_etype = NULL;
	struct endpnt_type *np = NULL;
	struct endpnt *new = NULL;
	struct endpnt *endp = NULL;
	struct endpnt *next = NULL;
	TIUSER *tiptr = NULL;
	int rtries = BINDRESVPORT_RETRIES;
	int i = 0;
	int error;
	int retval;
	zoneid_t zoneid = rpc_zoneid();
	cred_t *cr;

	RPCLOG(1, "endpnt_get: protofmly %s, ", config->knc_protofmly);
	RPCLOG(1, "rdev %ld\n", config->knc_rdev);

#ifdef DEBUG
	/*
	 * Inject fault if desired.  Pretend we have a stale endpoint
	 * and return NULL.
	 */
	if (endpnt_get_return_null > 0) {
		endpnt_get_return_null--;
		return (NULL);
	}
#endif
	rw_enter(&endpnt_type_lock, RW_READER);

top:
	for (np = endpnt_type_list; np != NULL; np = np->e_next)
		if ((np->e_zoneid == zoneid) &&
		    (np->e_rdev == config->knc_rdev) &&
		    (strcmp(np->e_protofmly,
		    config->knc_protofmly) == 0))
			break;

	if (np == NULL && n_etype != NULL) {
		ASSERT(rw_write_held(&endpnt_type_lock));

		/*
		 * Link the endpoint type onto the list
		 */
		n_etype->e_next = endpnt_type_list;
		endpnt_type_list = n_etype;
		np = n_etype;
		n_etype = NULL;
	}

	if (np == NULL) {
		/*
		 * The logic here is that we were unable to find an
		 * endpnt_type_t that matched our criteria, so we allocate a
		 * new one.  Because kmem_alloc() needs to be called with
		 * KM_SLEEP, we drop our locks so that we don't induce
		 * deadlock.  After allocating and initializing the
		 * endpnt_type_t, we reacquire the lock and go back to check
		 * if this entry needs to be added to the list.  Since we do
		 * some operations without any locking other threads may
		 * have been looking for the same endpnt_type_t and gone
		 * through this code path.  We check for this case and allow
		 * one thread to link its endpnt_type_t to the list and the
		 * other threads will simply free theirs.
		 */
		rw_exit(&endpnt_type_lock);
		n_etype = endpnt_type_create(config);

		/*
		 * We need to reacquire the lock with RW_WRITER here so that
		 * we can safely link the new endpoint type onto the list.
		 */
		rw_enter(&endpnt_type_lock, RW_WRITER);
		goto top;
	}

	rw_exit(&endpnt_type_lock);
	/*
	 * If n_etype is not NULL, then another thread was able to
	 * insert an endpnt_type_t of this type onto the list before
	 * we did.  Go ahead and free ours.
	 */
	if (n_etype != NULL)
		endpnt_type_free(n_etype);

	mutex_enter(&np->e_ilock);
	/*
	 * The algorithm to hand out endpoints is to first
	 * give out those that are idle if such endpoints
	 * exist.  Otherwise, create a new one if we haven't
	 * reached the max threshold.  Finally, we give out
	 * endpoints in a pseudo LRU fashion (round-robin).
	 *
	 * Note:  The idle list is merely a hint of those endpoints
	 * that should be idle.  There exists a window after the
	 * endpoint is released and before it is linked back onto the
	 * idle list where a thread could get a reference to it and
	 * use it.  This is okay, since the reference counts will
	 * still be consistent.
	 */
	if ((endp = (endpnt_t *)list_head(&np->e_ilist)) != NULL) {
		timeout_id_t t_id = 0;

		mutex_enter(&endp->e_lock);
		endp->e_ref++;
		endp->e_itime = 0;
		endp->e_flags &= ~ENDPNT_ONIDLE;
		mutex_exit(&endp->e_lock);

		/*
		 * Pop the endpoint off the idle list and hand it off
		 */
		list_remove(&np->e_ilist, endp);

		if (np->e_itimer != 0) {
			t_id = np->e_itimer;
			np->e_itimer = 0;
		}
		mutex_exit(&np->e_ilock);
		/*
		 * Reset the idle timer if it has been set
		 */
		if (t_id != (timeout_id_t)0)
			(void) untimeout(t_id);

		if (check_endpnt(endp, &new) == 0)
			return (new);
	} else if (np->e_cnt >= clnt_clts_max_endpoints) {
		/*
		 * There are no idle endpoints currently, so
		 * create a new one if we have not reached the maximum or
		 * hand one out in round-robin.
		 */
		mutex_exit(&np->e_ilock);
		mutex_enter(&np->e_plock);
		endp = np->e_pcurr;
		mutex_enter(&endp->e_lock);
		endp->e_ref++;
		mutex_exit(&endp->e_lock);

		ASSERT(endp != NULL);
		/*
		 * Advance the pointer to the next eligible endpoint, if
		 * necessary.
		 */
		if (np->e_cnt > 1) {
			next = (endpnt_t *)list_next(&np->e_pool, np->e_pcurr);
			if (next == NULL)
				next = (endpnt_t *)list_head(&np->e_pool);
			np->e_pcurr = next;
		}

		mutex_exit(&np->e_plock);

		/*
		 * We need to check to see if this endpoint is bound or
		 * not.  If it is in progress then just wait until
		 * the set up is complete
		 */
		if (check_endpnt(endp, &new) == 0)
			return (new);
	} else {
		mutex_exit(&np->e_ilock);
		mutex_enter(&np->e_plock);

		/*
		 * Allocate a new endpoint to use.  If we can't allocate any
		 * more memory then use one that is already established if any
		 * such endpoints exist.
		 */
		new = kmem_cache_alloc(endpnt_cache, KM_NOSLEEP);
		if (new == NULL) {
			RPCLOG0(1, "endpnt_get: kmem_cache_alloc failed\n");
			/*
			 * Try to recover by using an existing endpoint.
			 */
			if (np->e_cnt <= 0) {
				mutex_exit(&np->e_plock);
				return (NULL);
			}
			endp = np->e_pcurr;
			if ((next = list_next(&np->e_pool, np->e_pcurr)) !=
			    NULL)
				np->e_pcurr = next;
			ASSERT(endp != NULL);
			mutex_enter(&endp->e_lock);
			endp->e_ref++;
			mutex_exit(&endp->e_lock);
			mutex_exit(&np->e_plock);

			if (check_endpnt(endp, &new) == 0)
				return (new);
		} else {
			/*
			 * Partially init an endpoint structure and put
			 * it on the list, so that other interested threads
			 * know that one is being created
			 */
			bzero(new, sizeof (struct endpnt));

			cv_init(&new->e_cv, NULL, CV_DEFAULT, NULL);
			mutex_init(&new->e_lock, NULL, MUTEX_DEFAULT, NULL);
			new->e_ref = 1;
			new->e_type = np;

			/*
			 * Link the endpoint into the pool.
			 */
			list_insert_head(&np->e_pool, new);
			np->e_cnt++;
			if (np->e_pcurr == NULL)
				np->e_pcurr = new;
			mutex_exit(&np->e_plock);
		}
	}

	/*
	 * The transport should be opened with sufficient privs
	 */
	cr = zone_kcred();
	error = t_kopen(NULL, config->knc_rdev, FREAD|FWRITE|FNDELAY, &tiptr,
	    cr);
	if (error) {
		RPCLOG(1, "endpnt_get: t_kopen: %d\n", error);
		goto bad;
	}

	new->e_tiptr = tiptr;
	rpc_poptimod(tiptr->fp->f_vnode);

	/*
	 * Allow the kernel to push the module on behalf of the user.
	 */
	error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0,
	    K_TO_K, cr, &retval);
	if (error) {
		RPCLOG(1, "endpnt_get: kstr_push on rpcmod failed %d\n", error);
		goto bad;
	}

	error = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K,
	    cr, &retval);
	if (error) {
		RPCLOG(1, "endpnt_get: strioctl failed %d\n", error);
		goto bad;
	}

	/*
	 * Connectionless data flow should bypass the stream head.
	 */
	new->e_wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next;

	error = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0,
	    K_TO_K, cr, &retval);
	if (error) {
		RPCLOG(1, "endpnt_get: kstr_push on timod failed %d\n", error);
		goto bad;
	}

	/*
	 * Attempt to bind the endpoint.  If we fail then propagate the
	 * error back to the calling subsystem, so that it can be handled
	 * appropriately.
	 * If the caller has not specified reserved port usage then
	 * take the system default.
	 */
	if (useresvport == -1)
		useresvport = clnt_clts_do_bindresvport;

	if (useresvport &&
	    (strcmp(config->knc_protofmly, NC_INET) == 0 ||
	    strcmp(config->knc_protofmly, NC_INET6) == 0)) {

		while ((error =
		    bindresvport(new->e_tiptr, NULL, NULL, FALSE)) != 0) {
			RPCLOG(1,
			    "endpnt_get: bindresvport error %d\n", error);
			if (error != EPROTO) {
				if (rtries-- <= 0)
					goto bad;

				delay(hz << i++);
				continue;
			}

			(void) t_kclose(new->e_tiptr, 1);
			/*
			 * reopen with all privileges
			 */
			error = t_kopen(NULL, config->knc_rdev,
			    FREAD|FWRITE|FNDELAY,
			    &new->e_tiptr, cr);
			if (error) {
				RPCLOG(1, "endpnt_get: t_kopen: %d\n", error);
				new->e_tiptr = NULL;
				goto bad;
			}
		}
	} else if ((error = t_kbind(new->e_tiptr, NULL, NULL)) != 0) {
		RPCLOG(1, "endpnt_get: t_kbind failed: %d\n", error);
		goto bad;
	}

	/*
	 * Set the flags and notify any waiters that we have an established
	 * endpoint.
	 */
	mutex_enter(&new->e_lock);
	new->e_flags |= ENDPNT_ESTABLISHED;
	new->e_flags |= ENDPNT_BOUND;
	if (new->e_flags & ENDPNT_WAITING) {
		cv_broadcast(&new->e_cv);
		new->e_flags &= ~ENDPNT_WAITING;
	}
	mutex_exit(&new->e_lock);

	return (new);

bad:
	ASSERT(new != NULL);
	/*
	 * mark this endpoint as stale and notify any threads waiting
	 * on this endpoint that it will be going away.
	 */
	mutex_enter(&new->e_lock);
	if (new->e_ref > 0) {
		new->e_flags |= ENDPNT_ESTABLISHED;
		new->e_flags |= ENDPNT_STALE;
		if (new->e_flags & ENDPNT_WAITING) {
			cv_broadcast(&new->e_cv);
			new->e_flags &= ~ENDPNT_WAITING;
		}
	}
	new->e_ref--;
	new->e_tiptr = NULL;
	mutex_exit(&new->e_lock);

	/*
	 * If there was a transport endpoint opened, then close it.
	 */
	if (tiptr != NULL)
		(void) t_kclose(tiptr, 1);

	return (NULL);
}

/*
 * Release a reference to the endpoint
 */
static void
endpnt_rele(struct endpnt *sp)
{
	mutex_enter(&sp->e_lock);
	ASSERT(sp->e_ref > 0);
	sp->e_ref--;
	/*
	 * If the ref count is zero, then start the idle timer and link
	 * the endpoint onto the idle list.
	 */
	if (sp->e_ref == 0) {
		sp->e_itime = gethrestime_sec();

		/*
		 * Check to see if the endpoint is already linked to the idle
		 * list, so that we don't try to reinsert it.
		 */
		if (sp->e_flags & ENDPNT_ONIDLE) {
			mutex_exit(&sp->e_lock);
			mutex_enter(&sp->e_type->e_ilock);
			endpnt_reap_settimer(sp->e_type);
			mutex_exit(&sp->e_type->e_ilock);
			return;
		}

		sp->e_flags |= ENDPNT_ONIDLE;
		mutex_exit(&sp->e_lock);
		mutex_enter(&sp->e_type->e_ilock);
		list_insert_tail(&sp->e_type->e_ilist, sp);
		endpnt_reap_settimer(sp->e_type);
		mutex_exit(&sp->e_type->e_ilock);
	} else
		mutex_exit(&sp->e_lock);
}

static void
endpnt_reap_settimer(endpnt_type_t *etp)
{
	if (etp->e_itimer == (timeout_id_t)0)
		etp->e_itimer = timeout(endpnt_reap_dispatch, (void *)etp,
		    clnt_clts_taskq_dispatch_interval);
}
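/*
 * The expected calling pattern for endpnt_get()/endpnt_rele() above, as
 * used by clnt_clts_kcallit_addr() (a sketch only; error handling and the
 * retry case, which deliberately keeps the endpoint held, are elided):
 *
 *	endpnt_t *endp;
 *
 *	endp = endpnt_get(&p->cku_config, p->cku_useresvport);
 *	if (endp == NULL)
 *		return (RPC_SYSTEMERROR);
 *	... transmit on endp->e_wq ...
 *	endpnt_rele(endp);	release once no retransmission is expected
 */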
static void
endpnt_reap_dispatch(void *a)
{
	endpnt_type_t *etp = a;

	/*
	 * The idle timer has fired, so dispatch the taskq to close the
	 * endpoint.
	 */
	if (taskq_dispatch(endpnt_taskq, (task_func_t *)endpnt_reap, etp,
	    TQ_NOSLEEP) == TASKQID_INVALID)
		return;
	mutex_enter(&etp->e_ilock);
	etp->e_async_count++;
	mutex_exit(&etp->e_ilock);
}

/*
 * Traverse the idle list and close those endpoints that have reached their
 * timeout interval.
 */
static void
endpnt_reap(endpnt_type_t *etp)
{
	struct endpnt *e;
	struct endpnt *next_node = NULL;

	mutex_enter(&etp->e_ilock);
	e = list_head(&etp->e_ilist);
	while (e != NULL) {
		next_node = list_next(&etp->e_ilist, e);

		mutex_enter(&e->e_lock);
		if (e->e_ref > 0) {
			mutex_exit(&e->e_lock);
			e = next_node;
			continue;
		}

		ASSERT(e->e_ref == 0);
		if (e->e_itime > 0 &&
		    (e->e_itime + clnt_clts_endpoint_reap_interval) <
		    gethrestime_sec()) {
			e->e_flags &= ~ENDPNT_BOUND;
			(void) t_kclose(e->e_tiptr, 1);
			e->e_tiptr = NULL;
			e->e_itime = 0;
		}
		mutex_exit(&e->e_lock);
		e = next_node;
	}
	etp->e_itimer = 0;
	if (--etp->e_async_count == 0)
		cv_signal(&etp->e_async_cv);
	mutex_exit(&etp->e_ilock);
}

static void
endpnt_reclaim(zoneid_t zoneid)
{
	struct endpnt_type *np;
	struct endpnt *e;
	struct endpnt *next_node = NULL;
	list_t free_list;
	int rcnt = 0;

	list_create(&free_list, sizeof (endpnt_t), offsetof(endpnt_t, e_node));

	RPCLOG0(1, "endpnt_reclaim: reclaim callback started\n");
	rw_enter(&endpnt_type_lock, RW_READER);
	for (np = endpnt_type_list; np != NULL; np = np->e_next) {
		if (zoneid != ALL_ZONES && zoneid != np->e_zoneid)
			continue;

		mutex_enter(&np->e_plock);
		RPCLOG(1, "endpnt_reclaim: protofmly %s, ",
		    np->e_protofmly);
		RPCLOG(1, "rdev %ld\n", np->e_rdev);
		RPCLOG(1, "endpnt_reclaim: found %d endpoint(s)\n",
		    np->e_cnt);

		if (np->e_cnt == 0) {
			mutex_exit(&np->e_plock);
			continue;
		}

		/*
		 * The nice thing about maintaining an idle list is that if
		 * there are any endpoints to reclaim, they are going to be
		 * on this list.  Just go through and reap the ones that
		 * have ref counts of zero.
		 */
		mutex_enter(&np->e_ilock);
		e = list_head(&np->e_ilist);
		while (e != NULL) {
			next_node = list_next(&np->e_ilist, e);
			mutex_enter(&e->e_lock);
			if (e->e_ref > 0) {
				mutex_exit(&e->e_lock);
				e = next_node;
				continue;
			}
			ASSERT(e->e_ref == 0);
			mutex_exit(&e->e_lock);

			list_remove(&np->e_ilist, e);
			list_remove(&np->e_pool, e);
			list_insert_head(&free_list, e);

			rcnt++;
			np->e_cnt--;
			e = next_node;
		}
		mutex_exit(&np->e_ilock);
		/*
		 * Reset the current pointer to be safe
		 */
		if ((e = (struct endpnt *)list_head(&np->e_pool)) != NULL)
			np->e_pcurr = e;
		else {
			ASSERT(np->e_cnt == 0);
			np->e_pcurr = NULL;
		}

		mutex_exit(&np->e_plock);
	}
	rw_exit(&endpnt_type_lock);

	while ((e = list_head(&free_list)) != NULL) {
		list_remove(&free_list, e);
		if (e->e_tiptr != NULL)
			(void) t_kclose(e->e_tiptr, 1);

		cv_destroy(&e->e_cv);
		mutex_destroy(&e->e_lock);
		kmem_cache_free(endpnt_cache, e);
	}
	list_destroy(&free_list);
	RPCLOG(1, "endpnt_reclaim: reclaimed %d endpoint(s)\n", rcnt);
}
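/*
 * Putting the reap path together, the idle timeline for an endpoint looks
 * roughly like this (the default values of the tunables are assumed to be
 * filled in by this module's init code, which lives outside this excerpt):
 *
 *	t0			endpnt_rele() drops the last reference,
 *				stamps e_itime and sets the dispatch timer
 *	t0 + dispatch interval	the timeout fires and endpnt_reap_dispatch()
 *				hands endpnt_reap() to endpnt_taskq
 *	t0 + reap interval	endpnt_reap() finds e_itime old enough,
 *				clears ENDPNT_BOUND and t_kclose()s the
 *				transport
 */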

/*
 * Endpoint reclaim zones destructor callback routine.
 *
 * After reclaiming any cached entries, we go through the endpnt_type
 * list, canceling outstanding timeouts and freeing data structures.
 */
/* ARGSUSED */
static void
endpnt_destructor(zoneid_t zoneid, void *a)
{
	struct endpnt_type **npp;
	struct endpnt_type *np;
	struct endpnt_type *free_list = NULL;
	timeout_id_t t_id = 0;
	extern void clcleanup_zone(zoneid_t);
	extern void clcleanup4_zone(zoneid_t);

	/* Make sure NFS client handles are released. */
	clcleanup_zone(zoneid);
	clcleanup4_zone(zoneid);

	endpnt_reclaim(zoneid);
	/*
	 * We don't need to be holding on to any locks across the call to
	 * endpnt_reclaim() and the code below; we know that no-one can
	 * be holding open connections for this zone (all processes and kernel
	 * threads are gone), so nothing could be adding anything to the list.
	 */
	rw_enter(&endpnt_type_lock, RW_WRITER);
	npp = &endpnt_type_list;
	while ((np = *npp) != NULL) {
		if (np->e_zoneid != zoneid) {
			npp = &np->e_next;
			continue;
		}
		mutex_enter(&np->e_plock);
		mutex_enter(&np->e_ilock);
		if (np->e_itimer != 0) {
			t_id = np->e_itimer;
			np->e_itimer = 0;
		}
		ASSERT(np->e_cnt == 0);
		ASSERT(list_head(&np->e_pool) == NULL);
		ASSERT(list_head(&np->e_ilist) == NULL);

		mutex_exit(&np->e_ilock);
		mutex_exit(&np->e_plock);

		/*
		 * untimeout() any outstanding timers that have not yet fired.
		 */
		if (t_id != (timeout_id_t)0)
			(void) untimeout(t_id);
		*npp = np->e_next;
		np->e_next = free_list;
		free_list = np;
	}
	rw_exit(&endpnt_type_lock);

	while (free_list != NULL) {
		np = free_list;
		free_list = free_list->e_next;
		/*
		 * Wait for threads in endpnt_taskq trying to reap endpnt_ts in
		 * the endpnt_type_t.
		 */
		mutex_enter(&np->e_ilock);
		while (np->e_async_count > 0)
			cv_wait(&np->e_async_cv, &np->e_ilock);
		cv_destroy(&np->e_async_cv);
		mutex_destroy(&np->e_plock);
		mutex_destroy(&np->e_ilock);
		list_destroy(&np->e_pool);
		list_destroy(&np->e_ilist);
		kmem_free(np, sizeof (endpnt_type_t));
	}
}

/*
 * Endpoint reclaim kmem callback routine.
 */
/* ARGSUSED */
static void
endpnt_repossess(void *a)
{
	/*
	 * Reclaim idle endpoints from all zones.
	 */
	if (endpnt_taskq != NULL)
		(void) taskq_dispatch(endpnt_taskq,
		    (task_func_t *)(uintptr_t)endpnt_reclaim, (void *)ALL_ZONES,
		    TQ_NOSLEEP);
}
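
/*
 * Note that endpnt_repossess() above is installed as the reclaim callback
 * of endpnt_cache, and endpnt_destructor() as the zone destructor callback;
 * both registrations are made in clnt_clts_init() below.
 */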

/*
 * RPC request dispatch routine.  Constructs a datagram message and wraps it
 * around the RPC request to pass downstream.
 */
static int
clnt_clts_dispatch_send(queue_t *q, mblk_t *mp, struct netbuf *addr,
    calllist_t *cp, uint_t xid, cred_t *cr)
{
	mblk_t *bp;
	int msgsz;
	struct T_unitdata_req *udreq;

	/*
	 * Set up the call record.
	 */
	cp->call_wq = q;
	cp->call_xid = xid;
	cp->call_status = RPC_TIMEDOUT;
	cp->call_notified = FALSE;
	RPCLOG(64,
	    "clnt_clts_dispatch_send: putting xid 0x%x on "
	    "dispatch list\n", xid);
	cp->call_hash = call_hash(xid, clnt_clts_hash_size);
	cp->call_bucket = &clts_call_ht[cp->call_hash];
	call_table_enter(cp);

	/*
	 * Construct the datagram
	 */
	msgsz = (int)TUNITDATAREQSZ;
	/*
	 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will
	 * appear as -1.
	 */
	while (!(bp = allocb_cred(msgsz + addr->len, cr, NOPID))) {
		if (strwaitbuf(msgsz + addr->len, BPRI_LO))
			return (ENOSR);
	}

	udreq = (struct T_unitdata_req *)bp->b_wptr;
	udreq->PRIM_type = T_UNITDATA_REQ;
	udreq->DEST_length = addr->len;

	if (addr->len) {
		bcopy(addr->buf, bp->b_wptr + msgsz, addr->len);
		udreq->DEST_offset = (t_scalar_t)msgsz;
		msgsz += addr->len;
	} else
		udreq->DEST_offset = 0;
	udreq->OPT_length = 0;
	udreq->OPT_offset = 0;

	bp->b_datap->db_type = M_PROTO;
	bp->b_wptr += msgsz;

	/*
	 * Link the datagram header with the actual data
	 */
	linkb(bp, mp);

	/*
	 * Send downstream.
	 */
	if (canput(cp->call_wq)) {
		put(cp->call_wq, bp);
		return (0);
	}

	return (EIO);
}
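
/*
 * For reference, the message that clnt_clts_dispatch_send() passes
 * downstream is a two-part mblk chain:
 *
 *	bp (M_PROTO):	[ struct T_unitdata_req | destination address ]
 *	mp:		[ serialized RPC request, linked via linkb() ]
 *
 * DEST_offset in the T_unitdata_req points at the address that follows the
 * header; no options are passed (OPT_length is zero).
 */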

/*
 * RPC response delivery routine.  Deliver the response to the waiting
 * thread by matching the xid.
 */
void
clnt_clts_dispatch_notify(mblk_t *mp, int resp_off, zoneid_t zoneid)
{
	calllist_t *e = NULL;
	call_table_t *chtp;
	uint32_t xid;
	uint_t hash;
	unsigned char *hdr_offset;
	mblk_t *resp;

	/*
	 * If the RPC response is not contained in the same mblk as the
	 * datagram header, then move to the next mblk.
	 */
	hdr_offset = mp->b_rptr;
	resp = mp;
	if ((mp->b_wptr - (mp->b_rptr + resp_off)) == 0)
		resp = mp->b_cont;
	else
		resp->b_rptr += resp_off;

	ASSERT(resp != NULL);

	if ((IS_P2ALIGNED(resp->b_rptr, sizeof (uint32_t))) &&
	    (resp->b_wptr - resp->b_rptr) >= sizeof (xid))
		xid = *((uint32_t *)resp->b_rptr);
	else {
		int i = 0;
		unsigned char *p = (unsigned char *)&xid;
		unsigned char *rptr;
		mblk_t *tmp = resp;

		/*
		 * Copy the xid, byte by byte, out of the mblk chain.
		 */
		while (tmp) {
			rptr = tmp->b_rptr;
			while (rptr < tmp->b_wptr) {
				*p++ = *rptr++;
				if (++i >= sizeof (xid))
					goto done_xid_copy;
			}
			tmp = tmp->b_cont;
		}

		/*
		 * If we got here, we ran out of mblk space before the
		 * xid could be copied.
		 */
		ASSERT(tmp == NULL && i < sizeof (xid));

		RPCLOG0(1,
		    "clnt_dispatch_notify(clts): message less than "
		    "size of xid\n");

		freemsg(mp);
		return;
	}

done_xid_copy:

	/*
	 * Reset the read pointer back to the beginning of the protocol
	 * header if we moved it.
	 */
	if (mp->b_rptr != hdr_offset)
		mp->b_rptr = hdr_offset;

	hash = call_hash(xid, clnt_clts_hash_size);
	chtp = &clts_call_ht[hash];
	/* call_table_find returns with the hash bucket locked */
	call_table_find(chtp, xid, e);

	if (e != NULL) {
		mutex_enter(&e->call_lock);

		/*
		 * Verify that the reply arrived in the same zone that the
		 * request was sent from.
		 */
		if (e->call_zoneid != zoneid) {
			mutex_exit(&e->call_lock);
			mutex_exit(&chtp->ct_lock);
			RPCLOG0(8, "clnt_dispatch_notify (clts): incorrect "
			    "zoneid\n");
			freemsg(mp);
			return;
		}

		/*
		 * Found a thread waiting for this reply.
		 */
		if (e->call_reply) {
			RPCLOG(8,
			    "clnt_dispatch_notify (clts): discarding old "
			    "reply for xid 0x%x\n",
			    xid);
			freemsg(e->call_reply);
		}
		e->call_notified = TRUE;
		e->call_reply = mp;
		e->call_status = RPC_SUCCESS;
		cv_signal(&e->call_cv);
		mutex_exit(&e->call_lock);
		mutex_exit(&chtp->ct_lock);
	} else {
		zone_t *zone;
		struct rpcstat *rpcstat;

		mutex_exit(&chtp->ct_lock);
		RPCLOG(8, "clnt_dispatch_notify (clts): no caller for reply "
		    "0x%x\n", xid);
		freemsg(mp);
		/*
		 * This is unfortunate, but we need to look up the zone so we
		 * can increment its "rcbadxids" counter.
		 */
		zone = zone_find_by_id(zoneid);
		if (zone == NULL) {
			/*
			 * The zone went away...
			 */
			return;
		}
		rpcstat = zone_getspecific(rpcstat_zone_key, zone);
		if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
			/*
			 * Not interested.
			 */
			zone_rele(zone);
			return;
		}
		RCSTAT_INCR(rpcstat->rpc_clts_client, rcbadxids);
		zone_rele(zone);
	}
}

/*
 * Init routine.  Called when rpcmod is loaded.
 */
void
clnt_clts_init(void)
{
	endpnt_cache = kmem_cache_create("clnt_clts_endpnt_cache",
	    sizeof (struct endpnt), 0, NULL, NULL, endpnt_repossess, NULL,
	    NULL, 0);

	rw_init(&endpnt_type_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Perform simple bounds checking to make sure that the settings are
	 * reasonable.
	 */
	if (clnt_clts_max_endpoints <= 0) {
		if (clnt_clts_do_bindresvport)
			clnt_clts_max_endpoints = RESERVED_PORTSPACE;
		else
			clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;
	}

	if (clnt_clts_do_bindresvport &&
	    clnt_clts_max_endpoints > RESERVED_PORTSPACE)
		clnt_clts_max_endpoints = RESERVED_PORTSPACE;
	else if (clnt_clts_max_endpoints > NONRESERVED_PORTSPACE)
		clnt_clts_max_endpoints = NONRESERVED_PORTSPACE;

	if (clnt_clts_hash_size < DEFAULT_MIN_HASH_SIZE)
		clnt_clts_hash_size = DEFAULT_MIN_HASH_SIZE;

	/*
	 * Defer creating the taskq until rpcmod gets pushed.  If we are
	 * in diskless boot mode, rpcmod will get loaded early even before
	 * thread_create() is available.
	 */
	endpnt_taskq = NULL;
	taskq_created = FALSE;
	mutex_init(&endpnt_taskq_lock, NULL, MUTEX_DEFAULT, NULL);

	if (clnt_clts_endpoint_reap_interval < DEFAULT_ENDPOINT_REAP_INTERVAL)
		clnt_clts_endpoint_reap_interval =
		    DEFAULT_ENDPOINT_REAP_INTERVAL;

	/*
	 * Dispatch the taskq at an interval that is offset from the
	 * interval at which the endpoints should be reaped.
	 */
	clnt_clts_taskq_dispatch_interval =
	    (clnt_clts_endpoint_reap_interval + DEFAULT_INTERVAL_SHIFT) * hz;

	/*
	 * Initialize the completion queue.
	 */
	clts_call_ht = call_table_init(clnt_clts_hash_size);
	/*
	 * Initialize the zone destructor callback.
	 */
	zone_key_create(&endpnt_destructor_key, NULL, NULL, endpnt_destructor);
}

void
clnt_clts_fini(void)
{
	(void) zone_key_delete(endpnt_destructor_key);
}
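
/*
 * Illustrative tuning sketch (assumptions: this file is built into rpcmod
 * and the tunables keep their current names; out-of-range values are
 * adjusted by clnt_clts_init() above).  For example, in /etc/system:
 *
 *	set rpcmod:clnt_clts_endpoint_reap_interval = 120
 *	set rpcmod:clnt_clts_max_endpoints = 64
 */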