1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * nfs_cast.c : broadcast to a specific group of NFS servers 24 * 25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 #include <stdio.h> 30 #include <syslog.h> 31 #include <errno.h> 32 #include <string.h> 33 #include <sys/types.h> 34 #include <sys/time.h> 35 #include <sys/resource.h> 36 #include <unistd.h> 37 #include <stdlib.h> 38 #include <rpc/rpc.h> 39 #include <rpc/clnt_soc.h> 40 #include <rpc/nettype.h> 41 #include <rpc/pmap_prot.h> 42 #include <netconfig.h> 43 #include <netdir.h> 44 #include <nfs/nfs.h> 45 #define NFSCLIENT 46 #include <locale.h> 47 #include "automount.h" 48 49 #define PENALTY_WEIGHT 100000 50 51 struct tstamps { 52 struct tstamps *ts_next; 53 int ts_penalty; 54 int ts_inx; 55 int ts_rcvd; 56 struct timeval ts_timeval; 57 }; 58 59 /* A list of addresses - all belonging to the same transport */ 60 61 struct addrs { 62 struct addrs *addr_next; 63 struct mapfs *addr_mfs; 64 struct nd_addrlist *addr_addrs; 65 struct tstamps *addr_if_tstamps; 66 }; 67 68 /* A list of connectionless transports */ 69 70 struct transp { 71 struct transp *tr_next; 72 int tr_fd; 73 char *tr_device; 74 struct t_bind *tr_taddr; 75 struct addrs *tr_addrs; 76 }; 77 78 /* A list of map entries and their roundtrip times, for sorting */ 79 80 struct sm { 81 struct mapfs *mfs; 82 struct timeval timeval; 83 }; 84 85 static void free_transports(struct transp *); 86 static void calc_resp_time(struct timeval *); 87 static struct mapfs *sort_responses(struct transp *); 88 static int host_sm(const void *, const void *b); 89 static int time_sm(const void *, const void *b); 90 extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **, 91 struct mapfs **); 92 93 /* 94 * This routine is designed to be able to "ping" 95 * a list of hosts and create a list of responding 96 * hosts sorted by response time. 97 * This must be done without any prior 98 * contact with the host - therefore the "ping" 99 * must be to a "well-known" address. The outstanding 100 * candidate here is the address of "rpcbind". 101 * 102 * A response to a ping is no guarantee that the host 103 * is running NFS, has a mount daemon, or exports 104 * the required filesystem. If the subsequent 105 * mount attempt fails then the host will be marked 106 * "ignore" and the host list will be re-pinged 107 * (sans the bad host). This process continues 108 * until a successful mount is achieved or until 109 * there are no hosts left to try. 110 */ 111 enum clnt_stat 112 nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout) 113 { 114 enum clnt_stat stat; 115 AUTH *sys_auth = authsys_create_default(); 116 XDR xdr_stream; 117 register XDR *xdrs = &xdr_stream; 118 int outlen; 119 int if_inx; 120 int tsec; 121 int flag; 122 int sent, addr_cnt, rcvd, if_cnt; 123 fd_set readfds, mask; 124 register ulong_t xid; /* xid - unique per addr */ 125 register int i; 126 struct rpc_msg msg; 127 struct timeval t, rcv_timeout; 128 char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE]; 129 struct t_unitdata t_udata, t_rdata; 130 struct nd_hostserv hs; 131 struct nd_addrlist *retaddrs; 132 struct transp *tr_head; 133 struct transp *trans, *prev_trans; 134 struct addrs *a, *prev_addr; 135 struct tstamps *ts, *prev_ts; 136 NCONF_HANDLE *nc = NULL; 137 struct netconfig *nconf; 138 struct rlimit rl; 139 int dtbsize; 140 struct mapfs *mfs; 141 142 /* 143 * For each connectionless transport get a list of 144 * host addresses. Any single host may have 145 * addresses on several transports. 146 */ 147 addr_cnt = sent = rcvd = 0; 148 tr_head = NULL; 149 FD_ZERO(&mask); 150 151 /* 152 * Set the default select size to be the maximum FD_SETSIZE, unless 153 * the current rlimit is lower. 154 */ 155 dtbsize = FD_SETSIZE; 156 if (getrlimit(RLIMIT_NOFILE, &rl) == 0) { 157 if (rl.rlim_cur < FD_SETSIZE) 158 dtbsize = rl.rlim_cur; 159 } 160 161 prev_trans = NULL; 162 prev_addr = NULL; 163 prev_ts = NULL; 164 for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) { 165 166 if (trace > 2) 167 trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host); 168 169 nc = setnetconfig(); 170 if (nc == NULL) { 171 stat = RPC_CANTSEND; 172 goto done_broad; 173 } 174 while (nconf = getnetconfig(nc)) { 175 if (!(nconf->nc_flag & NC_VISIBLE) || 176 nconf->nc_semantics != NC_TPI_CLTS || 177 (strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0)) 178 continue; 179 trans = (struct transp *)malloc(sizeof (*trans)); 180 if (trans == NULL) { 181 syslog(LOG_ERR, "no memory"); 182 stat = RPC_CANTSEND; 183 goto done_broad; 184 } 185 (void) memset(trans, 0, sizeof (*trans)); 186 if (tr_head == NULL) 187 tr_head = trans; 188 else 189 prev_trans->tr_next = trans; 190 prev_trans = trans; 191 192 trans->tr_fd = t_open(nconf->nc_device, O_RDWR, NULL); 193 if (trans->tr_fd < 0) { 194 syslog(LOG_ERR, "nfscast: t_open: %s:%m", 195 nconf->nc_device); 196 stat = RPC_CANTSEND; 197 goto done_broad; 198 } 199 if (t_bind(trans->tr_fd, (struct t_bind *)NULL, 200 (struct t_bind *)NULL) < 0) { 201 syslog(LOG_ERR, "nfscast: t_bind: %m"); 202 stat = RPC_CANTSEND; 203 goto done_broad; 204 } 205 trans->tr_taddr = 206 /* LINTED pointer alignment */ 207 (struct t_bind *)t_alloc(trans->tr_fd, T_BIND, T_ADDR); 208 if (trans->tr_taddr == (struct t_bind *)NULL) { 209 syslog(LOG_ERR, "nfscast: t_alloc: %m"); 210 stat = RPC_SYSTEMERROR; 211 goto done_broad; 212 } 213 214 trans->tr_device = nconf->nc_device; 215 FD_SET(trans->tr_fd, &mask); 216 217 if_inx = 0; 218 hs.h_host = mfs->mfs_host; 219 hs.h_serv = "rpcbind"; 220 if (netdir_getbyname(nconf, &hs, &retaddrs) == ND_OK) { 221 222 /* 223 * If mfs->ignore is previously set for 224 * this map, clear it. Because a host can 225 * have either v6 or v4 address 226 */ 227 if (mfs->mfs_ignore == 1) 228 mfs->mfs_ignore = 0; 229 230 a = (struct addrs *)malloc(sizeof (*a)); 231 if (a == NULL) { 232 syslog(LOG_ERR, "no memory"); 233 stat = RPC_CANTSEND; 234 goto done_broad; 235 } 236 (void) memset(a, 0, sizeof (*a)); 237 if (trans->tr_addrs == NULL) 238 trans->tr_addrs = a; 239 else 240 prev_addr->addr_next = a; 241 prev_addr = a; 242 a->addr_if_tstamps = NULL; 243 a->addr_mfs = mfs; 244 a->addr_addrs = retaddrs; 245 if_cnt = retaddrs->n_cnt; 246 while (if_cnt--) { 247 ts = (struct tstamps *) 248 malloc(sizeof (*ts)); 249 if (ts == NULL) { 250 syslog(LOG_ERR, "no memory"); 251 stat = RPC_CANTSEND; 252 goto done_broad; 253 } 254 (void) memset(ts, 0, sizeof (*ts)); 255 ts->ts_penalty = mfs->mfs_penalty; 256 if (a->addr_if_tstamps == NULL) 257 a->addr_if_tstamps = ts; 258 else 259 prev_ts->ts_next = ts; 260 prev_ts = ts; 261 ts->ts_inx = if_inx++; 262 addr_cnt++; 263 } 264 break; 265 } else { 266 mfs->mfs_ignore = 1; 267 if (verbose) 268 syslog(LOG_ERR, 269 "%s:%s address not known", 270 mfs->mfs_host, 271 strcmp(nconf->nc_proto, NC_INET)?"IPv6":"IPv4"); 272 } 273 } /* while */ 274 275 endnetconfig(nc); 276 nc = NULL; 277 } /* for */ 278 if (addr_cnt == 0) { 279 syslog(LOG_ERR, "nfscast: couldn't find addresses"); 280 stat = RPC_CANTSEND; 281 goto done_broad; 282 } 283 284 (void) gettimeofday(&t, (struct timezone *)0); 285 xid = (getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF; 286 t.tv_usec = 0; 287 288 /* serialize the RPC header */ 289 290 msg.rm_direction = CALL; 291 msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 292 msg.rm_call.cb_prog = RPCBPROG; 293 /* 294 * we can not use RPCBVERS here since it doesn't exist in 4.X, 295 * the fix to bug 1139883 has made the 4.X portmapper silent to 296 * version mismatches. This causes the RPC call to the remote 297 * portmapper to simply be ignored if it's not Version 2. 298 */ 299 msg.rm_call.cb_vers = PMAPVERS; 300 msg.rm_call.cb_proc = NULLPROC; 301 if (sys_auth == (AUTH *)NULL) { 302 stat = RPC_SYSTEMERROR; 303 goto done_broad; 304 } 305 msg.rm_call.cb_cred = sys_auth->ah_cred; 306 msg.rm_call.cb_verf = sys_auth->ah_verf; 307 xdrmem_create(xdrs, outbuf, sizeof (outbuf), XDR_ENCODE); 308 if (! xdr_callmsg(xdrs, &msg)) { 309 stat = RPC_CANTENCODEARGS; 310 goto done_broad; 311 } 312 outlen = (int)xdr_getpos(xdrs); 313 xdr_destroy(xdrs); 314 315 t_udata.opt.len = 0; 316 t_udata.udata.buf = outbuf; 317 t_udata.udata.len = outlen; 318 319 /* 320 * Basic loop: send packet to all hosts and wait for response(s). 321 * The response timeout grows larger per iteration. 322 * A unique xid is assigned to each address in order to 323 * correctly match the replies. 324 */ 325 for (tsec = 4; timeout > 0; tsec *= 2) { 326 327 timeout -= tsec; 328 if (timeout <= 0) 329 tsec += timeout; 330 331 rcv_timeout.tv_sec = tsec; 332 rcv_timeout.tv_usec = 0; 333 334 sent = 0; 335 for (trans = tr_head; trans; trans = trans->tr_next) { 336 for (a = trans->tr_addrs; a; a = a->addr_next) { 337 struct netbuf *if_netbuf = 338 a->addr_addrs->n_addrs; 339 ts = a->addr_if_tstamps; 340 if_cnt = a->addr_addrs->n_cnt; 341 while (if_cnt--) { 342 343 /* 344 * xid is the first thing in 345 * preserialized buffer 346 */ 347 /* LINTED pointer alignment */ 348 *((ulong_t *)outbuf) = 349 htonl(xid + ts->ts_inx); 350 (void) gettimeofday(&(ts->ts_timeval), 351 (struct timezone *)0); 352 /* 353 * Check if already received 354 * from a previous iteration. 355 */ 356 if (ts->ts_rcvd) { 357 sent++; 358 ts = ts->ts_next; 359 continue; 360 } 361 362 t_udata.addr = *if_netbuf++; 363 364 if (t_sndudata(trans->tr_fd, 365 &t_udata) == 0) { 366 sent++; 367 } 368 369 ts = ts->ts_next; 370 } 371 } 372 } 373 if (sent == 0) { /* no packets sent ? */ 374 stat = RPC_CANTSEND; 375 goto done_broad; 376 } 377 378 /* 379 * Have sent all the packets. Now collect the responses... 380 */ 381 rcvd = 0; 382 recv_again: 383 msg.acpted_rply.ar_verf = _null_auth; 384 msg.acpted_rply.ar_results.proc = xdr_void; 385 readfds = mask; 386 387 switch (select(dtbsize, &readfds, 388 (fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) { 389 390 case 0: /* Timed out */ 391 /* 392 * If we got at least one response in the 393 * last interval, then don't wait for any 394 * more. In theory we should wait for 395 * the max weighting (penalty) value so 396 * that a very slow server has a chance to 397 * respond but this could take a long time 398 * if the admin has set a high weighting 399 * value. 400 */ 401 if (rcvd > 0) 402 goto done_broad; 403 404 stat = RPC_TIMEDOUT; 405 continue; 406 407 case -1: /* some kind of error */ 408 if (errno == EINTR) 409 goto recv_again; 410 syslog(LOG_ERR, "nfscast: select: %m"); 411 if (rcvd == 0) 412 stat = RPC_CANTRECV; 413 goto done_broad; 414 415 } /* end of select results switch */ 416 417 for (trans = tr_head; trans; trans = trans->tr_next) { 418 if (FD_ISSET(trans->tr_fd, &readfds)) 419 break; 420 } 421 if (trans == NULL) 422 goto recv_again; 423 424 try_again: 425 t_rdata.addr = trans->tr_taddr->addr; 426 t_rdata.udata.buf = inbuf; 427 t_rdata.udata.maxlen = sizeof (inbuf); 428 t_rdata.udata.len = 0; 429 t_rdata.opt.len = 0; 430 if (t_rcvudata(trans->tr_fd, &t_rdata, &flag) < 0) { 431 if (errno == EINTR) 432 goto try_again; 433 syslog(LOG_ERR, "nfscast: t_rcvudata: %s:%m", 434 trans->tr_device); 435 stat = RPC_CANTRECV; 436 continue; 437 } 438 if (t_rdata.udata.len < sizeof (ulong_t)) 439 goto recv_again; 440 if (flag & T_MORE) { 441 syslog(LOG_ERR, 442 "nfscast: t_rcvudata: %s: buffer overflow", 443 trans->tr_device); 444 goto recv_again; 445 } 446 447 /* 448 * see if reply transaction id matches sent id. 449 * If so, decode the results. 450 * Note: received addr is ignored, it could be 451 * different from the send addr if the host has 452 * more than one addr. 453 */ 454 xdrmem_create(xdrs, inbuf, (uint_t)t_rdata.udata.len, 455 XDR_DECODE); 456 if (xdr_replymsg(xdrs, &msg)) { 457 if (msg.rm_reply.rp_stat == MSG_ACCEPTED && 458 (msg.rm_xid & ~0xFF) == xid) { 459 struct addrs *curr_addr; 460 461 i = msg.rm_xid & 0xFF; 462 for (curr_addr = trans->tr_addrs; curr_addr; 463 curr_addr = curr_addr->addr_next) { 464 for (ts = curr_addr->addr_if_tstamps; ts; 465 ts = ts->ts_next) 466 if (ts->ts_inx == i && !ts->ts_rcvd) { 467 ts->ts_rcvd = 1; 468 calc_resp_time(&ts->ts_timeval); 469 stat = RPC_SUCCESS; 470 rcvd++; 471 break; 472 } 473 } 474 } /* otherwise, we just ignore the errors ... */ 475 } 476 xdrs->x_op = XDR_FREE; 477 msg.acpted_rply.ar_results.proc = xdr_void; 478 (void) xdr_replymsg(xdrs, &msg); 479 XDR_DESTROY(xdrs); 480 if (rcvd == sent) 481 goto done_broad; 482 else 483 goto recv_again; 484 } 485 if (!rcvd) 486 stat = RPC_TIMEDOUT; 487 488 done_broad: 489 if (rcvd) { 490 *mfs_out = sort_responses(tr_head); 491 stat = RPC_SUCCESS; 492 } 493 if (nc) 494 endnetconfig(nc); 495 free_transports(tr_head); 496 AUTH_DESTROY(sys_auth); 497 return (stat); 498 } 499 500 /* 501 * Go through all the responses and sort fastest to slowest. 502 * Note that any penalty is added to the response time - so the 503 * fastest response isn't necessarily the one that arrived first. 504 */ 505 static struct mapfs * 506 sort_responses(trans) 507 struct transp *trans; 508 { 509 struct transp *t; 510 struct addrs *a; 511 struct tstamps *ti; 512 int i, size = 0, allocsize = 10; 513 struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL; 514 struct sm *buffer; 515 516 buffer = (struct sm *)malloc(allocsize * sizeof (struct sm)); 517 if (!buffer) { 518 syslog(LOG_ERR, "sort_responses: malloc error.\n"); 519 return (NULL); 520 } 521 522 for (t = trans; t; t = t->tr_next) { 523 for (a = t->tr_addrs; a; a = a->addr_next) { 524 for (ti = a->addr_if_tstamps; 525 ti; ti = ti->ts_next) { 526 if (!ti->ts_rcvd) 527 continue; 528 ti->ts_timeval.tv_usec += 529 (ti->ts_penalty * PENALTY_WEIGHT); 530 if (ti->ts_timeval.tv_usec >= 1000000) { 531 ti->ts_timeval.tv_sec += 532 (ti->ts_timeval.tv_usec / 1000000); 533 ti->ts_timeval.tv_usec = 534 (ti->ts_timeval.tv_usec % 1000000); 535 } 536 537 if (size >= allocsize) { 538 allocsize += 10; 539 buffer = (struct sm *)realloc(buffer, 540 allocsize * sizeof (struct sm)); 541 if (!buffer) { 542 syslog(LOG_ERR, 543 "sort_responses: malloc error.\n"); 544 return (NULL); 545 } 546 } 547 buffer[size].timeval = ti->ts_timeval; 548 buffer[size].mfs = a->addr_mfs; 549 size++; 550 } 551 } 552 } 553 554 #ifdef DEBUG 555 if (trace > 3) { 556 trace_prt(1, " sort_responses: before host sort:\n"); 557 for (i = 0; i < size; i++) 558 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, 559 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); 560 trace_prt(0, "\n"); 561 } 562 #endif 563 564 qsort((void *)buffer, size, sizeof (struct sm), host_sm); 565 566 /* 567 * Cope with multiply listed hosts by choosing first time 568 */ 569 for (i = 1; i < size; i++) { 570 #ifdef DEBUG 571 if (trace > 3) { 572 trace_prt(1, " sort_responses: comparing %s and %s\n", 573 buffer[i-1].mfs->mfs_host, 574 buffer[i].mfs->mfs_host); 575 } 576 #endif 577 if (strcmp(buffer[i-1].mfs->mfs_host, 578 buffer[i].mfs->mfs_host) == 0) 579 memcpy(&buffer[i].timeval, &buffer[i-1].timeval, 580 sizeof (struct timeval)); 581 } 582 if (trace > 3) 583 trace_prt(0, "\n"); 584 585 #ifdef DEBUG 586 if (trace > 3) { 587 trace_prt(1, " sort_responses: before time sort:\n"); 588 for (i = 0; i < size; i++) 589 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, 590 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); 591 trace_prt(0, "\n"); 592 } 593 #endif 594 595 qsort((void *)buffer, size, sizeof (struct sm), time_sm); 596 597 #ifdef DEBUG 598 if (trace > 3) { 599 trace_prt(1, " sort_responses: after sort:\n"); 600 for (i = 0; i < size; i++) 601 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, 602 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); 603 trace_prt(0, "\n"); 604 } 605 #endif 606 607 for (i = 0; i < size; i++) { 608 #ifdef DEBUG 609 if (trace > 3) { 610 trace_prt(1, " sort_responses: adding %s\n", 611 buffer[i].mfs->mfs_host); 612 } 613 #endif 614 p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail); 615 if (!p) 616 return (NULL); 617 } 618 free(buffer); 619 620 return (mfs_head); 621 } 622 623 624 /* 625 * Comparison routines called by qsort(3). 626 */ 627 static int host_sm(const void *a, const void *b) 628 { 629 return (strcmp(((struct sm *)a)->mfs->mfs_host, 630 ((struct sm *)b)->mfs->mfs_host)); 631 } 632 633 static int time_sm(const void *a, const void *b) 634 { 635 if (timercmp(&(((struct sm *)a)->timeval), 636 &(((struct sm *)b)->timeval), < /* cstyle */)) 637 return (-1); 638 else if (timercmp(&(((struct sm *)a)->timeval), 639 &(((struct sm *)b)->timeval), > /* cstyle */)) 640 return (1); 641 else 642 return (0); 643 } 644 645 /* 646 * Given send_time which is the time a request 647 * was transmitted to a server, subtract it 648 * from the time "now" thereby converting it 649 * to an elapsed time. 650 */ 651 static void 652 calc_resp_time(send_time) 653 struct timeval *send_time; 654 { 655 struct timeval time_now; 656 657 (void) gettimeofday(&time_now, (struct timezone *)0); 658 if (time_now.tv_usec < send_time->tv_usec) { 659 time_now.tv_sec--; 660 time_now.tv_usec += 1000000; 661 } 662 send_time->tv_sec = time_now.tv_sec - send_time->tv_sec; 663 send_time->tv_usec = time_now.tv_usec - send_time->tv_usec; 664 } 665 666 static void 667 free_transports(trans) 668 struct transp *trans; 669 { 670 struct transp *t, *tmpt = NULL; 671 struct addrs *a, *tmpa = NULL; 672 struct tstamps *ts, *tmpts = NULL; 673 674 for (t = trans; t; t = tmpt) { 675 if (t->tr_taddr) 676 (void) t_free((char *)t->tr_taddr, T_BIND); 677 if (t->tr_fd > 0) 678 (void) t_close(t->tr_fd); 679 for (a = t->tr_addrs; a; a = tmpa) { 680 for (ts = a->addr_if_tstamps; ts; ts = tmpts) { 681 tmpts = ts->ts_next; 682 free(ts); 683 } 684 (void) netdir_free((char *)a->addr_addrs, ND_ADDRLIST); 685 tmpa = a->addr_next; 686 free(a); 687 } 688 tmpt = t->tr_next; 689 free(t); 690 } 691 } 692