1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * nfs_cast.c : broadcast to a specific group of NFS servers
24 *
25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
29 #include <stdio.h>
30 #include <syslog.h>
31 #include <errno.h>
32 #include <string.h>
33 #include <sys/types.h>
34 #include <sys/time.h>
35 #include <sys/resource.h>
36 #include <unistd.h>
37 #include <stdlib.h>
38 #include <rpc/rpc.h>
39 #include <rpc/clnt_soc.h>
40 #include <rpc/nettype.h>
41 #include <rpc/pmap_prot.h>
42 #include <netconfig.h>
43 #include <netdir.h>
44 #include <nfs/nfs.h>
45 #define NFSCLIENT
46 #include <locale.h>
47 #include "automount.h"
48
49 #define PENALTY_WEIGHT 100000
50
/*
 * Per-address bookkeeping.  One of these is allocated for every
 * address returned for a host; entries for the same host are chained
 * through ts_next.
 */
struct tstamps {
	struct tstamps	*ts_next;	/* next address of the same host */
	int		ts_penalty;	/* weighting taken from mfs_penalty */
	int		ts_inx;		/* address index, added to the xid */
	int		ts_rcvd;	/* non-zero once a reply was matched */
	struct timeval	ts_timeval;	/* send time, then elapsed time */
};
58
/*
 * A list of addresses - all belonging to the same transport.
 * Each element describes one host on that transport.
 */
struct addrs {
	struct addrs		*addr_next;	/* next host */
	struct mapfs		*addr_mfs;	/* map entry being probed */
	struct nd_addrlist	*addr_addrs;	/* netdir lookup result */
	struct tstamps		*addr_if_tstamps; /* one per address */
};
67
/*
 * A list of connectionless transports.
 * Each element owns one open transport endpoint.
 */
struct transp {
	struct transp	*tr_next;	/* next transport */
	int		tr_fd;		/* descriptor returned by t_open() */
	char		*tr_device;	/* device name, borrowed from netconfig */
	struct t_bind	*tr_taddr;	/* t_alloc()ed receive address buffer */
	struct addrs	*tr_addrs;	/* hosts reached via this endpoint */
};
77
/*
 * A map entry paired with its (penalty-adjusted) roundtrip time.
 * An array of these is what gets sorted.
 */
struct sm {
	struct mapfs	*mfs;		/* the map entry */
	struct timeval	timeval;	/* its roundtrip time */
};
84
/* File-local helpers, defined after nfs_cast(). */
static void free_transports(struct transp *trans);
static void calc_resp_time(struct timeval *send_time);
static struct mapfs *sort_responses(struct transp *trans);
static int host_sm(const void *a, const void *b);
static int time_sm(const void *a, const void *b);

/* Not defined in this file. */
extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **,
    struct mapfs **);
92
93 /*
94 * This routine is designed to be able to "ping"
95 * a list of hosts and create a list of responding
96 * hosts sorted by response time.
97 * This must be done without any prior
98 * contact with the host - therefore the "ping"
99 * must be to a "well-known" address. The outstanding
100 * candidate here is the address of "rpcbind".
101 *
102 * A response to a ping is no guarantee that the host
103 * is running NFS, has a mount daemon, or exports
104 * the required filesystem. If the subsequent
105 * mount attempt fails then the host will be marked
106 * "ignore" and the host list will be re-pinged
107 * (sans the bad host). This process continues
108 * until a successful mount is achieved or until
109 * there are no hosts left to try.
110 */
111 enum clnt_stat
nfs_cast(struct mapfs * mfs_in,struct mapfs ** mfs_out,int timeout)112 nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout)
113 {
114 enum clnt_stat stat;
115 AUTH *sys_auth = authsys_create_default();
116 XDR xdr_stream;
117 register XDR *xdrs = &xdr_stream;
118 int outlen;
119 int if_inx;
120 int tsec;
121 int flag;
122 int sent, addr_cnt, rcvd, if_cnt;
123 fd_set readfds, mask;
124 register ulong_t xid; /* xid - unique per addr */
125 register int i;
126 struct rpc_msg msg;
127 struct timeval t, rcv_timeout;
128 char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE];
129 struct t_unitdata t_udata, t_rdata;
130 struct nd_hostserv hs;
131 struct nd_addrlist *retaddrs;
132 struct transp *tr_head;
133 struct transp *trans, *prev_trans;
134 struct addrs *a, *prev_addr;
135 struct tstamps *ts, *prev_ts;
136 NCONF_HANDLE *nc = NULL;
137 struct netconfig *nconf;
138 struct rlimit rl;
139 int dtbsize;
140 struct mapfs *mfs;
141
142 /*
143 * For each connectionless transport get a list of
144 * host addresses. Any single host may have
145 * addresses on several transports.
146 */
147 addr_cnt = sent = rcvd = 0;
148 tr_head = NULL;
149 FD_ZERO(&mask);
150
151 /*
152 * Set the default select size to be the maximum FD_SETSIZE, unless
153 * the current rlimit is lower.
154 */
155 dtbsize = FD_SETSIZE;
156 if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
157 if (rl.rlim_cur < FD_SETSIZE)
158 dtbsize = rl.rlim_cur;
159 }
160
161 prev_trans = NULL;
162 prev_addr = NULL;
163 prev_ts = NULL;
164 for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) {
165
166 if (trace > 2)
167 trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host);
168
169 nc = setnetconfig();
170 if (nc == NULL) {
171 stat = RPC_CANTSEND;
172 goto done_broad;
173 }
174 while (nconf = getnetconfig(nc)) {
175 if (!(nconf->nc_flag & NC_VISIBLE) ||
176 nconf->nc_semantics != NC_TPI_CLTS ||
177 (strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0))
178 continue;
179 trans = (struct transp *)malloc(sizeof (*trans));
180 if (trans == NULL) {
181 syslog(LOG_ERR, "no memory");
182 stat = RPC_CANTSEND;
183 goto done_broad;
184 }
185 (void) memset(trans, 0, sizeof (*trans));
186 if (tr_head == NULL)
187 tr_head = trans;
188 else
189 prev_trans->tr_next = trans;
190 prev_trans = trans;
191
192 trans->tr_fd = t_open(nconf->nc_device, O_RDWR, NULL);
193 if (trans->tr_fd < 0) {
194 syslog(LOG_ERR, "nfscast: t_open: %s:%m",
195 nconf->nc_device);
196 stat = RPC_CANTSEND;
197 goto done_broad;
198 }
199 if (t_bind(trans->tr_fd, (struct t_bind *)NULL,
200 (struct t_bind *)NULL) < 0) {
201 syslog(LOG_ERR, "nfscast: t_bind: %m");
202 stat = RPC_CANTSEND;
203 goto done_broad;
204 }
205 trans->tr_taddr =
206 /* LINTED pointer alignment */
207 (struct t_bind *)t_alloc(trans->tr_fd, T_BIND, T_ADDR);
208 if (trans->tr_taddr == (struct t_bind *)NULL) {
209 syslog(LOG_ERR, "nfscast: t_alloc: %m");
210 stat = RPC_SYSTEMERROR;
211 goto done_broad;
212 }
213
214 trans->tr_device = nconf->nc_device;
215 FD_SET(trans->tr_fd, &mask);
216
217 if_inx = 0;
218 hs.h_host = mfs->mfs_host;
219 hs.h_serv = "rpcbind";
220 if (netdir_getbyname(nconf, &hs, &retaddrs) == ND_OK) {
221
222 /*
223 * If mfs->ignore is previously set for
224 * this map, clear it. Because a host can
225 * have either v6 or v4 address
226 */
227 if (mfs->mfs_ignore == 1)
228 mfs->mfs_ignore = 0;
229
230 a = (struct addrs *)malloc(sizeof (*a));
231 if (a == NULL) {
232 syslog(LOG_ERR, "no memory");
233 stat = RPC_CANTSEND;
234 goto done_broad;
235 }
236 (void) memset(a, 0, sizeof (*a));
237 if (trans->tr_addrs == NULL)
238 trans->tr_addrs = a;
239 else
240 prev_addr->addr_next = a;
241 prev_addr = a;
242 a->addr_if_tstamps = NULL;
243 a->addr_mfs = mfs;
244 a->addr_addrs = retaddrs;
245 if_cnt = retaddrs->n_cnt;
246 while (if_cnt--) {
247 ts = (struct tstamps *)
248 malloc(sizeof (*ts));
249 if (ts == NULL) {
250 syslog(LOG_ERR, "no memory");
251 stat = RPC_CANTSEND;
252 goto done_broad;
253 }
254 (void) memset(ts, 0, sizeof (*ts));
255 ts->ts_penalty = mfs->mfs_penalty;
256 if (a->addr_if_tstamps == NULL)
257 a->addr_if_tstamps = ts;
258 else
259 prev_ts->ts_next = ts;
260 prev_ts = ts;
261 ts->ts_inx = if_inx++;
262 addr_cnt++;
263 }
264 break;
265 } else {
266 mfs->mfs_ignore = 1;
267 if (verbose)
268 syslog(LOG_ERR,
269 "%s:%s address not known",
270 mfs->mfs_host,
271 strcmp(nconf->nc_proto, NC_INET)?"IPv6":"IPv4");
272 }
273 } /* while */
274
275 endnetconfig(nc);
276 nc = NULL;
277 } /* for */
278 if (addr_cnt == 0) {
279 syslog(LOG_ERR, "nfscast: couldn't find addresses");
280 stat = RPC_CANTSEND;
281 goto done_broad;
282 }
283
284 (void) gettimeofday(&t, (struct timezone *)0);
285 xid = (getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF;
286 t.tv_usec = 0;
287
288 /* serialize the RPC header */
289
290 msg.rm_direction = CALL;
291 msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
292 msg.rm_call.cb_prog = RPCBPROG;
293 /*
294 * we can not use RPCBVERS here since it doesn't exist in 4.X,
295 * the fix to bug 1139883 has made the 4.X portmapper silent to
296 * version mismatches. This causes the RPC call to the remote
297 * portmapper to simply be ignored if it's not Version 2.
298 */
299 msg.rm_call.cb_vers = PMAPVERS;
300 msg.rm_call.cb_proc = NULLPROC;
301 if (sys_auth == (AUTH *)NULL) {
302 stat = RPC_SYSTEMERROR;
303 goto done_broad;
304 }
305 msg.rm_call.cb_cred = sys_auth->ah_cred;
306 msg.rm_call.cb_verf = sys_auth->ah_verf;
307 xdrmem_create(xdrs, outbuf, sizeof (outbuf), XDR_ENCODE);
308 if (! xdr_callmsg(xdrs, &msg)) {
309 stat = RPC_CANTENCODEARGS;
310 goto done_broad;
311 }
312 outlen = (int)xdr_getpos(xdrs);
313 xdr_destroy(xdrs);
314
315 t_udata.opt.len = 0;
316 t_udata.udata.buf = outbuf;
317 t_udata.udata.len = outlen;
318
319 /*
320 * Basic loop: send packet to all hosts and wait for response(s).
321 * The response timeout grows larger per iteration.
322 * A unique xid is assigned to each address in order to
323 * correctly match the replies.
324 */
325 for (tsec = 4; timeout > 0; tsec *= 2) {
326
327 timeout -= tsec;
328 if (timeout <= 0)
329 tsec += timeout;
330
331 rcv_timeout.tv_sec = tsec;
332 rcv_timeout.tv_usec = 0;
333
334 sent = 0;
335 for (trans = tr_head; trans; trans = trans->tr_next) {
336 for (a = trans->tr_addrs; a; a = a->addr_next) {
337 struct netbuf *if_netbuf =
338 a->addr_addrs->n_addrs;
339 ts = a->addr_if_tstamps;
340 if_cnt = a->addr_addrs->n_cnt;
341 while (if_cnt--) {
342
343 /*
344 * xid is the first thing in
345 * preserialized buffer
346 */
347 /* LINTED pointer alignment */
348 *((ulong_t *)outbuf) =
349 htonl(xid + ts->ts_inx);
350 (void) gettimeofday(&(ts->ts_timeval),
351 (struct timezone *)0);
352 /*
353 * Check if already received
354 * from a previous iteration.
355 */
356 if (ts->ts_rcvd) {
357 sent++;
358 ts = ts->ts_next;
359 continue;
360 }
361
362 t_udata.addr = *if_netbuf++;
363
364 if (t_sndudata(trans->tr_fd,
365 &t_udata) == 0) {
366 sent++;
367 }
368
369 ts = ts->ts_next;
370 }
371 }
372 }
373 if (sent == 0) { /* no packets sent ? */
374 stat = RPC_CANTSEND;
375 goto done_broad;
376 }
377
378 /*
379 * Have sent all the packets. Now collect the responses...
380 */
381 rcvd = 0;
382 recv_again:
383 msg.acpted_rply.ar_verf = _null_auth;
384 msg.acpted_rply.ar_results.proc = xdr_void;
385 readfds = mask;
386
387 switch (select(dtbsize, &readfds,
388 (fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) {
389
390 case 0: /* Timed out */
391 /*
392 * If we got at least one response in the
393 * last interval, then don't wait for any
394 * more. In theory we should wait for
395 * the max weighting (penalty) value so
396 * that a very slow server has a chance to
397 * respond but this could take a long time
398 * if the admin has set a high weighting
399 * value.
400 */
401 if (rcvd > 0)
402 goto done_broad;
403
404 stat = RPC_TIMEDOUT;
405 continue;
406
407 case -1: /* some kind of error */
408 if (errno == EINTR)
409 goto recv_again;
410 syslog(LOG_ERR, "nfscast: select: %m");
411 if (rcvd == 0)
412 stat = RPC_CANTRECV;
413 goto done_broad;
414
415 } /* end of select results switch */
416
417 for (trans = tr_head; trans; trans = trans->tr_next) {
418 if (FD_ISSET(trans->tr_fd, &readfds))
419 break;
420 }
421 if (trans == NULL)
422 goto recv_again;
423
424 try_again:
425 t_rdata.addr = trans->tr_taddr->addr;
426 t_rdata.udata.buf = inbuf;
427 t_rdata.udata.maxlen = sizeof (inbuf);
428 t_rdata.udata.len = 0;
429 t_rdata.opt.len = 0;
430 if (t_rcvudata(trans->tr_fd, &t_rdata, &flag) < 0) {
431 if (errno == EINTR)
432 goto try_again;
433 syslog(LOG_ERR, "nfscast: t_rcvudata: %s:%m",
434 trans->tr_device);
435 stat = RPC_CANTRECV;
436 continue;
437 }
438 if (t_rdata.udata.len < sizeof (ulong_t))
439 goto recv_again;
440 if (flag & T_MORE) {
441 syslog(LOG_ERR,
442 "nfscast: t_rcvudata: %s: buffer overflow",
443 trans->tr_device);
444 goto recv_again;
445 }
446
447 /*
448 * see if reply transaction id matches sent id.
449 * If so, decode the results.
450 * Note: received addr is ignored, it could be
451 * different from the send addr if the host has
452 * more than one addr.
453 */
454 xdrmem_create(xdrs, inbuf, (uint_t)t_rdata.udata.len,
455 XDR_DECODE);
456 if (xdr_replymsg(xdrs, &msg)) {
457 if (msg.rm_reply.rp_stat == MSG_ACCEPTED &&
458 (msg.rm_xid & ~0xFF) == xid) {
459 struct addrs *curr_addr;
460
461 i = msg.rm_xid & 0xFF;
462 for (curr_addr = trans->tr_addrs; curr_addr;
463 curr_addr = curr_addr->addr_next) {
464 for (ts = curr_addr->addr_if_tstamps; ts;
465 ts = ts->ts_next)
466 if (ts->ts_inx == i && !ts->ts_rcvd) {
467 ts->ts_rcvd = 1;
468 calc_resp_time(&ts->ts_timeval);
469 stat = RPC_SUCCESS;
470 rcvd++;
471 break;
472 }
473 }
474 } /* otherwise, we just ignore the errors ... */
475 }
476 xdrs->x_op = XDR_FREE;
477 msg.acpted_rply.ar_results.proc = xdr_void;
478 (void) xdr_replymsg(xdrs, &msg);
479 XDR_DESTROY(xdrs);
480 if (rcvd == sent)
481 goto done_broad;
482 else
483 goto recv_again;
484 }
485 if (!rcvd)
486 stat = RPC_TIMEDOUT;
487
488 done_broad:
489 if (rcvd) {
490 *mfs_out = sort_responses(tr_head);
491 stat = RPC_SUCCESS;
492 }
493 if (nc)
494 endnetconfig(nc);
495 free_transports(tr_head);
496 AUTH_DESTROY(sys_auth);
497 return (stat);
498 }
499
500 /*
501 * Go through all the responses and sort fastest to slowest.
502 * Note that any penalty is added to the response time - so the
503 * fastest response isn't necessarily the one that arrived first.
504 */
505 static struct mapfs *
sort_responses(trans)506 sort_responses(trans)
507 struct transp *trans;
508 {
509 struct transp *t;
510 struct addrs *a;
511 struct tstamps *ti;
512 int i, size = 0, allocsize = 10;
513 struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL;
514 struct sm *buffer;
515
516 buffer = (struct sm *)malloc(allocsize * sizeof (struct sm));
517 if (!buffer) {
518 syslog(LOG_ERR, "sort_responses: malloc error.\n");
519 return (NULL);
520 }
521
522 for (t = trans; t; t = t->tr_next) {
523 for (a = t->tr_addrs; a; a = a->addr_next) {
524 for (ti = a->addr_if_tstamps;
525 ti; ti = ti->ts_next) {
526 if (!ti->ts_rcvd)
527 continue;
528 ti->ts_timeval.tv_usec +=
529 (ti->ts_penalty * PENALTY_WEIGHT);
530 if (ti->ts_timeval.tv_usec >= 1000000) {
531 ti->ts_timeval.tv_sec +=
532 (ti->ts_timeval.tv_usec / 1000000);
533 ti->ts_timeval.tv_usec =
534 (ti->ts_timeval.tv_usec % 1000000);
535 }
536
537 if (size >= allocsize) {
538 allocsize += 10;
539 buffer = (struct sm *)realloc(buffer,
540 allocsize * sizeof (struct sm));
541 if (!buffer) {
542 syslog(LOG_ERR,
543 "sort_responses: malloc error.\n");
544 return (NULL);
545 }
546 }
547 buffer[size].timeval = ti->ts_timeval;
548 buffer[size].mfs = a->addr_mfs;
549 size++;
550 }
551 }
552 }
553
554 #ifdef DEBUG
555 if (trace > 3) {
556 trace_prt(1, " sort_responses: before host sort:\n");
557 for (i = 0; i < size; i++)
558 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host,
559 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
560 trace_prt(0, "\n");
561 }
562 #endif
563
564 qsort((void *)buffer, size, sizeof (struct sm), host_sm);
565
566 /*
567 * Cope with multiply listed hosts by choosing first time
568 */
569 for (i = 1; i < size; i++) {
570 #ifdef DEBUG
571 if (trace > 3) {
572 trace_prt(1, " sort_responses: comparing %s and %s\n",
573 buffer[i-1].mfs->mfs_host,
574 buffer[i].mfs->mfs_host);
575 }
576 #endif
577 if (strcmp(buffer[i-1].mfs->mfs_host,
578 buffer[i].mfs->mfs_host) == 0)
579 memcpy(&buffer[i].timeval, &buffer[i-1].timeval,
580 sizeof (struct timeval));
581 }
582 if (trace > 3)
583 trace_prt(0, "\n");
584
585 #ifdef DEBUG
586 if (trace > 3) {
587 trace_prt(1, " sort_responses: before time sort:\n");
588 for (i = 0; i < size; i++)
589 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host,
590 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
591 trace_prt(0, "\n");
592 }
593 #endif
594
595 qsort((void *)buffer, size, sizeof (struct sm), time_sm);
596
597 #ifdef DEBUG
598 if (trace > 3) {
599 trace_prt(1, " sort_responses: after sort:\n");
600 for (i = 0; i < size; i++)
601 trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host,
602 buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
603 trace_prt(0, "\n");
604 }
605 #endif
606
607 for (i = 0; i < size; i++) {
608 #ifdef DEBUG
609 if (trace > 3) {
610 trace_prt(1, " sort_responses: adding %s\n",
611 buffer[i].mfs->mfs_host);
612 }
613 #endif
614 p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail);
615 if (!p)
616 return (NULL);
617 }
618 free(buffer);
619
620 return (mfs_head);
621 }
622
623
624 /*
625 * Comparison routines called by qsort(3).
626 */
host_sm(const void * a,const void * b)627 static int host_sm(const void *a, const void *b)
628 {
629 return (strcmp(((struct sm *)a)->mfs->mfs_host,
630 ((struct sm *)b)->mfs->mfs_host));
631 }
632
time_sm(const void * a,const void * b)633 static int time_sm(const void *a, const void *b)
634 {
635 if (timercmp(&(((struct sm *)a)->timeval),
636 &(((struct sm *)b)->timeval), < /* cstyle */))
637 return (-1);
638 else if (timercmp(&(((struct sm *)a)->timeval),
639 &(((struct sm *)b)->timeval), > /* cstyle */))
640 return (1);
641 else
642 return (0);
643 }
644
/*
 * Given send_time which is the time a request
 * was transmitted to a server, subtract it
 * from the time "now" thereby converting it
 * to an elapsed time.
 *
 * send_time is overwritten in place with the difference.
 */
static void
calc_resp_time(struct timeval *send_time)
{
	struct timeval time_now;

	(void) gettimeofday(&time_now, NULL);

	/* borrow a second so the microsecond subtraction stays >= 0 */
	if (time_now.tv_usec < send_time->tv_usec) {
		time_now.tv_sec--;
		time_now.tv_usec += 1000000;
	}
	send_time->tv_sec = time_now.tv_sec - send_time->tv_sec;
	send_time->tv_usec = time_now.tv_usec - send_time->tv_usec;
}
665
666 static void
free_transports(trans)667 free_transports(trans)
668 struct transp *trans;
669 {
670 struct transp *t, *tmpt = NULL;
671 struct addrs *a, *tmpa = NULL;
672 struct tstamps *ts, *tmpts = NULL;
673
674 for (t = trans; t; t = tmpt) {
675 if (t->tr_taddr)
676 (void) t_free((char *)t->tr_taddr, T_BIND);
677 if (t->tr_fd > 0)
678 (void) t_close(t->tr_fd);
679 for (a = t->tr_addrs; a; a = tmpa) {
680 for (ts = a->addr_if_tstamps; ts; ts = tmpts) {
681 tmpts = ts->ts_next;
682 free(ts);
683 }
684 (void) netdir_free((char *)a->addr_addrs, ND_ADDRLIST);
685 tmpa = a->addr_next;
686 free(a);
687 }
688 tmpt = t->tr_next;
689 free(t);
690 }
691 }
692