/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static const char *rdev = "/dev/nsctl"; /* * Define a minimal user stack size in bytes over and above the * libthread THR_MIN_STACK minimum value. * * This stack size needs to be sufficient to run _newlwp() and then * ioctl() down into the kernel. */ #define NSK_STACK_SIZE 512 /* * LWP scheduling control switches. * * allow_pri - set to non-zero to enable priocntl() manipulations of * created LWPs. * allow_rt - set to non-zero to use the RT rather than the TS * scheduling class when manipulating the scheduling * parameters for an LWP. Only used if allow_pri is * non-zero. 
*/ static int allow_pri = 1; static int allow_rt = 0; /* disallow - bad interactions with timeout() */ static int nsctl_fd = -1; static int sigterm; static int nthreads; /* number of threads in the kernel */ static int exiting; /* shutdown in progress flag */ static mutex_t thr_mutex = DEFAULTMUTEX; static mutex_t cfg_mutex = DEFAULTMUTEX; static int cl_nodeid = -1; static int display_msg = 0; static int delay_time = 30; static void usage(void) { (void) fprintf(stderr, gettext("usage: nskernd\n")); exit(255); } static void sighand(int sig) { if (sig == SIGTERM) { sigterm++; } } /* * Returns: 1 - can enter kernel; 0 - shutdown in progress, do not enter kernel */ int nthread_inc(void) { (void) mutex_lock(&thr_mutex); if (exiting) { /* cannot enter kernel as nskernd is being shutdown - exit */ (void) mutex_unlock(&thr_mutex); return (0); } nthreads++; (void) mutex_unlock(&thr_mutex); return (1); } void nthread_dec(void) { (void) mutex_lock(&thr_mutex); nthreads--; (void) mutex_unlock(&thr_mutex); } /* * returns: 1 - can shutdown; 0 - unable to shutdown */ int canshutdown(void) { int rc = 1; time_t start_delay; (void) mutex_lock(&thr_mutex); if (nthreads > 0) { if (display_msg) { (void) fprintf(stderr, gettext("nskernd: unable to shutdown: " "%d kernel threads in use\n"), nthreads); } start_delay = time(0); while (nthreads > 0 && (time(0) - start_delay) < delay_time) { (void) mutex_unlock(&thr_mutex); (void) sleep(1); (void) mutex_lock(&thr_mutex); (void) fprintf(stderr, gettext("nskernd: delay shutdown: " "%d kernel threads in use\n"), nthreads); } if (nthreads > 0) { rc = 0; } else { exiting = 1; } } else { /* flag shutdown in progress */ exiting = 1; } (void) mutex_unlock(&thr_mutex); return (rc); } /* * returns: 1 - shutdown successful; 0 - unable to shutdown */ int shutdown(void) { struct nskernd data; int rc; if (nsctl_fd < 0) return (1); bzero(&data, sizeof (data)); data.command = NSKERND_STOP; if (!canshutdown()) { return (0); } rc = ioctl(nsctl_fd, 
NSCIOC_NSKERND, &data); if (rc < 0) { if (errno != EINTR || !sigterm) { (void) fprintf(stderr, gettext("nskernd: NSKERND_STOP failed\n")); } } return (1); } /* * First function run by a NSKERND_NEWLWP thread. * * Determines if it needs to change the scheduling priority of the LWP, * and then calls back into the kernel. */ static void * _newlwp(void *arg) { struct nskernd nsk; pcparms_t pcparms; pcinfo_t pcinfo; /* copy arguments onto stack and free heap memory */ bcopy(arg, &nsk, sizeof (nsk)); free(arg); if (nsk.data2 && allow_pri) { /* increase the scheduling priority of this LWP */ bzero(&pcinfo, sizeof (pcinfo)); (void) strcpy(pcinfo.pc_clname, allow_rt ? "RT" : "TS"); if (priocntl(0, 0, PC_GETCID, (char *)&pcinfo) < 0) { (void) fprintf(stderr, gettext( "nskernd: priocntl(PC_GETCID) failed: %s\n"), strerror(errno)); goto pri_done; } bzero(&pcparms, sizeof (pcparms)); pcparms.pc_cid = pcinfo.pc_cid; if (allow_rt) { ((rtparms_t *)pcparms.pc_clparms)->rt_pri = (pri_t)0; /* minimum RT priority */ ((rtparms_t *)pcparms.pc_clparms)->rt_tqsecs = (uint_t)RT_TQDEF; ((rtparms_t *)pcparms.pc_clparms)->rt_tqnsecs = RT_TQDEF; } else { ((tsparms_t *)pcparms.pc_clparms)->ts_uprilim = ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri; ((tsparms_t *)pcparms.pc_clparms)->ts_upri = ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri; } if (priocntl(P_LWPID, P_MYID, PC_SETPARMS, (char *)&pcparms) < 0) { (void) fprintf(stderr, gettext( "nskernd: priocntl(PC_SETPARMS) failed: %s\n"), strerror(errno)); } } pri_done: if (nthread_inc()) { (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk); nthread_dec(); } return (NULL); } /* * Start a new thread bound to an LWP. * * This is the user level side of nsc_create_process(). 
*/ static void newlwp(struct nskernd *req) { struct nskernd *nskp; thread_t tid; int rc; nskp = malloc(sizeof (*nskp)); if (!nskp) { #ifdef DEBUG (void) fprintf(stderr, gettext("nskernd: malloc(%d) failed\n"), sizeof (*nskp)); #endif req->data1 = (uint64_t)ENOMEM; return; } /* copy args for child */ bcopy(req, nskp, sizeof (*nskp)); rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE), _newlwp, nskp, THR_BOUND|THR_DETACHED, &tid); if (rc != 0) { /* thr_create failed */ #ifdef DEBUG (void) fprintf(stderr, gettext("nskernd: thr_create failed: %s\n"), strerror(errno)); #endif req->data1 = (uint64_t)errno; free(nskp); } else { /* success - _newlwp() will free nskp */ req->data1 = (uint64_t)0; } } static int log_iibmp_err(char *set, int flags) { CFGFILE *cfg; char key[CFG_MAX_KEY]; char buf[CFG_MAX_BUF]; char newflags[CFG_MAX_BUF]; char outbuf[CFG_MAX_BUF]; char *mst, *shd, *bmp, *mode, *ovr, *cnode, *opt, *grp; int setno, found = 0; int setlen; int rc = 0; pid_t pid = -1; if (set && *set) { setlen = strlen(set); } else { return (EINVAL); } (void) mutex_lock(&cfg_mutex); cfg = cfg_open(""); if (!cfg) { (void) mutex_unlock(&cfg_mutex); return (ENXIO); } if (!cfg_lock(cfg, CFG_WRLOCK)) { (void) mutex_unlock(&cfg_mutex); cfg_close(cfg); pid = fork(); if (pid == -1) { (void) fprintf(stderr, gettext( "nskernd: Error forking\n")); return (errno); } else if (pid > 0) { (void) fprintf(stdout, gettext( "nskernd: Attempting deferred bitmap error\n")); return (0); } (void) mutex_lock(&cfg_mutex); cfg = cfg_open(""); if (!cfg) { (void) mutex_unlock(&cfg_mutex); (void) fprintf(stderr, gettext( "nskernd: Failed cfg_open, deferred bitmap\n")); return (ENXIO); } /* Sooner or later, this lock will be free */ while (!cfg_lock(cfg, CFG_WRLOCK)) (void) sleep(2); } /* find the proper set number */ for (setno = 1; !found; setno++) { (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno); if (cfg_get_cstring(cfg, key, buf, CFG_MAX_BUF) < 0) { break; } mst = strtok(buf, " "); shd = 
strtok(NULL, " "); if (strncmp(shd, set, setlen) == 0) { found = 1; bmp = strtok(NULL, " "); mode = strtok(NULL, " "); ovr = strtok(NULL, " "); cnode = strtok(NULL, " "); opt = strtok(NULL, " "); grp = strtok(NULL, " "); break; } } if (found) { /* were there flags in the options field already? */ (void) snprintf(newflags, CFG_MAX_BUF, "%s=0x%x", NSKERN_II_BMP_OPTION, flags); if (opt && strcmp(opt, "-") != 0) { bzero(newflags, CFG_MAX_BUF); opt = strtok(opt, ";"); while (opt) { if (strncmp(opt, NSKERN_II_BMP_OPTION, strlen(NSKERN_II_BMP_OPTION)) != 0) { (void) strcat(newflags, ";"); (void) strcat(newflags, opt); } } } (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno); (void) snprintf(outbuf, CFG_MAX_BUF, "%s %s %s %s %s %s %s %s", mst, shd, bmp, mode, ovr, cnode, newflags, grp); if (cfg_put_cstring(cfg, key, outbuf, CFG_MAX_BUF) < 0) { (void) printf("Failed to put [%s]\n", outbuf); rc = ENXIO; } else { (void) cfg_commit(cfg); rc = 0; } } else { (void) fprintf(stderr, gettext( "nskernd: Failed deferred bitmap [%s]\n"), set); rc = EINVAL; } cfg_unlock(cfg); cfg_close(cfg); (void) mutex_unlock(&cfg_mutex); /* * if we are the fork'ed client, just exit, if parent just return */ if (pid == 0) { exit(rc); /*NOTREACHED*/ } else { return (rc); } } /* * First function run by a NSKERND_LOCK thread. * * Opens dscfg and locks it, * and then calls back into the kernel. * * Incoming: * data1 is the kernel address of the sync structure. * data2 is read(0)/write(1) lock mode. * * Returns: * data1 as incoming. * data2 errno. 
*/ static void * _dolock(void *arg) { struct nskernd nsk; CFGFILE *cfg; int locked; int mode; int rc = 0; /* copy arguments onto stack and free heap memory */ bcopy(arg, &nsk, sizeof (nsk)); free(arg); (void) mutex_lock(&cfg_mutex); cfg = cfg_open(""); if (cfg == NULL) { #ifdef DEBUG (void) fprintf(stderr, gettext("nskernd: cfg_open failed: %s\n"), strerror(errno)); #endif rc = ENXIO; } if (nsk.data2 == 0) { mode = CFG_RDLOCK; } else { mode = CFG_WRLOCK; } locked = 0; if (rc == 0) { if (cfg_lock(cfg, mode)) { locked = 1; } else { #ifdef DEBUG (void) fprintf(stderr, gettext("nskernd: cfg_lock failed: %s\n"), strerror(errno)); #endif rc = EINVAL; } } /* return to kernel */ nsk.data2 = (uint64_t)rc; if (nthread_inc()) { (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk); nthread_dec(); } /* cleanup */ if (locked) { cfg_unlock(cfg); locked = 0; } if (cfg != NULL) { cfg_close(cfg); cfg = NULL; } (void) mutex_unlock(&cfg_mutex); return (NULL); } /* * Inter-node lock thread. * * This is the user level side of nsc_rmlock(). */ static void dolock(struct nskernd *req) { struct nskernd *nskp; thread_t tid; int rc; /* create a new thread to do the lock and return to kernel */ nskp = malloc(sizeof (*nskp)); if (!nskp) { #ifdef DEBUG (void) fprintf(stderr, gettext("nskernd:dolock: malloc(%d) failed\n"), sizeof (*nskp)); #endif req->data1 = (uint64_t)ENOMEM; return; } /* copy args for child */ bcopy(req, nskp, sizeof (*nskp)); rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE), _dolock, nskp, THR_BOUND|THR_DETACHED, &tid); if (rc != 0) { /* thr_create failed */ #ifdef DEBUG (void) fprintf(stderr, gettext("nskernd: thr_create failed: %s\n"), strerror(errno)); #endif req->data1 = (uint64_t)errno; free(nskp); } else { /* success - _dolock() will free nskp */ req->data1 = (uint64_t)0; } } /* * Convenience code for engineering test of multi-terabyte volumes. * * zvol (part of zfs) does not support DKIOCPARTITION but does use EFI * labels. 
This code allocates a simple efi label structure and ioctls * to extract the size of a zvol. It only handles the minimal EFI ioctl * implementation in zvol. */ static void zvol_bsize(char *path, uint64_t *size, const int pnum) { struct stat64 stb1, stb2; struct dk_minfo dkm; int fd = -1; int rc; if (cl_nodeid || pnum != 0) return; if ((fd = open(path, O_RDONLY)) < 0) { return; } if (stat64("/devices/pseudo/zfs@0:zfs", &stb1) != 0 || fstat64(fd, &stb2) != 0 || !S_ISCHR(stb1.st_mode) || !S_ISCHR(stb2.st_mode) || major(stb1.st_rdev) != major(stb2.st_rdev)) { (void) close(fd); return; } rc = ioctl(fd, DKIOCGMEDIAINFO, (void *)&dkm); if (rc >= 0) { *size = LE_64(dkm.dki_capacity) * (dkm.dki_lbsize) / 512; } (void) close(fd); } /* ARGSUSED */ static void get_bsize(uint64_t raw_fd, uint64_t *size, int *partitionp, char *path) { struct nscioc_bsize bsize; #ifdef DKIOCPARTITION struct partition64 p64; #endif struct dk_cinfo dki_info; struct vtoc vtoc; int fd; *partitionp = -1; *size = (uint64_t)0; dki_info.dki_partition = (ushort_t)-1; bsize.dki_info = (uint64_t)(unsigned long)&dki_info; bsize.vtoc = (uint64_t)(unsigned long)&vtoc; bsize.raw_fd = raw_fd; bsize.efi = 0; fd = open(rdev, O_RDONLY); if (fd < 0) return; if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) { if (dki_info.dki_partition != (ushort_t)-1) { /* assume part# is ok and just the size failed */ *partitionp = (int)dki_info.dki_partition; #ifdef DKIOCPARTITION /* see if this is an EFI label */ bzero(&p64, sizeof (p64)); p64.p_partno = (uint_t)*partitionp; if ((ioctl(fd, DKIOCPARTITION, &p64)) > 0) { *size = (uint64_t)p64.p_size; } else { bsize.p64 = (uint64_t)(unsigned long)&p64; bsize.efi = 1; if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) { /* see if this is a zvol */ zvol_bsize(path, size, *partitionp); } else { *size = (uint64_t)p64.p_size; } } #endif /* DKIOCPARTITION */ } (void) close(fd); return; } (void) close(fd); *partitionp = (int)dki_info.dki_partition; if (vtoc.v_sanity != VTOC_SANE) return; if (vtoc.v_version 
!= V_VERSION && vtoc.v_version != 0) return; if (dki_info.dki_partition > V_NUMPAR) return; *size = (uint64_t)vtoc.v_part[(int)dki_info.dki_partition].p_size; } static int iscluster(void) { /* * Find out if we are running in a cluster */ cl_nodeid = cfg_iscluster(); if (cl_nodeid > 0) { return (TRUE); } else if (cl_nodeid == 0) { return (FALSE); } (void) fprintf(stderr, "%s\n", gettext("nskernd: unable to ascertain environment")); exit(1); /* NOTREACHED */ } /* * Runtime Solaris release checking - build release == runtime release * is always considered success, so only keep entries in the map for * the special cases. */ static nsc_release_t nskernd_rel_map[] = { /* { "5.10", "5.10" }, */ { "5.11", "5.10" }, { NULL, NULL } }; #ifdef lint #define main nskernd_main #endif /* ARGSUSED1 */ int main(int argc, char *argv[]) { const char *dir = "/"; struct nskernd data; struct rlimit rl; int i, run, rc; int partition; char *reqd; int syncpipe[2]; int startup; (void) setlocale(LC_ALL, ""); (void) textdomain("nskernd"); rc = nsc_check_release(BUILD_REV_STR, nskernd_rel_map, &reqd); if (rc < 0) { (void) fprintf(stderr, gettext("nskernd: unable to determine the current " "Solaris release: %s\n"), strerror(errno)); exit(1); } else if (rc == FALSE) { (void) fprintf(stderr, gettext("nskernd: incorrect Solaris release " "(requires %s)\n"), reqd); exit(1); } rc = 0; if (argc != 1) usage(); /* * Usage: [-g] [-d ] */ while ((i = getopt(argc, argv, "gd:")) != EOF) { switch (i) { case 'g': display_msg = 1; break; case 'd': delay_time = atoi(optarg); if (delay_time <= 0) { delay_time = 30; } break; default: syslog(LOG_ERR, "Usage: nskernd [-g] [-d ]"); exit(1); break; } } if (chroot(dir) < 0) { (void) fprintf(stderr, gettext("nskernd: chroot failed: %s\n"), strerror(errno)); exit(1); } if (chdir(dir) < 0) { (void) fprintf(stderr, gettext("nskernd: chdir failed: %s\n"), strerror(errno)); exit(1); } /* * Determine if we are in a Sun Cluster or not, before fork'ing */ (void) iscluster(); 
/* * create a pipe to synchronise the parent with the * child just before it enters its service loop. */ if (pipe(syncpipe) < 0) { (void) fprintf(stderr, gettext("nskernd: cannot create pipe: %s\n"), strerror(errno)); exit(1); } /* * Fork off a child that becomes the daemon. */ if ((rc = fork()) > 0) { char c; int n; (void) close(syncpipe[1]); /* * wait for the close of the pipe. * If we get a char back, indicates good * status from child, so exit 0. * If we get a zero length read, then the * child has failed, so we do too. */ n = read(syncpipe[0], &c, 1); exit((n <= 0) ? 1 : 0); } else if (rc < 0) { (void) fprintf(stderr, gettext("nskernd: cannot fork: %s\n"), strerror(errno)); exit(1); } /* * In child - become daemon. */ /* use closefrom(3C) from PSARC/2000/193 when possible */ for (i = 0; i < syncpipe[1]; i++) { (void) close(i); } closefrom(syncpipe[1] + 1); (void) open("/dev/console", O_WRONLY|O_APPEND); (void) dup(0); (void) dup(0); (void) close(0); (void) setpgrp(); /* * Ignore all signals apart from SIGTERM. */ for (i = 1; i < _sys_nsig; i++) (void) sigset(i, SIG_IGN); (void) sigset(SIGTERM, sighand); /* * Increase the number of fd's that can be open. */ rl.rlim_cur = RLIM_INFINITY; rl.rlim_max = RLIM_INFINITY; if (setrlimit(RLIMIT_NOFILE, &rl) < 0) { (void) fprintf(stderr, gettext("nskernd: could not increase RLIMIT_NOFILE: %s\n"), strerror(errno)); (void) fprintf(stderr, gettext("nskernd: the maximum number of nsctl open " "devices may be reduced\n")); } /* * Open /dev/nsctl and startup. 
*/ nsctl_fd = open(rdev, O_RDONLY); if (nsctl_fd < 0) { (void) fprintf(stderr, gettext("nskernd: unable to open %s\n"), rdev); exit(1); } bzero(&data, sizeof (data)); data.command = NSKERND_START; data.data1 = (uint64_t)cl_nodeid; run = 1; startup = 1; while (run) { rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data); if (rc < 0) { /* try and do kernel cleanup and exit */ if (shutdown()) { run = 0; } else { sigterm = 0; } (void) fprintf(stderr, gettext("nskernd: NSCIOC_NSKERND failed: %s\n"), strerror(errno)); continue; } else if (sigterm) { /* SIGTERM received - terminate */ if (data.command != NSKERND_START && (data.command != NSKERND_STOP || data.data1 != (uint64_t)1)) { /* need to do kernel cleanup */ if (shutdown()) { run = 0; } else { sigterm = 0; data.command = NSKERND_START; data.data1 = (uint64_t)cl_nodeid; } } else { /* just quit */ if (canshutdown()) { run = 0; } else { /* cannot shutdown - threads active */ sigterm = 0; data.command = NSKERND_START; data.data1 = (uint64_t)cl_nodeid; } } continue; } if (startup) { char c = 0; (void) write(syncpipe[1], &c, 1); (void) close(syncpipe[1]); startup = 0; } switch (data.command) { case NSKERND_START: /* (re)start completion */ if (rc == 1) { (void) fprintf(stderr, gettext("nskernd: already started\n")); run = 0; } else if (rc == 2) { (void) fprintf(stderr, gettext("nskernd: stopped by kernel\n")); run = 0; } data.command = NSKERND_WAIT; break; case NSKERND_STOP: /* kernel telling daemon to stop */ if (data.data1 != (uint64_t)1) { (void) shutdown(); run = 0; } break; case NSKERND_BSIZE: /* * kernel requesting partsize * data1 - size return * data2 - raw_fd (entry) * - partition number (return) */ partition = -1; get_bsize(data.data2, &data.data1, &partition, data.char1); data.data2 = (uint64_t)partition; data.command = NSKERND_WAIT; break; case NSKERND_NEWLWP: /* kernel requesting a new LWP */ newlwp(&data); data.command = NSKERND_WAIT; break; case NSKERND_LOCK: /* kernel requesting lock */ dolock(&data); data.command = 
NSKERND_WAIT; break; case NSKERND_WAIT: /* kernel retrying wait */ /* * the kernel thread can be woken by the dr config * utilities (ie cfgadm) therefore we just reissue * the wait. */ break; case NSKERND_IIBITMAP: rc = log_iibmp_err(data.char1, (int)data.data1); data.data1 = (uint64_t)rc; data.command = NSKERND_WAIT; break; default: (void) fprintf(stderr, gettext("nskernd: unknown command %d"), data.command); data.command = NSKERND_WAIT; break; } } (void) close(nsctl_fd); return (rc); }