/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* * Disk Lights Agent (FMA) * * This Fault Management Daemon (fmd) module periodically scans the topology * tree, enumerates all disks with associated fault indicators, and then * synchronises the fault status of resources in the FMA Resource Cache with * the indicators. In short: it turns the fault light on for befallen disks. * * Presently, we recognise associated fault indicators for disks by looking * for the following structure in the topology tree: * * /bay=N * | * +---- /disk=0 <---------------- our Disk * | * +---- /bay=N?indicator=fail <---- the Fault Light * \---- /bay=N?indicator=ident * * That is: a DISK node will have a parent BAY; that BAY will itself have * child Facility nodes, one of which will be called "fail". If any of the * above does not hold, we simply do nothing for this disk. */ #include #include #include #include #include #include #include #include typedef struct disk_lights { fmd_hdl_t *dl_fmd; uint64_t dl_poll_interval; uint64_t dl_coalesce_interval; id_t dl_timer; boolean_t dl_triggered; } disk_lights_t; static void disklights_topo(fmd_hdl_t *, topo_hdl_t *); static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *); static void disklights_timeout(fmd_hdl_t *, id_t, void *); static const fmd_hdl_ops_t fmd_ops = { disklights_recv, /* fmdo_recv */ disklights_timeout, /* fmdo_timeout */ NULL, /* fmdo_close */ NULL, /* fmdo_stats */ NULL, /* fmdo_gc */ NULL, /* fmdo_send */ disklights_topo, /* fmdo_topo */ }; /* * POLL_INTERVAL is the period after which we perform an unsolicited poll * to ensure we remain in sync with reality. */ #define DL_PROP_POLL_INTERVAL "poll-interval" /* * COALESCE_INTERVAL is how long we wait after we are trigged by either a * topology change or a relevant list.* event, in order to allow a series * of events to coalesce. */ #define DL_PROP_COALESCE_INTERVAL "coalesce-interval" static const fmd_prop_t fmd_props[] = { { DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" }, { DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" }, { NULL, 0, NULL } }; static const fmd_hdl_info_t fmd_info = { "Disk Lights Agent", "1.0", &fmd_ops, fmd_props }; /* * Fetch the Facility Node properties (name, type) from the FMRI * for this node, or return -1 if we can't. */ static int get_facility_props(topo_hdl_t *hdl, tnode_t *node, char **facname, char **factype) { int e, ret = -1; nvlist_t *fmri = NULL, *fnvl; char *nn = NULL, *tt = NULL; if (topo_node_resource(node, &fmri, &e) != 0) goto out; if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0) goto out; if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, &nn) != 0) goto out; if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, &tt) != 0) goto out; *facname = topo_hdl_strdup(hdl, nn); *factype = topo_hdl_strdup(hdl, tt); ret = 0; out: nvlist_free(fmri); return (ret); } typedef struct dl_fault_walk_inner { char *fwi_name; uint32_t fwi_mode; } dl_fault_walk_inner_t; static int dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg) { dl_fault_walk_inner_t *fwi = arg; char *facname = NULL, *factype = NULL; int err; /* * We're only interested in BAY children that are valid Facility Nodes. */ if (topo_node_flags(node) != TOPO_NODE_FACILITY || get_facility_props(thp, node, &facname, &factype) != 0) { goto out; } if (strcmp(fwi->fwi_name, facname) != 0) goto out; /* * Attempt to set the LED mode appropriately. If this fails, give up * and move on. */ (void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY, TOPO_LED_MODE, TOPO_PROP_MUTABLE, fwi->fwi_mode, &err); out: topo_hdl_strfree(thp, facname); topo_hdl_strfree(thp, factype); return (TOPO_WALK_NEXT); } static int dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg) { disk_lights_t *dl = arg; dl_fault_walk_inner_t fwi; tnode_t *pnode; int err, has_fault; nvlist_t *fmri = NULL; bzero(&fwi, sizeof (fwi)); /* * We are only looking for DISK nodes in the topology that have a parent * BAY. */ if (strcmp(DISK, topo_node_name(node)) != 0 || (pnode = topo_node_parent(node)) == NULL || strcmp(BAY, topo_node_name(pnode)) != 0) { return (TOPO_WALK_NEXT); } /* * Check to see if the Resource this FMRI describes is Faulty: */ if (topo_node_resource(node, &fmri, &err) != 0) return (TOPO_WALK_NEXT); has_fault = fmd_nvl_fmri_has_fault(dl->dl_fmd, fmri, FMD_HAS_FAULT_RESOURCE, NULL); nvlist_free(fmri); /* * Walk the children of this BAY and flush out our fault status if * we find an appropriate indicator node. */ fwi.fwi_name = "fail"; fwi.fwi_mode = has_fault ? TOPO_LED_STATE_ON : TOPO_LED_STATE_OFF; (void) topo_node_child_walk(thp, pnode, dl_fault_walk_inner, &fwi, &err); return (TOPO_WALK_NEXT); } /* * Walk all of the topology nodes looking for DISKs that match the structure * described in the overview. Once we find them, check their fault status * and update their fault indiciator accordingly. */ static void dl_examine_topo(disk_lights_t *dl) { int err; topo_hdl_t *thp = NULL; topo_walk_t *twp = NULL; thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION); if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer, dl, &err)) == NULL) { fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n", topo_strerror(err)); goto out; } if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) { fmd_hdl_error(dl->dl_fmd, "failed to walk topology: %s\n", topo_strerror(err)); goto out; } out: if (twp != NULL) topo_walk_fini(twp); if (thp != NULL) fmd_hdl_topo_rele(dl->dl_fmd, thp); } static void dl_trigger_enum(disk_lights_t *dl) { /* * If we're already on the short-poll coalesce timer, then return * immediately. */ if (dl->dl_triggered == B_TRUE) return; dl->dl_triggered = B_TRUE; /* * Replace existing poll timer with coalesce timer: */ if (dl->dl_timer != 0) fmd_timer_remove(dl->dl_fmd, dl->dl_timer); dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL, dl->dl_coalesce_interval); } /*ARGSUSED*/ static void disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data) { disk_lights_t *dl = fmd_hdl_getspecific(hdl); dl->dl_triggered = B_FALSE; dl_examine_topo(dl); /* * Install the long-interval timer for the next poll. */ dl->dl_timer = fmd_timer_install(hdl, NULL, NULL, dl->dl_poll_interval); } /*ARGSUSED*/ static void disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp) { disk_lights_t *dl = fmd_hdl_getspecific(hdl); dl_trigger_enum(dl); } /*ARGSUSED*/ static void disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) { disk_lights_t *dl = fmd_hdl_getspecific(hdl); dl_trigger_enum(dl); } void _fmd_init(fmd_hdl_t *hdl) { disk_lights_t *dl; if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) return; dl = fmd_hdl_zalloc(hdl, sizeof (*dl), FMD_SLEEP); fmd_hdl_setspecific(hdl, dl); /* * Load Configuration: */ dl->dl_fmd = hdl; dl->dl_poll_interval = fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL); dl->dl_coalesce_interval = fmd_prop_get_int64(hdl, DL_PROP_COALESCE_INTERVAL); /* * Schedule the initial enumeration: */ dl_trigger_enum(dl); } void _fmd_fini(fmd_hdl_t *hdl) { disk_lights_t *dl = fmd_hdl_getspecific(hdl); fmd_hdl_free(hdl, dl, sizeof (*dl)); }