// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
 * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
 */

#include <libnvpair.h>
#include <libzfs.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/list.h>
#include <sys/time.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <pthread.h>
#include <unistd.h>

#include "zfs_agents.h"
#include "fmd_api.h"
#include "../zed_log.h"

/*
 * Agent dispatch code: events posted here are queued on a list and
 * consumed by a dedicated thread, which fans each one out to the
 * zfs-diagnosis, zfs-retire, and SLM agents.
 */

static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
static list_t agent_events; /* list of pending events */
static int agent_exiting;

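/*
 * agent_lock protects agent_events and agent_exiting; agent_cond is
 * signalled whenever a new event is queued or shutdown is requested.
 */
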
typedef struct agent_event {
        char ae_class[64];
        char ae_subclass[32];
        nvlist_t *ae_nvl; /* private copy of the event payload */
        list_node_t ae_node;
} agent_event_t;

pthread_t g_agents_tid; /* event consumer thread */

libzfs_handle_t *g_zfs_hdl;

/* guid search data */
typedef enum device_type {
        DEVICE_TYPE_L2ARC,  /* l2arc device */
        DEVICE_TYPE_SPARE,  /* spare device */
        DEVICE_TYPE_PRIMARY /* any primary pool storage device */
} device_type_t;

typedef struct guid_search {
        uint64_t gs_pool_guid;
        uint64_t gs_vdev_guid;
        const char *gs_devid;
        device_type_t gs_vdev_type;
        uint64_t gs_vdev_expandtime; /* vdev expansion time */
} guid_search_t;
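
/*
 * A guid_search_t is seeded with either gs_devid or gs_vdev_guid before
 * the pool iteration below; on a match, the remaining fields are filled
 * in from the matching vdev's config nvlist.
 */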

/*
 * Walks the vdev tree recursively looking for a matching devid.
 * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
 */
static boolean_t
zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
{
        guid_search_t *gsp = arg;
        const char *path = NULL;
        uint_t c, children;
        nvlist_t **child;
        uint64_t vdev_guid;

        /*
         * First iterate over any children.
         */
        if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
            &child, &children) == 0) {
                for (c = 0; c < children; c++) {
                        if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
                                gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
                                return (B_TRUE);
                        }
                }
        }
        /*
         * Iterate over any spares and cache devices.
         */
        if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
            &child, &children) == 0) {
                for (c = 0; c < children; c++) {
                        if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
                                gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
                                return (B_TRUE);
                        }
                }
        }
        if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
            &child, &children) == 0) {
                for (c = 0; c < children; c++) {
                        if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
                                gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
                                return (B_TRUE);
                        }
                }
        }
        /*
         * On a devid match, grab the vdev guid and expansion time, if any.
         */
        if (gsp->gs_devid != NULL &&
            (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
            (strcmp(gsp->gs_devid, path) == 0)) {
                (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
                    &gsp->gs_vdev_guid);
                (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
                    &gsp->gs_vdev_expandtime);
                return (B_TRUE);
        }
        /*
         * Otherwise, on a vdev guid match, grab the devid and expansion
         * time. The devid might be missing on removal since it's not part
         * of the blkid cache, and an L2ARC vdev does not contain the pool
         * guid in its blkid, so this is a special case for L2ARC vdevs.
         */
        else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL &&
            nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&
            gsp->gs_vdev_guid == vdev_guid) {
                (void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
                    &gsp->gs_devid);
                (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
                    &gsp->gs_vdev_expandtime);
                return (B_TRUE);
        }

        return (B_FALSE);
}
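
/*
 * Typical usage, a sketch mirroring the lookup done in
 * zfs_agent_post_event() below (the "devid" variable is illustrative):
 *
 *	guid_search_t search = { 0 };
 *	search.gs_devid = devid;	(or seed gs_vdev_guid instead)
 *	(void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
 *
 * On a match, the remaining gs_* fields identify the pool and vdev.
 */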

static int
zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
{
        guid_search_t *gsp = arg;
        nvlist_t *config, *nvl;

        /*
         * For each vdev in this pool, look for a match by devid.
         */
        if ((config = zpool_get_config(zhp, NULL)) != NULL) {
                if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
                    &nvl) == 0) {
                        (void) zfs_agent_iter_vdev(zhp, nvl, gsp);
                }
        }
        /*
         * If a match was found, grab the pool guid.
         */
        if (gsp->gs_vdev_guid && gsp->gs_devid) {
                (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
                    &gsp->gs_pool_guid);
        }

        zpool_close(zhp);
        /* returning nonzero here stops the zpool_iter() walk early */
        return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0);
}

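/*
 * Duplicate the event payload, remap device-removal sysevents to a
 * "resource.fs.zfs.removed" FM resource event where appropriate, and
 * queue the result for the consumer thread. The caller retains
 * ownership of nvl.
 */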
void
zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
{
        agent_event_t *event;

        if (subclass == NULL)
                subclass = "";

        event = malloc(sizeof (agent_event_t));
        if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
                if (event)
                        free(event);
                return;
        }

        if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
                class = EC_ZFS;
                subclass = ESC_ZFS_VDEV_CHECK;
        }

        /*
         * On Linux, we don't get the expected FM_RESOURCE_REMOVED ereport
         * from the vdev_disk layer after a hot unplug. Fortunately we do
         * get an EC_DEV_REMOVE from our disk monitor and it is a suitable
         * proxy, so we remap it here for the benefit of the diagnosis engine.
         * Starting in OpenZFS 2.0, we do get FM_RESOURCE_REMOVED from the spa
         * layer. Processing multiple FM_RESOURCE_REMOVED events is not harmful.
         */
        if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
            (strcmp(subclass, ESC_DISK) == 0) &&
            (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
            nvlist_exists(nvl, DEV_IDENTIFIER))) {
                nvlist_t *payload = event->ae_nvl;
                struct timeval tv;
                int64_t tod[2];
                uint64_t pool_guid = 0, vdev_guid = 0;
                guid_search_t search = { 0 };
                device_type_t devtype = DEVICE_TYPE_PRIMARY;
                const char *devid = NULL;

                class = "resource.fs.zfs.removed";
                subclass = "";

                (void) nvlist_add_string(payload, FM_CLASS, class);
                (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid);
                (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
                (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);

                (void) gettimeofday(&tv, NULL);
                tod[0] = tv.tv_sec;
                tod[1] = tv.tv_usec;
                (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);

                /*
                 * If the devid is missing but the vdev guid is available,
                 * resolve the devid and pool guid from the vdev guid. For
                 * multipath, spare, and l2arc devices, ZFS_EV_VDEV_GUID or
                 * ZFS_EV_POOL_GUID may be missing, so look those up as well.
                 */
                if (devid == NULL || pool_guid == 0 || vdev_guid == 0) {
                        if (devid == NULL)
                                search.gs_vdev_guid = vdev_guid;
                        else
                                search.gs_devid = devid;
                        zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
                        if (devid == NULL)
                                devid = search.gs_devid;
                        if (pool_guid == 0)
                                pool_guid = search.gs_pool_guid;
                        if (vdev_guid == 0)
                                vdev_guid = search.gs_vdev_guid;
                        devtype = search.gs_vdev_type;
                }

                /*
                 * We want to avoid reporting "remove" events coming from
                 * libudev for vdevs which were expanded recently (within
                 * 10 seconds), and to avoid activating spares in response
                 * to partitions being deleted and created in rapid
                 * succession.
                 */
                if (search.gs_vdev_expandtime != 0 &&
                    search.gs_vdev_expandtime + 10 > tv.tv_sec) {
                        zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
                            "for recently expanded device '%s'", EC_DEV_REMOVE,
                            devid);
                        fnvlist_free(payload);
                        free(event);
                        goto out;
                }

                (void) nvlist_add_uint64(payload,
                    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
                (void) nvlist_add_uint64(payload,
                    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
                switch (devtype) {
                case DEVICE_TYPE_L2ARC:
                        (void) nvlist_add_string(payload,
                            FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
                            VDEV_TYPE_L2CACHE);
                        break;
                case DEVICE_TYPE_SPARE:
                        (void) nvlist_add_string(payload,
                            FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
                        break;
                case DEVICE_TYPE_PRIMARY:
                        (void) nvlist_add_string(payload,
                            FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
                        break;
                }

                zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
                    EC_DEV_REMOVE, class);
        }

        (void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
        (void) strlcpy(event->ae_subclass, subclass,
            sizeof (event->ae_subclass));

        (void) pthread_mutex_lock(&agent_lock);
        list_insert_tail(&agent_events, event);
        (void) pthread_mutex_unlock(&agent_lock);

out:
        (void) pthread_cond_signal(&agent_cond);
}
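
/*
 * Note: the condition variable above is signalled without agent_lock
 * held; this is safe because the consumer thread re-checks the event
 * list under the lock before waiting again.
 */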

static void
zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
{
        /*
         * The diagnosis engine subscribes to the following events.
         * On illumos these subscriptions reside in:
         *	/usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
         */
        if (strstr(class, "ereport.fs.zfs.") != NULL ||
            strstr(class, "resource.fs.zfs.") != NULL ||
            strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
            strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
            strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
                fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
        }

        /*
         * The retire agent subscribes to the following events.
         * On illumos these subscriptions reside in:
         *	/usr/lib/fm/fmd/plugins/zfs-retire.conf
         *
         * NOTE: fault events come directly from our diagnosis engine
         * and will not pass through the zfs kernel module.
         */
        if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
            strcmp(class, "resource.fs.zfs.removed") == 0 ||
            strcmp(class, "resource.fs.zfs.statechange") == 0 ||
            strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
                fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
        }

        /*
         * The SLM module only consumes disk events and vdev check events.
         *
         * NOTE: disk events come directly from the disk monitor and will
         * not pass through the zfs kernel module.
         */
        if (strstr(class, "EC_dev_") != NULL ||
            strcmp(class, EC_ZFS) == 0) {
                (void) zfs_slm_event(class, subclass, nvl);
        }
}
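
/*
 * Unlike illumos fmd, there are no plugin .conf files here; the
 * subscriptions above are expressed directly as the class-string
 * matches in zfs_agent_dispatch().
 */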

/*
 * Events are consumed and dispatched from this thread. An agent can
 * also post an event, so the event list lock is not held when calling
 * an agent. One event is consumed at a time.
 */
static void *
zfs_agent_consumer_thread(void *arg)
{
        (void) arg;

        for (;;) {
                agent_event_t *event;

                (void) pthread_mutex_lock(&agent_lock);

                /* wait for an event to show up */
                while (!agent_exiting && list_is_empty(&agent_events))
                        (void) pthread_cond_wait(&agent_cond, &agent_lock);

                if (agent_exiting) {
                        (void) pthread_mutex_unlock(&agent_lock);
                        zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
                            "exiting");
                        return (NULL);
                }

                if ((event = list_remove_head(&agent_events)) != NULL) {
                        (void) pthread_mutex_unlock(&agent_lock);

                        /* dispatch to all event subscribers */
                        zfs_agent_dispatch(event->ae_class, event->ae_subclass,
                            event->ae_nvl);

                        nvlist_free(event->ae_nvl);
                        free(event);
                        continue;
                }

                (void) pthread_mutex_unlock(&agent_lock);
        }

        return (NULL);
}

void
zfs_agent_init(libzfs_handle_t *zfs_hdl)
{
        fmd_hdl_t *hdl;

        g_zfs_hdl = zfs_hdl;

        if (zfs_slm_init() != 0)
                zed_log_die("Failed to initialize zfs slm");
        zed_log_msg(LOG_INFO, "Add Agent: init");

        hdl = fmd_module_hdl("zfs-diagnosis");
        _zfs_diagnosis_init(hdl);
        if (!fmd_module_initialized(hdl))
                zed_log_die("Failed to initialize zfs diagnosis");

        hdl = fmd_module_hdl("zfs-retire");
        _zfs_retire_init(hdl);
        if (!fmd_module_initialized(hdl))
                zed_log_die("Failed to initialize zfs retire");

        list_create(&agent_events, sizeof (agent_event_t),
            offsetof(struct agent_event, ae_node));

        if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
            NULL) != 0) {
                list_destroy(&agent_events);
                zed_log_die("Failed to initialize agents");
        }
        pthread_setname_np(g_agents_tid, "agents");
}

void
zfs_agent_fini(void)
{
        fmd_hdl_t *hdl;
        agent_event_t *event;

        agent_exiting = 1;
        (void) pthread_cond_signal(&agent_cond);

        /* wait for the agent consumer thread to complete */
        (void) pthread_join(g_agents_tid, NULL);

        /* drain any pending events */
        while ((event = list_remove_head(&agent_events)) != NULL) {
                nvlist_free(event->ae_nvl);
                free(event);
        }

        list_destroy(&agent_events);

        if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
                _zfs_retire_fini(hdl);
                fmd_hdl_unregister(hdl);
        }
        if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
                _zfs_diagnosis_fini(hdl);
                fmd_hdl_unregister(hdl);
        }

        zed_log_msg(LOG_INFO, "Add Agent: fini");
        zfs_slm_fini();

        g_zfs_hdl = NULL;
}