1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
7 * You can obtain a copy of the license from the top-level file
8 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
9 * You may not use this file except in compliance with the license.
10 *
11 * CDDL HEADER END
12 */
13
14 /*
15 * Copyright (c) 2016, 2017, Intel Corporation.
16 */
17
18 #ifdef HAVE_LIBUDEV
19
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <libnvpair.h>
23 #include <libudev.h>
24 #include <libzfs.h>
25 #include <libzutil.h>
26 #include <pthread.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include <sys/sysevent/eventdefs.h>
31 #include <sys/sysevent/dev.h>
32
33 #include "zed_log.h"
34 #include "zed_disk_event.h"
35 #include "agents/zfs_agents.h"
36
37 /*
38 * Portions of ZED need to see disk events for disks belonging to ZFS pools.
39 * A libudev monitor is established to monitor block device actions and pass
40 * them on to internal ZED logic modules. Initially, zfs_mod.c is the only
41 * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
42 * module responsible for handling disk events for ZFS.
43 */
44
/*
 * Monitor state shared by zed_disk_event_init() and zed_disk_event_fini().
 */
/* id of the thread running zed_udev_monitor(); set by pthread_create() */
pthread_t g_mon_tid;
/* libudev context returned by udev_new() */
struct udev *g_udev;
/* netlink monitor created on g_udev; handed to the monitor thread */
struct udev_monitor *g_mon;


/* NOTE(review): not referenced in the visible part of this file */
#define DEV_BYID_PATH "/dev/disk/by-id/"

/*
 * 64MB is minimum usable disk for ZFS
 *
 * Compared against the sector count udev reports for a device; events
 * for devices with fewer sectors are dropped by the monitor thread.
 */
#define MINIMUM_SECTORS 131072ULL
54
55
56 /*
57 * Post disk event to SLM module
58 *
59 * occurs in the context of monitor thread
60 */
61 static void
zed_udev_event(const char * class,const char * subclass,nvlist_t * nvl)62 zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
63 {
64 const char *strval;
65 uint64_t numval;
66
67 zed_log_msg(LOG_INFO, "zed_disk_event:");
68 zed_log_msg(LOG_INFO, "\tclass: %s", class);
69 zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
70 if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
71 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
72 if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
73 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
74 if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
75 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
76 if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
77 zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
78 if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
79 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
80 if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
81 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
82 if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0)
83 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval);
84 if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
85 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
86 if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
87 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
88
89 (void) zfs_agent_post_event(class, subclass, nvl);
90 }
91
92 /*
93 * dev_event_nvlist: place event schema into an nv pair list
94 *
95 * NAME VALUE (example)
96 * -------------- --------------------------------------------------------
97 * DEV_NAME /dev/sdl
98 * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
99 * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
100 * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0
101 * DEV_IS_PART ---
102 * DEV_SIZE 500107862016
103 * ZFS_EV_POOL_GUID 17523635698032189180
104 * ZFS_EV_VDEV_GUID 14663607734290803088
105 */
106 static nvlist_t *
dev_event_nvlist(struct udev_device * dev)107 dev_event_nvlist(struct udev_device *dev)
108 {
109 nvlist_t *nvl;
110 char strval[128];
111 const char *value, *path;
112 uint64_t guid;
113
114 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
115 return (NULL);
116
117 if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
118 (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
119 if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
120 (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
121 if ((path = udev_device_get_devnode(dev)) != NULL)
122 (void) nvlist_add_string(nvl, DEV_NAME, path);
123 if ((value = udev_device_get_devpath(dev)) != NULL)
124 (void) nvlist_add_string(nvl, DEV_PATH, value);
125 value = udev_device_get_devtype(dev);
126 if ((value != NULL && strcmp("partition", value) == 0) ||
127 (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
128 != NULL)) {
129 (void) nvlist_add_boolean(nvl, DEV_IS_PART);
130 }
131 if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
132 uint64_t numval = DEV_BSIZE;
133
134 numval *= strtoull(value, NULL, 10);
135 (void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
136
137 /*
138 * If the device has a parent, then get the parent block
139 * device's size as well. For example, /dev/sda1's parent
140 * is /dev/sda.
141 */
142 struct udev_device *parent_dev = udev_device_get_parent(dev);
143 if (parent_dev != NULL &&
144 (value = udev_device_get_sysattr_value(parent_dev, "size"))
145 != NULL) {
146 uint64_t numval = DEV_BSIZE;
147
148 numval *= strtoull(value, NULL, 10);
149 (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval);
150 }
151 }
152
153 /*
154 * Grab the pool and vdev guids from blkid cache
155 */
156 value = udev_device_get_property_value(dev, "ID_FS_UUID");
157 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
158 (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
159
160 value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
161 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
162 (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
163
164 /*
165 * Either a vdev guid or a devid must be present for matching
166 */
167 if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
168 !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
169 nvlist_free(nvl);
170 return (NULL);
171 }
172
173 return (nvl);
174 }
175
176 /*
177 * Listen for block device uevents
178 */
179 static void *
zed_udev_monitor(void * arg)180 zed_udev_monitor(void *arg)
181 {
182 struct udev_monitor *mon = arg;
183 const char *tmp;
184 char *tmp2;
185
186 zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
187
188 while (1) {
189 struct udev_device *dev;
190 const char *action, *type, *part, *sectors;
191 const char *bus, *uuid, *devpath;
192 const char *class, *subclass;
193 nvlist_t *nvl;
194 boolean_t is_zfs = B_FALSE;
195
196 /* allow a cancellation while blocked (recvmsg) */
197 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
198
199 /* blocks at recvmsg until an event occurs */
200 if ((dev = udev_monitor_receive_device(mon)) == NULL) {
201 zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
202 "device error %d", errno);
203 continue;
204 }
205
206 /* allow all steps to complete before a cancellation */
207 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
208
209 /*
210 * Strongly typed device is the preferred filter
211 */
212 type = udev_device_get_property_value(dev, "ID_FS_TYPE");
213 if (type != NULL && type[0] != '\0') {
214 if (strcmp(type, "zfs_member") == 0) {
215 is_zfs = B_TRUE;
216 } else {
217 /* not ours, so skip */
218 zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
219 "%s (in use by %s)",
220 udev_device_get_devnode(dev), type);
221 udev_device_unref(dev);
222 continue;
223 }
224 }
225
226 /*
227 * if this is a disk and it is partitioned, then the
228 * zfs label will reside in a DEVTYPE=partition and
229 * we can skip passing this event
230 *
231 * Special case: Blank disks are sometimes reported with
232 * an erroneous 'atari' partition, and should not be
233 * excluded from being used as an autoreplace disk:
234 *
235 * https://github.com/openzfs/zfs/issues/13497
236 */
237 type = udev_device_get_property_value(dev, "DEVTYPE");
238 part = udev_device_get_property_value(dev,
239 "ID_PART_TABLE_TYPE");
240 if (type != NULL && type[0] != '\0' &&
241 strcmp(type, "disk") == 0 &&
242 part != NULL && part[0] != '\0') {
243 const char *devname =
244 udev_device_get_property_value(dev, "DEVNAME");
245
246 if (strcmp(part, "atari") == 0) {
247 zed_log_msg(LOG_INFO,
248 "%s: %s is reporting an atari partition, "
249 "but we're going to assume it's a false "
250 "positive and still use it (issue #13497)",
251 __func__, devname);
252 } else {
253 zed_log_msg(LOG_INFO,
254 "%s: skip %s since it has a %s partition "
255 "already", __func__, devname, part);
256 /* skip and wait for partition event */
257 udev_device_unref(dev);
258 continue;
259 }
260 }
261
262 /*
263 * ignore small partitions
264 */
265 sectors = udev_device_get_property_value(dev,
266 "ID_PART_ENTRY_SIZE");
267 if (sectors == NULL)
268 sectors = udev_device_get_sysattr_value(dev, "size");
269 if (sectors != NULL &&
270 strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
271 zed_log_msg(LOG_INFO,
272 "%s: %s sectors %s < %llu (minimum)",
273 __func__,
274 udev_device_get_property_value(dev, "DEVNAME"),
275 sectors, MINIMUM_SECTORS);
276 udev_device_unref(dev);
277 continue;
278 }
279
280 /*
281 * If the blkid probe didn't find ZFS, then a persistent
282 * device id string is required in the message schema
283 * for matching with vdevs. Preflight here for expected
284 * udev information.
285 *
286 * Special case:
287 * NVMe devices don't have ID_BUS set (at least on RHEL 7-8),
288 * but they are valid for autoreplace. Add a special case for
289 * them by searching for "/nvme/" in the udev DEVPATH:
290 *
291 * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1
292 */
293 bus = udev_device_get_property_value(dev, "ID_BUS");
294 uuid = udev_device_get_property_value(dev, "DM_UUID");
295 devpath = udev_device_get_devpath(dev);
296 if (!is_zfs && (bus == NULL && uuid == NULL &&
297 strstr(devpath, "/nvme/") == NULL)) {
298 zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
299 "source", udev_device_get_devnode(dev));
300 udev_device_unref(dev);
301 continue;
302 }
303
304 action = udev_device_get_action(dev);
305 if (strcmp(action, "add") == 0) {
306 class = EC_DEV_ADD;
307 subclass = ESC_DISK;
308 } else if (strcmp(action, "remove") == 0) {
309 class = EC_DEV_REMOVE;
310 subclass = ESC_DISK;
311 } else if (strcmp(action, "change") == 0) {
312 class = EC_DEV_STATUS;
313 subclass = ESC_DEV_DLE;
314 } else {
315 zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
316 action);
317 udev_device_unref(dev);
318 continue;
319 }
320
321 /*
322 * Special case an EC_DEV_ADD for multipath devices
323 *
324 * When a multipath device is created, udev reports the
325 * following:
326 *
327 * 1. "add" event of the dm device for the multipath device
328 * (like /dev/dm-3).
329 * 2. "change" event to create the actual multipath device
330 * symlink (like /dev/mapper/mpatha). The event also
331 * passes back the relevant DM vars we care about, like
332 * DM_UUID.
333 * 3. Another "change" event identical to #2 (that we ignore).
334 *
335 * To get the behavior we want, we treat the "change" event
336 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
337 * a new disk being added.
338 */
339 if (strcmp(class, EC_DEV_STATUS) == 0 &&
340 udev_device_get_property_value(dev, "DM_UUID") &&
341 udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
342 tmp = udev_device_get_devnode(dev);
343 tmp2 = zfs_get_underlying_path(tmp);
344 if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
345 /*
346 * We have a real underlying device, which
347 * means that this multipath "change" event is
348 * an "add" event.
349 *
350 * If the multipath device and the underlying
351 * dev are the same name (i.e. /dev/dm-5), then
352 * there is no real underlying disk for this
353 * multipath device, and so this "change" event
354 * really is a multipath removal.
355 */
356 class = EC_DEV_ADD;
357 subclass = ESC_DISK;
358 } else {
359 tmp = udev_device_get_property_value(dev,
360 "DM_NR_VALID_PATHS");
361 /* treat as a multipath remove */
362 if (tmp != NULL && strcmp(tmp, "0") == 0) {
363 class = EC_DEV_REMOVE;
364 subclass = ESC_DISK;
365 }
366 }
367 free(tmp2);
368 }
369
370 /*
371 * Special case an EC_DEV_ADD for scsi_debug devices
372 *
373 * These devices require a udevadm trigger command after
374 * creation in order to register the vdev_id scsidebug alias
375 * rule (adds a persistent path (phys_path) used for fault
376 * management automated tests in the ZFS test suite.
377 *
378 * After udevadm trigger command, event registers as a "change"
379 * event but needs to instead be handled as another "add" event
380 * to allow for disk labeling and partitioning to occur.
381 */
382 if (strcmp(class, EC_DEV_STATUS) == 0 &&
383 udev_device_get_property_value(dev, "ID_VDEV") &&
384 udev_device_get_property_value(dev, "ID_MODEL")) {
385 const char *id_model, *id_model_sd = "scsi_debug";
386
387 id_model = udev_device_get_property_value(dev,
388 "ID_MODEL");
389 if (strcmp(id_model, id_model_sd) == 0) {
390 class = EC_DEV_ADD;
391 subclass = ESC_DISK;
392 }
393 }
394
395 if ((nvl = dev_event_nvlist(dev)) != NULL) {
396 zed_udev_event(class, subclass, nvl);
397 nvlist_free(nvl);
398 }
399
400 udev_device_unref(dev);
401 }
402
403 return (NULL);
404 }
405
406 int
zed_disk_event_init(void)407 zed_disk_event_init(void)
408 {
409 int fd, fflags;
410
411 if ((g_udev = udev_new()) == NULL) {
412 zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
413 return (-1);
414 }
415
416 /* Set up a udev monitor for block devices */
417 g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
418 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
419 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
420 "partition");
421 udev_monitor_enable_receiving(g_mon);
422
423 /* Make sure monitoring socket is blocking */
424 fd = udev_monitor_get_fd(g_mon);
425 if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
426 (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
427
428 /* spawn a thread to monitor events */
429 if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
430 udev_monitor_unref(g_mon);
431 udev_unref(g_udev);
432 zed_log_msg(LOG_WARNING, "pthread_create failed");
433 return (-1);
434 }
435
436 pthread_setname_np(g_mon_tid, "udev monitor");
437 zed_log_msg(LOG_INFO, "zed_disk_event_init");
438
439 return (0);
440 }
441
442 void
zed_disk_event_fini(void)443 zed_disk_event_fini(void)
444 {
445 /* cancel monitor thread at recvmsg() */
446 (void) pthread_cancel(g_mon_tid);
447 (void) pthread_join(g_mon_tid, NULL);
448
449 /* cleanup udev resources */
450 udev_monitor_unref(g_mon);
451 udev_unref(g_udev);
452
453 zed_log_msg(LOG_INFO, "zed_disk_event_fini");
454 }
455
456 #else
457
458 #include "zed_disk_event.h"
459
/*
 * Built without libudev: there is no disk event monitor to start, so
 * initialization is a no-op that always reports success (0).
 */
int
zed_disk_event_init(void)
{
	return (0);
}
465
/*
 * Built without libudev: nothing was started by zed_disk_event_init(),
 * so there is nothing to tear down.
 */
void
zed_disk_event_fini(void)
{
}
470
471 #endif /* HAVE_LIBUDEV */
472