xref: /freebsd/sys/contrib/openzfs/cmd/zed/zed_disk_event.c (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License Version 1.0 (CDDL-1.0).
6  * You can obtain a copy of the license from the top-level file
7  * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
8  * You may not use this file except in compliance with the license.
9  *
10  * CDDL HEADER END
11  */
12 
13 /*
14  * Copyright (c) 2016, 2017, Intel Corporation.
15  */
16 
17 #ifdef HAVE_LIBUDEV
18 
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <libnvpair.h>
22 #include <libudev.h>
23 #include <libzfs.h>
24 #include <libzutil.h>
25 #include <pthread.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 #include <sys/sysevent/eventdefs.h>
30 #include <sys/sysevent/dev.h>
31 
32 #include "zed_log.h"
33 #include "zed_disk_event.h"
34 #include "agents/zfs_agents.h"
35 
36 /*
37  * Portions of ZED need to see disk events for disks belonging to ZFS pools.
38  * A libudev monitor is established to monitor block device actions and pass
39  * them on to internal ZED logic modules.  Initially, zfs_mod.c is the only
40  * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
41  * module responsible for handling disk events for ZFS.
42  */
43 
/*
 * State shared between zed_disk_event_init() and zed_disk_event_fini().
 */

/* thread running zed_udev_monitor() */
pthread_t g_mon_tid;

/* libudev context obtained from udev_new() */
struct udev *g_udev;

/* netlink monitor delivering block device uevents */
struct udev_monitor *g_mon;
47 
48 
#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/*
 * 64MB is minimum usable disk for ZFS (131072 sectors of 512 bytes).
 *
 * The constant is unsigned long long so that it compares naturally with
 * strtoull() results and matches the "%llu" conversion it is logged with.
 */
#define	MINIMUM_SECTORS		131072ULL
53 
54 
55 /*
56  * Post disk event to SLM module
57  *
58  * occurs in the context of monitor thread
59  */
60 static void
61 zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
62 {
63 	char *strval;
64 	uint64_t numval;
65 
66 	zed_log_msg(LOG_INFO, "zed_disk_event:");
67 	zed_log_msg(LOG_INFO, "\tclass: %s", class);
68 	zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
69 	if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
70 		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
71 	if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
72 		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
73 	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
74 		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
75 	if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
76 		zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
77 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
78 		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
79 	if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
80 		zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
81 	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
82 		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
83 	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
84 		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
85 
86 	(void) zfs_agent_post_event(class, subclass, nvl);
87 }
88 
89 /*
90  * dev_event_nvlist: place event schema into an nv pair list
91  *
92  * NAME			VALUE (example)
93  * --------------	--------------------------------------------------------
94  * DEV_NAME		/dev/sdl
95  * DEV_PATH		/devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
96  * DEV_IDENTIFIER	ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
97  * DEV_PHYS_PATH	pci-0000:04:00.0-sas-0x4433221101000000-lun-0
98  * DEV_IS_PART		---
99  * DEV_SIZE		500107862016
100  * ZFS_EV_POOL_GUID	17523635698032189180
101  * ZFS_EV_VDEV_GUID	14663607734290803088
102  */
103 static nvlist_t *
104 dev_event_nvlist(struct udev_device *dev)
105 {
106 	nvlist_t *nvl;
107 	char strval[128];
108 	const char *value, *path;
109 	uint64_t guid;
110 
111 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
112 		return (NULL);
113 
114 	if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
115 		(void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
116 	if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
117 		(void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
118 	if ((path = udev_device_get_devnode(dev)) != NULL)
119 		(void) nvlist_add_string(nvl, DEV_NAME, path);
120 	if ((value = udev_device_get_devpath(dev)) != NULL)
121 		(void) nvlist_add_string(nvl, DEV_PATH, value);
122 	value = udev_device_get_devtype(dev);
123 	if ((value != NULL && strcmp("partition", value) == 0) ||
124 	    (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
125 	    != NULL)) {
126 		(void) nvlist_add_boolean(nvl, DEV_IS_PART);
127 	}
128 	if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
129 		uint64_t numval = DEV_BSIZE;
130 
131 		numval *= strtoull(value, NULL, 10);
132 		(void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
133 	}
134 
135 	/*
136 	 * Grab the pool and vdev guids from blkid cache
137 	 */
138 	value = udev_device_get_property_value(dev, "ID_FS_UUID");
139 	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
140 		(void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
141 
142 	value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
143 	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
144 		(void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
145 
146 	/*
147 	 * Either a vdev guid or a devid must be present for matching
148 	 */
149 	if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
150 	    !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
151 		nvlist_free(nvl);
152 		return (NULL);
153 	}
154 
155 	return (nvl);
156 }
157 
158 /*
159  *  Listen for block device uevents
160  */
161 static void *
162 zed_udev_monitor(void *arg)
163 {
164 	struct udev_monitor *mon = arg;
165 	char *tmp, *tmp2;
166 
167 	zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
168 
169 	while (1) {
170 		struct udev_device *dev;
171 		const char *action, *type, *part, *sectors;
172 		const char *bus, *uuid;
173 		const char *class, *subclass;
174 		nvlist_t *nvl;
175 		boolean_t is_zfs = B_FALSE;
176 
177 		/* allow a cancellation while blocked (recvmsg) */
178 		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
179 
180 		/* blocks at recvmsg until an event occurs */
181 		if ((dev = udev_monitor_receive_device(mon)) == NULL) {
182 			zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
183 			    "device error %d", errno);
184 			continue;
185 		}
186 
187 		/* allow all steps to complete before a cancellation */
188 		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
189 
190 		/*
191 		 * Strongly typed device is the preferred filter
192 		 */
193 		type = udev_device_get_property_value(dev, "ID_FS_TYPE");
194 		if (type != NULL && type[0] != '\0') {
195 			if (strcmp(type, "zfs_member") == 0) {
196 				is_zfs = B_TRUE;
197 			} else {
198 				/* not ours, so skip */
199 				zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
200 				    "%s (in use by %s)",
201 				    udev_device_get_devnode(dev), type);
202 				udev_device_unref(dev);
203 				continue;
204 			}
205 		}
206 
207 		/*
208 		 * if this is a disk and it is partitioned, then the
209 		 * zfs label will reside in a DEVTYPE=partition and
210 		 * we can skip passing this event
211 		 */
212 		type = udev_device_get_property_value(dev, "DEVTYPE");
213 		part = udev_device_get_property_value(dev,
214 		    "ID_PART_TABLE_TYPE");
215 		if (type != NULL && type[0] != '\0' &&
216 		    strcmp(type, "disk") == 0 &&
217 		    part != NULL && part[0] != '\0') {
218 			zed_log_msg(LOG_INFO,
219 			    "%s: skip %s since it has a %s partition already",
220 			    __func__,
221 			    udev_device_get_property_value(dev, "DEVNAME"),
222 			    part);
223 			/* skip and wait for partition event */
224 			udev_device_unref(dev);
225 			continue;
226 		}
227 
228 		/*
229 		 * ignore small partitions
230 		 */
231 		sectors = udev_device_get_property_value(dev,
232 		    "ID_PART_ENTRY_SIZE");
233 		if (sectors == NULL)
234 			sectors = udev_device_get_sysattr_value(dev, "size");
235 		if (sectors != NULL &&
236 		    strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
237 			zed_log_msg(LOG_INFO,
238 			    "%s: %s sectors %s < %llu (minimum)",
239 			    __func__,
240 			    udev_device_get_property_value(dev, "DEVNAME"),
241 			    sectors, MINIMUM_SECTORS);
242 			udev_device_unref(dev);
243 			continue;
244 		}
245 
246 		/*
247 		 * If the blkid probe didn't find ZFS, then a persistent
248 		 * device id string is required in the message schema
249 		 * for matching with vdevs. Preflight here for expected
250 		 * udev information.
251 		 */
252 		bus = udev_device_get_property_value(dev, "ID_BUS");
253 		uuid = udev_device_get_property_value(dev, "DM_UUID");
254 		if (!is_zfs && (bus == NULL && uuid == NULL)) {
255 			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
256 			    "source", udev_device_get_devnode(dev));
257 			udev_device_unref(dev);
258 			continue;
259 		}
260 
261 		action = udev_device_get_action(dev);
262 		if (strcmp(action, "add") == 0) {
263 			class = EC_DEV_ADD;
264 			subclass = ESC_DISK;
265 		} else if (strcmp(action, "remove") == 0) {
266 			class = EC_DEV_REMOVE;
267 			subclass = ESC_DISK;
268 		} else if (strcmp(action, "change") == 0) {
269 			class = EC_DEV_STATUS;
270 			subclass = ESC_DEV_DLE;
271 		} else {
272 			zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
273 			    action);
274 			udev_device_unref(dev);
275 			continue;
276 		}
277 
278 		/*
279 		 * Special case an EC_DEV_ADD for multipath devices
280 		 *
281 		 * When a multipath device is created, udev reports the
282 		 * following:
283 		 *
284 		 * 1.	"add" event of the dm device for the multipath device
285 		 *	(like /dev/dm-3).
286 		 * 2.	"change" event to create the actual multipath device
287 		 *	symlink (like /dev/mapper/mpatha).  The event also
288 		 *	passes back the relevant DM vars we care about, like
289 		 *	DM_UUID.
290 		 * 3.	Another "change" event identical to #2 (that we ignore).
291 		 *
292 		 * To get the behavior we want, we treat the "change" event
293 		 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
294 		 * a new disk being added.
295 		 */
296 		if (strcmp(class, EC_DEV_STATUS) == 0 &&
297 		    udev_device_get_property_value(dev, "DM_UUID") &&
298 		    udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
299 			tmp = (char *)udev_device_get_devnode(dev);
300 			tmp2 = zfs_get_underlying_path(tmp);
301 			if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
302 				/*
303 				 * We have a real underlying device, which
304 				 * means that this multipath "change" event is
305 				 * an "add" event.
306 				 *
307 				 * If the multipath device and the underlying
308 				 * dev are the same name (i.e. /dev/dm-5), then
309 				 * there is no real underlying disk for this
310 				 * multipath device, and so this "change" event
311 				 * really is a multipath removal.
312 				 */
313 				class = EC_DEV_ADD;
314 				subclass = ESC_DISK;
315 			} else {
316 				tmp = (char *)
317 				    udev_device_get_property_value(dev,
318 				    "DM_NR_VALID_PATHS");
319 				/* treat as a multipath remove */
320 				if (tmp != NULL && strcmp(tmp, "0") == 0) {
321 					class = EC_DEV_REMOVE;
322 					subclass = ESC_DISK;
323 				}
324 			}
325 			free(tmp2);
326 		}
327 
328 		/*
329 		 * Special case an EC_DEV_ADD for scsi_debug devices
330 		 *
331 		 * These devices require a udevadm trigger command after
332 		 * creation in order to register the vdev_id scsidebug alias
333 		 * rule (adds a persistent path (phys_path) used for fault
334 		 * management automated tests in the ZFS test suite.
335 		 *
336 		 * After udevadm trigger command, event registers as a "change"
337 		 * event but needs to instead be handled as another "add" event
338 		 * to allow for disk labeling and partitioning to occur.
339 		 */
340 		if (strcmp(class, EC_DEV_STATUS) == 0 &&
341 		    udev_device_get_property_value(dev, "ID_VDEV") &&
342 		    udev_device_get_property_value(dev, "ID_MODEL")) {
343 			const char *id_model, *id_model_sd = "scsi_debug";
344 
345 			id_model = udev_device_get_property_value(dev,
346 			    "ID_MODEL");
347 			if (strcmp(id_model, id_model_sd) == 0) {
348 				class = EC_DEV_ADD;
349 				subclass = ESC_DISK;
350 			}
351 		}
352 
353 		if ((nvl = dev_event_nvlist(dev)) != NULL) {
354 			zed_udev_event(class, subclass, nvl);
355 			nvlist_free(nvl);
356 		}
357 
358 		udev_device_unref(dev);
359 	}
360 
361 	return (NULL);
362 }
363 
364 int
365 zed_disk_event_init(void)
366 {
367 	int fd, fflags;
368 
369 	if ((g_udev = udev_new()) == NULL) {
370 		zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
371 		return (-1);
372 	}
373 
374 	/* Set up a udev monitor for block devices */
375 	g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
376 	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
377 	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
378 	    "partition");
379 	udev_monitor_enable_receiving(g_mon);
380 
381 	/* Make sure monitoring socket is blocking */
382 	fd = udev_monitor_get_fd(g_mon);
383 	if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
384 		(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
385 
386 	/* spawn a thread to monitor events */
387 	if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
388 		udev_monitor_unref(g_mon);
389 		udev_unref(g_udev);
390 		zed_log_msg(LOG_WARNING, "pthread_create failed");
391 		return (-1);
392 	}
393 
394 	pthread_setname_np(g_mon_tid, "udev monitor");
395 	zed_log_msg(LOG_INFO, "zed_disk_event_init");
396 
397 	return (0);
398 }
399 
400 void
401 zed_disk_event_fini(void)
402 {
403 	/* cancel monitor thread at recvmsg() */
404 	(void) pthread_cancel(g_mon_tid);
405 	(void) pthread_join(g_mon_tid, NULL);
406 
407 	/* cleanup udev resources */
408 	udev_monitor_unref(g_mon);
409 	udev_unref(g_udev);
410 
411 	zed_log_msg(LOG_INFO, "zed_disk_event_fini");
412 }
413 
414 #else
415 
416 #include "zed_disk_event.h"
417 
/*
 * Built without libudev: there is no disk event monitor to start, so
 * initialization trivially succeeds.
 */
int
zed_disk_event_init(void)
{
	return (0);
}
423 
/*
 * Built without libudev: nothing was started, so there is nothing to stop.
 */
void
zed_disk_event_fini(void)
{
}
428 
429 #endif /* HAVE_LIBUDEV */
430