1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License Version 1.0 (CDDL-1.0). 7 * You can obtain a copy of the license from the top-level file 8 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. 9 * You may not use this file except in compliance with the license. 10 * 11 * CDDL HEADER END 12 */ 13 14 /* 15 * Copyright (c) 2016, 2017, Intel Corporation. 16 */ 17 18 #ifdef HAVE_LIBUDEV 19 20 #include <errno.h> 21 #include <fcntl.h> 22 #include <libnvpair.h> 23 #include <libudev.h> 24 #include <libzfs.h> 25 #include <libzutil.h> 26 #include <pthread.h> 27 #include <stdlib.h> 28 #include <string.h> 29 30 #include <sys/sysevent/eventdefs.h> 31 #include <sys/sysevent/dev.h> 32 33 #include "zed_log.h" 34 #include "zed_disk_event.h" 35 #include "agents/zfs_agents.h" 36 37 /* 38 * Portions of ZED need to see disk events for disks belonging to ZFS pools. 39 * A libudev monitor is established to monitor block device actions and pass 40 * them on to internal ZED logic modules. Initially, zfs_mod.c is the only 41 * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM 42 * module responsible for handling disk events for ZFS. 43 */ 44 45 pthread_t g_mon_tid; 46 struct udev *g_udev; 47 struct udev_monitor *g_mon; 48 49 50 #define DEV_BYID_PATH "/dev/disk/by-id/" 51 52 /* 64MB is minimum usable disk for ZFS */ 53 #define MINIMUM_SECTORS 131072ULL 54 55 56 /* 57 * Post disk event to SLM module 58 * 59 * occurs in the context of monitor thread 60 */ 61 static void 62 zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) 63 { 64 const char *strval; 65 uint64_t numval; 66 67 zed_log_msg(LOG_INFO, "zed_disk_event:"); 68 zed_log_msg(LOG_INFO, "\tclass: %s", class); 69 zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass); 70 if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0) 71 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval); 72 if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0) 73 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); 74 if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) 75 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); 76 if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE) 77 zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART); 78 if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) 79 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); 80 if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) 81 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); 82 if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0) 83 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval); 84 if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) 85 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); 86 if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) 87 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); 88 89 (void) zfs_agent_post_event(class, subclass, nvl); 90 } 91 92 /* 93 * dev_event_nvlist: place event schema into an nv pair list 94 * 95 * NAME VALUE (example) 96 * -------------- -------------------------------------------------------- 97 * DEV_NAME /dev/sdl 98 * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/... 99 * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC 100 * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0 101 * DEV_IS_PART --- 102 * DEV_SIZE 500107862016 103 * ZFS_EV_POOL_GUID 17523635698032189180 104 * ZFS_EV_VDEV_GUID 14663607734290803088 105 */ 106 static nvlist_t * 107 dev_event_nvlist(struct udev_device *dev) 108 { 109 nvlist_t *nvl; 110 char strval[128]; 111 const char *value, *path; 112 uint64_t guid; 113 114 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) 115 return (NULL); 116 117 if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0) 118 (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval); 119 if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0) 120 (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval); 121 if ((path = udev_device_get_devnode(dev)) != NULL) 122 (void) nvlist_add_string(nvl, DEV_NAME, path); 123 if ((value = udev_device_get_devpath(dev)) != NULL) 124 (void) nvlist_add_string(nvl, DEV_PATH, value); 125 value = udev_device_get_devtype(dev); 126 if ((value != NULL && strcmp("partition", value) == 0) || 127 (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER") 128 != NULL)) { 129 (void) nvlist_add_boolean(nvl, DEV_IS_PART); 130 } 131 if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) { 132 uint64_t numval = DEV_BSIZE; 133 134 numval *= strtoull(value, NULL, 10); 135 (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); 136 137 /* 138 * If the device has a parent, then get the parent block 139 * device's size as well. For example, /dev/sda1's parent 140 * is /dev/sda. 141 */ 142 struct udev_device *parent_dev = udev_device_get_parent(dev); 143 if (parent_dev != NULL && 144 (value = udev_device_get_sysattr_value(parent_dev, "size")) 145 != NULL) { 146 uint64_t numval = DEV_BSIZE; 147 148 numval *= strtoull(value, NULL, 10); 149 (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval); 150 } 151 } 152 153 /* 154 * Grab the pool and vdev guids from blkid cache 155 */ 156 value = udev_device_get_property_value(dev, "ID_FS_UUID"); 157 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) 158 (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid); 159 160 value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB"); 161 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) 162 (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid); 163 164 /* 165 * Either a vdev guid or a devid must be present for matching 166 */ 167 if (!nvlist_exists(nvl, DEV_IDENTIFIER) && 168 !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) { 169 nvlist_free(nvl); 170 return (NULL); 171 } 172 173 return (nvl); 174 } 175 176 /* 177 * Listen for block device uevents 178 */ 179 static void * 180 zed_udev_monitor(void *arg) 181 { 182 struct udev_monitor *mon = arg; 183 const char *tmp; 184 char *tmp2; 185 186 zed_log_msg(LOG_INFO, "Waiting for new udev disk events..."); 187 188 while (1) { 189 struct udev_device *dev; 190 const char *action, *type, *part, *sectors; 191 const char *bus, *uuid, *devpath; 192 const char *class, *subclass; 193 nvlist_t *nvl; 194 boolean_t is_zfs = B_FALSE; 195 196 /* allow a cancellation while blocked (recvmsg) */ 197 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); 198 199 /* blocks at recvmsg until an event occurs */ 200 if ((dev = udev_monitor_receive_device(mon)) == NULL) { 201 zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive " 202 "device error %d", errno); 203 continue; 204 } 205 206 /* allow all steps to complete before a cancellation */ 207 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); 208 209 /* 210 * Strongly typed device is the preferred filter 211 */ 212 type = udev_device_get_property_value(dev, "ID_FS_TYPE"); 213 if (type != NULL && type[0] != '\0') { 214 if (strcmp(type, "zfs_member") == 0) { 215 is_zfs = B_TRUE; 216 } else { 217 /* not ours, so skip */ 218 zed_log_msg(LOG_INFO, "zed_udev_monitor: skip " 219 "%s (in use by %s)", 220 udev_device_get_devnode(dev), type); 221 udev_device_unref(dev); 222 continue; 223 } 224 } 225 226 /* 227 * if this is a disk and it is partitioned, then the 228 * zfs label will reside in a DEVTYPE=partition and 229 * we can skip passing this event 230 * 231 * Special case: Blank disks are sometimes reported with 232 * an erroneous 'atari' partition, and should not be 233 * excluded from being used as an autoreplace disk: 234 * 235 * https://github.com/openzfs/zfs/issues/13497 236 */ 237 type = udev_device_get_property_value(dev, "DEVTYPE"); 238 part = udev_device_get_property_value(dev, 239 "ID_PART_TABLE_TYPE"); 240 if (type != NULL && type[0] != '\0' && 241 strcmp(type, "disk") == 0 && 242 part != NULL && part[0] != '\0') { 243 const char *devname = 244 udev_device_get_property_value(dev, "DEVNAME"); 245 246 if (strcmp(part, "atari") == 0) { 247 zed_log_msg(LOG_INFO, 248 "%s: %s is reporting an atari partition, " 249 "but we're going to assume it's a false " 250 "positive and still use it (issue #13497)", 251 __func__, devname); 252 } else { 253 zed_log_msg(LOG_INFO, 254 "%s: skip %s since it has a %s partition " 255 "already", __func__, devname, part); 256 /* skip and wait for partition event */ 257 udev_device_unref(dev); 258 continue; 259 } 260 } 261 262 /* 263 * ignore small partitions 264 */ 265 sectors = udev_device_get_property_value(dev, 266 "ID_PART_ENTRY_SIZE"); 267 if (sectors == NULL) 268 sectors = udev_device_get_sysattr_value(dev, "size"); 269 if (sectors != NULL && 270 strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) { 271 zed_log_msg(LOG_INFO, 272 "%s: %s sectors %s < %llu (minimum)", 273 __func__, 274 udev_device_get_property_value(dev, "DEVNAME"), 275 sectors, MINIMUM_SECTORS); 276 udev_device_unref(dev); 277 continue; 278 } 279 280 /* 281 * If the blkid probe didn't find ZFS, then a persistent 282 * device id string is required in the message schema 283 * for matching with vdevs. Preflight here for expected 284 * udev information. 285 * 286 * Special case: 287 * NVMe devices don't have ID_BUS set (at least on RHEL 7-8), 288 * but they are valid for autoreplace. Add a special case for 289 * them by searching for "/nvme/" in the udev DEVPATH: 290 * 291 * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1 292 */ 293 bus = udev_device_get_property_value(dev, "ID_BUS"); 294 uuid = udev_device_get_property_value(dev, "DM_UUID"); 295 devpath = udev_device_get_devpath(dev); 296 if (!is_zfs && (bus == NULL && uuid == NULL && 297 strstr(devpath, "/nvme/") == NULL)) { 298 zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " 299 "source", udev_device_get_devnode(dev)); 300 udev_device_unref(dev); 301 continue; 302 } 303 304 action = udev_device_get_action(dev); 305 if (strcmp(action, "add") == 0) { 306 class = EC_DEV_ADD; 307 subclass = ESC_DISK; 308 } else if (strcmp(action, "remove") == 0) { 309 class = EC_DEV_REMOVE; 310 subclass = ESC_DISK; 311 } else if (strcmp(action, "change") == 0) { 312 class = EC_DEV_STATUS; 313 subclass = ESC_DEV_DLE; 314 } else { 315 zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown", 316 action); 317 udev_device_unref(dev); 318 continue; 319 } 320 321 /* 322 * Special case an EC_DEV_ADD for multipath devices 323 * 324 * When a multipath device is created, udev reports the 325 * following: 326 * 327 * 1. "add" event of the dm device for the multipath device 328 * (like /dev/dm-3). 329 * 2. "change" event to create the actual multipath device 330 * symlink (like /dev/mapper/mpatha). The event also 331 * passes back the relevant DM vars we care about, like 332 * DM_UUID. 333 * 3. Another "change" event identical to #2 (that we ignore). 334 * 335 * To get the behavior we want, we treat the "change" event 336 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was 337 * a new disk being added. 338 */ 339 if (strcmp(class, EC_DEV_STATUS) == 0 && 340 udev_device_get_property_value(dev, "DM_UUID") && 341 udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { 342 tmp = udev_device_get_devnode(dev); 343 tmp2 = zfs_get_underlying_path(tmp); 344 if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) { 345 /* 346 * We have a real underlying device, which 347 * means that this multipath "change" event is 348 * an "add" event. 349 * 350 * If the multipath device and the underlying 351 * dev are the same name (i.e. /dev/dm-5), then 352 * there is no real underlying disk for this 353 * multipath device, and so this "change" event 354 * really is a multipath removal. 355 */ 356 class = EC_DEV_ADD; 357 subclass = ESC_DISK; 358 } else { 359 tmp = udev_device_get_property_value(dev, 360 "DM_NR_VALID_PATHS"); 361 /* treat as a multipath remove */ 362 if (tmp != NULL && strcmp(tmp, "0") == 0) { 363 class = EC_DEV_REMOVE; 364 subclass = ESC_DISK; 365 } 366 } 367 free(tmp2); 368 } 369 370 /* 371 * Special case an EC_DEV_ADD for scsi_debug devices 372 * 373 * These devices require a udevadm trigger command after 374 * creation in order to register the vdev_id scsidebug alias 375 * rule (adds a persistent path (phys_path) used for fault 376 * management automated tests in the ZFS test suite. 377 * 378 * After udevadm trigger command, event registers as a "change" 379 * event but needs to instead be handled as another "add" event 380 * to allow for disk labeling and partitioning to occur. 381 */ 382 if (strcmp(class, EC_DEV_STATUS) == 0 && 383 udev_device_get_property_value(dev, "ID_VDEV") && 384 udev_device_get_property_value(dev, "ID_MODEL")) { 385 const char *id_model, *id_model_sd = "scsi_debug"; 386 387 id_model = udev_device_get_property_value(dev, 388 "ID_MODEL"); 389 if (strcmp(id_model, id_model_sd) == 0) { 390 class = EC_DEV_ADD; 391 subclass = ESC_DISK; 392 } 393 } 394 395 if ((nvl = dev_event_nvlist(dev)) != NULL) { 396 zed_udev_event(class, subclass, nvl); 397 nvlist_free(nvl); 398 } 399 400 udev_device_unref(dev); 401 } 402 403 return (NULL); 404 } 405 406 int 407 zed_disk_event_init(void) 408 { 409 int fd, fflags; 410 411 if ((g_udev = udev_new()) == NULL) { 412 zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno); 413 return (-1); 414 } 415 416 /* Set up a udev monitor for block devices */ 417 g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); 418 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk"); 419 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", 420 "partition"); 421 udev_monitor_enable_receiving(g_mon); 422 423 /* Make sure monitoring socket is blocking */ 424 fd = udev_monitor_get_fd(g_mon); 425 if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK) 426 (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK); 427 428 /* spawn a thread to monitor events */ 429 if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) { 430 udev_monitor_unref(g_mon); 431 udev_unref(g_udev); 432 zed_log_msg(LOG_WARNING, "pthread_create failed"); 433 return (-1); 434 } 435 436 pthread_setname_np(g_mon_tid, "udev monitor"); 437 zed_log_msg(LOG_INFO, "zed_disk_event_init"); 438 439 return (0); 440 } 441 442 void 443 zed_disk_event_fini(void) 444 { 445 /* cancel monitor thread at recvmsg() */ 446 (void) pthread_cancel(g_mon_tid); 447 (void) pthread_join(g_mon_tid, NULL); 448 449 /* cleanup udev resources */ 450 udev_monitor_unref(g_mon); 451 udev_unref(g_udev); 452 453 zed_log_msg(LOG_INFO, "zed_disk_event_fini"); 454 } 455 456 #else 457 458 #include "zed_disk_event.h" 459 460 int 461 zed_disk_event_init(void) 462 { 463 return (0); 464 } 465 466 void 467 zed_disk_event_fini(void) 468 { 469 } 470 471 #endif /* HAVE_LIBUDEV */ 472