1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License Version 1.0 (CDDL-1.0). 6 * You can obtain a copy of the license from the top-level file 7 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. 8 * You may not use this file except in compliance with the license. 9 * 10 * CDDL HEADER END 11 */ 12 13 /* 14 * Copyright (c) 2016, 2017, Intel Corporation. 15 */ 16 17 #ifdef HAVE_LIBUDEV 18 19 #include <errno.h> 20 #include <fcntl.h> 21 #include <libnvpair.h> 22 #include <libudev.h> 23 #include <libzfs.h> 24 #include <libzutil.h> 25 #include <pthread.h> 26 #include <stdlib.h> 27 #include <string.h> 28 29 #include <sys/sysevent/eventdefs.h> 30 #include <sys/sysevent/dev.h> 31 32 #include "zed_log.h" 33 #include "zed_disk_event.h" 34 #include "agents/zfs_agents.h" 35 36 /* 37 * Portions of ZED need to see disk events for disks belonging to ZFS pools. 38 * A libudev monitor is established to monitor block device actions and pass 39 * them on to internal ZED logic modules. Initially, zfs_mod.c is the only 40 * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM 41 * module responsible for handling disk events for ZFS. 42 */ 43 44 pthread_t g_mon_tid; 45 struct udev *g_udev; 46 struct udev_monitor *g_mon; 47 48 49 #define DEV_BYID_PATH "/dev/disk/by-id/" 50 51 /* 64MB is minimum usable disk for ZFS */ 52 #define MINIMUM_SECTORS 131072ULL 53 54 55 /* 56 * Post disk event to SLM module 57 * 58 * occurs in the context of monitor thread 59 */ 60 static void 61 zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) 62 { 63 const char *strval; 64 uint64_t numval; 65 66 zed_log_msg(LOG_INFO, "zed_disk_event:"); 67 zed_log_msg(LOG_INFO, "\tclass: %s", class); 68 zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass); 69 if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0) 70 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval); 71 if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0) 72 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); 73 if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) 74 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); 75 if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE) 76 zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART); 77 if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) 78 zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); 79 if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) 80 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); 81 if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0) 82 zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval); 83 if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) 84 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); 85 if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) 86 zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); 87 88 (void) zfs_agent_post_event(class, subclass, nvl); 89 } 90 91 /* 92 * dev_event_nvlist: place event schema into an nv pair list 93 * 94 * NAME VALUE (example) 95 * -------------- -------------------------------------------------------- 96 * DEV_NAME /dev/sdl 97 * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/... 98 * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC 99 * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0 100 * DEV_IS_PART --- 101 * DEV_SIZE 500107862016 102 * ZFS_EV_POOL_GUID 17523635698032189180 103 * ZFS_EV_VDEV_GUID 14663607734290803088 104 */ 105 static nvlist_t * 106 dev_event_nvlist(struct udev_device *dev) 107 { 108 nvlist_t *nvl; 109 char strval[128]; 110 const char *value, *path; 111 uint64_t guid; 112 113 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) 114 return (NULL); 115 116 if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0) 117 (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval); 118 if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0) 119 (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval); 120 if ((path = udev_device_get_devnode(dev)) != NULL) 121 (void) nvlist_add_string(nvl, DEV_NAME, path); 122 if ((value = udev_device_get_devpath(dev)) != NULL) 123 (void) nvlist_add_string(nvl, DEV_PATH, value); 124 value = udev_device_get_devtype(dev); 125 if ((value != NULL && strcmp("partition", value) == 0) || 126 (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER") 127 != NULL)) { 128 (void) nvlist_add_boolean(nvl, DEV_IS_PART); 129 } 130 if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) { 131 uint64_t numval = DEV_BSIZE; 132 133 numval *= strtoull(value, NULL, 10); 134 (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); 135 136 /* 137 * If the device has a parent, then get the parent block 138 * device's size as well. For example, /dev/sda1's parent 139 * is /dev/sda. 140 */ 141 struct udev_device *parent_dev = udev_device_get_parent(dev); 142 if (parent_dev != NULL && 143 (value = udev_device_get_sysattr_value(parent_dev, "size")) 144 != NULL) { 145 uint64_t numval = DEV_BSIZE; 146 147 numval *= strtoull(value, NULL, 10); 148 (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval); 149 } 150 } 151 152 /* 153 * Grab the pool and vdev guids from blkid cache 154 */ 155 value = udev_device_get_property_value(dev, "ID_FS_UUID"); 156 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) 157 (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid); 158 159 value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB"); 160 if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) 161 (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid); 162 163 /* 164 * Either a vdev guid or a devid must be present for matching 165 */ 166 if (!nvlist_exists(nvl, DEV_IDENTIFIER) && 167 !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) { 168 nvlist_free(nvl); 169 return (NULL); 170 } 171 172 return (nvl); 173 } 174 175 /* 176 * Listen for block device uevents 177 */ 178 static void * 179 zed_udev_monitor(void *arg) 180 { 181 struct udev_monitor *mon = arg; 182 const char *tmp; 183 char *tmp2; 184 185 zed_log_msg(LOG_INFO, "Waiting for new udev disk events..."); 186 187 while (1) { 188 struct udev_device *dev; 189 const char *action, *type, *part, *sectors; 190 const char *bus, *uuid, *devpath; 191 const char *class, *subclass; 192 nvlist_t *nvl; 193 boolean_t is_zfs = B_FALSE; 194 195 /* allow a cancellation while blocked (recvmsg) */ 196 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); 197 198 /* blocks at recvmsg until an event occurs */ 199 if ((dev = udev_monitor_receive_device(mon)) == NULL) { 200 zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive " 201 "device error %d", errno); 202 continue; 203 } 204 205 /* allow all steps to complete before a cancellation */ 206 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); 207 208 /* 209 * Strongly typed device is the preferred filter 210 */ 211 type = udev_device_get_property_value(dev, "ID_FS_TYPE"); 212 if (type != NULL && type[0] != '\0') { 213 if (strcmp(type, "zfs_member") == 0) { 214 is_zfs = B_TRUE; 215 } else { 216 /* not ours, so skip */ 217 zed_log_msg(LOG_INFO, "zed_udev_monitor: skip " 218 "%s (in use by %s)", 219 udev_device_get_devnode(dev), type); 220 udev_device_unref(dev); 221 continue; 222 } 223 } 224 225 /* 226 * if this is a disk and it is partitioned, then the 227 * zfs label will reside in a DEVTYPE=partition and 228 * we can skip passing this event 229 * 230 * Special case: Blank disks are sometimes reported with 231 * an erroneous 'atari' partition, and should not be 232 * excluded from being used as an autoreplace disk: 233 * 234 * https://github.com/openzfs/zfs/issues/13497 235 */ 236 type = udev_device_get_property_value(dev, "DEVTYPE"); 237 part = udev_device_get_property_value(dev, 238 "ID_PART_TABLE_TYPE"); 239 if (type != NULL && type[0] != '\0' && 240 strcmp(type, "disk") == 0 && 241 part != NULL && part[0] != '\0') { 242 const char *devname = 243 udev_device_get_property_value(dev, "DEVNAME"); 244 245 if (strcmp(part, "atari") == 0) { 246 zed_log_msg(LOG_INFO, 247 "%s: %s is reporting an atari partition, " 248 "but we're going to assume it's a false " 249 "positive and still use it (issue #13497)", 250 __func__, devname); 251 } else { 252 zed_log_msg(LOG_INFO, 253 "%s: skip %s since it has a %s partition " 254 "already", __func__, devname, part); 255 /* skip and wait for partition event */ 256 udev_device_unref(dev); 257 continue; 258 } 259 } 260 261 /* 262 * ignore small partitions 263 */ 264 sectors = udev_device_get_property_value(dev, 265 "ID_PART_ENTRY_SIZE"); 266 if (sectors == NULL) 267 sectors = udev_device_get_sysattr_value(dev, "size"); 268 if (sectors != NULL && 269 strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) { 270 zed_log_msg(LOG_INFO, 271 "%s: %s sectors %s < %llu (minimum)", 272 __func__, 273 udev_device_get_property_value(dev, "DEVNAME"), 274 sectors, MINIMUM_SECTORS); 275 udev_device_unref(dev); 276 continue; 277 } 278 279 /* 280 * If the blkid probe didn't find ZFS, then a persistent 281 * device id string is required in the message schema 282 * for matching with vdevs. Preflight here for expected 283 * udev information. 284 * 285 * Special case: 286 * NVMe devices don't have ID_BUS set (at least on RHEL 7-8), 287 * but they are valid for autoreplace. Add a special case for 288 * them by searching for "/nvme/" in the udev DEVPATH: 289 * 290 * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1 291 */ 292 bus = udev_device_get_property_value(dev, "ID_BUS"); 293 uuid = udev_device_get_property_value(dev, "DM_UUID"); 294 devpath = udev_device_get_devpath(dev); 295 if (!is_zfs && (bus == NULL && uuid == NULL && 296 strstr(devpath, "/nvme/") == NULL)) { 297 zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " 298 "source", udev_device_get_devnode(dev)); 299 udev_device_unref(dev); 300 continue; 301 } 302 303 action = udev_device_get_action(dev); 304 if (strcmp(action, "add") == 0) { 305 class = EC_DEV_ADD; 306 subclass = ESC_DISK; 307 } else if (strcmp(action, "remove") == 0) { 308 class = EC_DEV_REMOVE; 309 subclass = ESC_DISK; 310 } else if (strcmp(action, "change") == 0) { 311 class = EC_DEV_STATUS; 312 subclass = ESC_DEV_DLE; 313 } else { 314 zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown", 315 action); 316 udev_device_unref(dev); 317 continue; 318 } 319 320 /* 321 * Special case an EC_DEV_ADD for multipath devices 322 * 323 * When a multipath device is created, udev reports the 324 * following: 325 * 326 * 1. "add" event of the dm device for the multipath device 327 * (like /dev/dm-3). 328 * 2. "change" event to create the actual multipath device 329 * symlink (like /dev/mapper/mpatha). The event also 330 * passes back the relevant DM vars we care about, like 331 * DM_UUID. 332 * 3. Another "change" event identical to #2 (that we ignore). 333 * 334 * To get the behavior we want, we treat the "change" event 335 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was 336 * a new disk being added. 337 */ 338 if (strcmp(class, EC_DEV_STATUS) == 0 && 339 udev_device_get_property_value(dev, "DM_UUID") && 340 udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { 341 tmp = udev_device_get_devnode(dev); 342 tmp2 = zfs_get_underlying_path(tmp); 343 if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) { 344 /* 345 * We have a real underlying device, which 346 * means that this multipath "change" event is 347 * an "add" event. 348 * 349 * If the multipath device and the underlying 350 * dev are the same name (i.e. /dev/dm-5), then 351 * there is no real underlying disk for this 352 * multipath device, and so this "change" event 353 * really is a multipath removal. 354 */ 355 class = EC_DEV_ADD; 356 subclass = ESC_DISK; 357 } else { 358 tmp = udev_device_get_property_value(dev, 359 "DM_NR_VALID_PATHS"); 360 /* treat as a multipath remove */ 361 if (tmp != NULL && strcmp(tmp, "0") == 0) { 362 class = EC_DEV_REMOVE; 363 subclass = ESC_DISK; 364 } 365 } 366 free(tmp2); 367 } 368 369 /* 370 * Special case an EC_DEV_ADD for scsi_debug devices 371 * 372 * These devices require a udevadm trigger command after 373 * creation in order to register the vdev_id scsidebug alias 374 * rule (adds a persistent path (phys_path) used for fault 375 * management automated tests in the ZFS test suite. 376 * 377 * After udevadm trigger command, event registers as a "change" 378 * event but needs to instead be handled as another "add" event 379 * to allow for disk labeling and partitioning to occur. 380 */ 381 if (strcmp(class, EC_DEV_STATUS) == 0 && 382 udev_device_get_property_value(dev, "ID_VDEV") && 383 udev_device_get_property_value(dev, "ID_MODEL")) { 384 const char *id_model, *id_model_sd = "scsi_debug"; 385 386 id_model = udev_device_get_property_value(dev, 387 "ID_MODEL"); 388 if (strcmp(id_model, id_model_sd) == 0) { 389 class = EC_DEV_ADD; 390 subclass = ESC_DISK; 391 } 392 } 393 394 if ((nvl = dev_event_nvlist(dev)) != NULL) { 395 zed_udev_event(class, subclass, nvl); 396 nvlist_free(nvl); 397 } 398 399 udev_device_unref(dev); 400 } 401 402 return (NULL); 403 } 404 405 int 406 zed_disk_event_init(void) 407 { 408 int fd, fflags; 409 410 if ((g_udev = udev_new()) == NULL) { 411 zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno); 412 return (-1); 413 } 414 415 /* Set up a udev monitor for block devices */ 416 g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); 417 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk"); 418 udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", 419 "partition"); 420 udev_monitor_enable_receiving(g_mon); 421 422 /* Make sure monitoring socket is blocking */ 423 fd = udev_monitor_get_fd(g_mon); 424 if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK) 425 (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK); 426 427 /* spawn a thread to monitor events */ 428 if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) { 429 udev_monitor_unref(g_mon); 430 udev_unref(g_udev); 431 zed_log_msg(LOG_WARNING, "pthread_create failed"); 432 return (-1); 433 } 434 435 pthread_setname_np(g_mon_tid, "udev monitor"); 436 zed_log_msg(LOG_INFO, "zed_disk_event_init"); 437 438 return (0); 439 } 440 441 void 442 zed_disk_event_fini(void) 443 { 444 /* cancel monitor thread at recvmsg() */ 445 (void) pthread_cancel(g_mon_tid); 446 (void) pthread_join(g_mon_tid, NULL); 447 448 /* cleanup udev resources */ 449 udev_monitor_unref(g_mon); 450 udev_unref(g_udev); 451 452 zed_log_msg(LOG_INFO, "zed_disk_event_fini"); 453 } 454 455 #else 456 457 #include "zed_disk_event.h" 458 459 int 460 zed_disk_event_init(void) 461 { 462 return (0); 463 } 464 465 void 466 zed_disk_event_fini(void) 467 { 468 } 469 470 #endif /* HAVE_LIBUDEV */ 471