// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/bitmap.h>

#include "dm-core.h"

#define DM_MSG_PREFIX "zone"

/*
 * For internal zone reports bypassing the top BIO submission path.
 */
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
				  sector_t sector, unsigned int nr_zones,
				  report_zones_cb cb, void *data)
{
	struct gendisk *disk = md->disk;
	int ret;
	struct dm_report_zones_args args = {
		.next_sector = sector,
		.orig_data = data,
		.orig_cb = cb,
	};

	do {
		struct dm_target *tgt;

		tgt = dm_table_find_target(t, args.next_sector);
		if (WARN_ON_ONCE(!tgt->type->report_zones))
			return -EIO;

		args.tgt = tgt;
		ret = tgt->type->report_zones(tgt, &args,
					      nr_zones - args.zone_idx);
		if (ret < 0)
			return ret;
	} while (args.zone_idx < nr_zones &&
		 args.next_sector < get_capacity(disk));

	return args.zone_idx;
}

/*
 * User-facing report_zones operation for a dm block device. This calls the
 * report_zones operation of each target of the device table. Targets
 * generally implement this operation using dm_report_zones().
 */
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct mapped_device *md = disk->private_data;
	struct dm_table *map;
	struct dm_table *zone_revalidate_map = md->zone_revalidate_map;
	int srcu_idx, ret = -EIO;
	bool put_table = false;

	if (!zone_revalidate_map || md->revalidate_map_task != current) {
		/*
		 * Regular user context, or zone revalidation during __bind()
		 * is in progress but this call comes from a different task.
		 */
		if (dm_suspended_md(md))
			return -EAGAIN;

		map = dm_get_live_table(md, &srcu_idx);
		put_table = true;
	} else {
		/* Zone revalidation during __bind() */
		map = zone_revalidate_map;
	}

	if (map)
		ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb,
					     data);

	if (put_table)
		dm_put_live_table(md, srcu_idx);

	return ret;
}

static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
			      void *data)
{
	struct dm_report_zones_args *args = data;
	sector_t sector_diff = args->tgt->begin - args->start;

	/*
	 * Ignore zones beyond the target range.
	 */
	if (zone->start >= args->start + args->tgt->len)
		return 0;

	/*
	 * Remap the start sector and write pointer position of the zone
	 * to match its position in the target range.
	 */
	zone->start += sector_diff;
	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = zone->start + zone->len;
		else if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->wp = zone->start;
		else
			zone->wp += sector_diff;
	}

	args->next_sector = zone->start + zone->len;
	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
}

/*
 * Helper for drivers of zoned targets to implement struct target_type
 * report_zones operation.
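 *
 * @start must be the target mapping start sector, that is, the offset of the
 * target's data on @bdev: dm_report_zones_cb() uses it together with
 * ti->begin to remap the reported zone start and write pointer into the
 * mapped device's sector space. @sector and @nr_zones select the range of
 * zones to report on @bdev.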
 */
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
		    struct dm_report_zones_args *args, unsigned int nr_zones)
{
	/*
	 * Set the target mapping start sector first so that
	 * dm_report_zones_cb() can correctly remap zone information.
	 */
	args->start = start;

	return blkdev_report_zones(bdev, sector, nr_zones,
				   dm_report_zones_cb, args);
}
EXPORT_SYMBOL_GPL(dm_report_zones);

bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
	struct request_queue *q = md->queue;

	if (!blk_queue_is_zoned(q))
		return false;

	switch (bio_op(bio)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
	default:
		return false;
	}
}

/*
 * Revalidate the zones of a mapped device to initialize the resources
 * necessary for zone append emulation. Note that we cannot simply use the
 * block layer blk_revalidate_disk_zones() function here as the mapped device
 * is suspended (this is called from __bind() context).
 */
int dm_revalidate_zones(struct dm_table *t, struct request_queue *q)
{
	struct mapped_device *md = t->md;
	struct gendisk *disk = md->disk;
	unsigned int nr_zones = disk->nr_zones;
	int ret;

	if (!get_capacity(disk))
		return 0;

	/*
	 * Do not revalidate if zone write plug resources have already
	 * been allocated.
	 */
	if (dm_has_zone_plugs(md))
		return 0;

	DMINFO("%s using %s zone append", disk->disk_name,
	       queue_emulates_zone_append(q) ? "emulated" : "native");

	/*
	 * Our table is not live yet. So the call to dm_get_live_table()
	 * in dm_blk_report_zones() will fail. Set a temporary pointer to
	 * our table for dm_blk_report_zones() to use directly.
	 */
	md->zone_revalidate_map = t;
	md->revalidate_map_task = current;
	ret = blk_revalidate_disk_zones(disk);
	md->revalidate_map_task = NULL;
	md->zone_revalidate_map = NULL;

	if (ret) {
		DMERR("Revalidate zones failed %d", ret);
		disk->nr_zones = nr_zones;
		return ret;
	}

	md->nr_zones = disk->nr_zones;

	return 0;
}

static int device_not_zone_append_capable(struct dm_target *ti,
					  struct dm_dev *dev, sector_t start,
					  sector_t len, void *data)
{
	return !bdev_is_zoned(dev->bdev);
}

static bool dm_table_supports_zone_append(struct dm_table *t)
{
	for (unsigned int i = 0; i < t->num_targets; i++) {
		struct dm_target *ti = dm_table_get_target(t, i);

		if (ti->emulate_zone_append)
			return false;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
			return false;
	}

	return true;
}

struct dm_device_zone_count {
	sector_t start;
	sector_t len;
	unsigned int total_nr_seq_zones;
	unsigned int target_nr_seq_zones;
};

/*
 * Count the total number of sequential zones of a target zoned device and,
 * among those, the number of zones mapped by the target.
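 *
 * The start and len fields of struct dm_device_zone_count define the range
 * of the device mapped by the target: sequential zones whose start sector
 * falls inside [start, start + len) are counted in target_nr_seq_zones.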
 */
static int dm_device_count_zones_cb(struct blk_zone *zone,
				    unsigned int idx, void *data)
{
	struct dm_device_zone_count *zc = data;

	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
		zc->total_nr_seq_zones++;
		if (zone->start >= zc->start &&
		    zone->start < zc->start + zc->len)
			zc->target_nr_seq_zones++;
	}

	return 0;
}

static int dm_device_count_zones(struct dm_dev *dev,
				 struct dm_device_zone_count *zc)
{
	int ret;

	ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES,
				  dm_device_count_zones_cb, zc);
	if (ret < 0)
		return ret;
	if (!ret)
		return -EIO;
	return 0;
}

struct dm_zone_resource_limits {
	unsigned int mapped_nr_seq_zones;
	struct queue_limits *lim;
	bool reliable_limits;
};

static int device_get_zone_resource_limits(struct dm_target *ti,
					   struct dm_dev *dev, sector_t start,
					   sector_t len, void *data)
{
	struct dm_zone_resource_limits *zlim = data;
	struct gendisk *disk = dev->bdev->bd_disk;
	unsigned int max_open_zones, max_active_zones;
	int ret;
	struct dm_device_zone_count zc = {
		.start = start,
		.len = len,
	};

	/*
	 * If the target is not the whole device, the device zone resources may
	 * be shared between different targets. Check this by counting the
	 * number of mapped sequential zones: if this number is smaller than the
	 * total number of sequential zones of the target device, then resource
	 * sharing may happen and the zone limits will not be reliable.
	 */
	ret = dm_device_count_zones(dev, &zc);
	if (ret) {
		DMERR("Count %s zones failed %d", disk->disk_name, ret);
		return ret;
	}

	/*
	 * If the target does not map any sequential zones, then we do not need
	 * any zone resource limits.
	 */
	if (!zc.target_nr_seq_zones)
		return 0;

	/*
	 * If the target does not map all sequential zones, the limits
	 * will not be reliable and we cannot use REQ_OP_ZONE_RESET_ALL.
	 */
	if (zc.target_nr_seq_zones < zc.total_nr_seq_zones) {
		zlim->reliable_limits = false;
		ti->zone_reset_all_supported = false;
	}

	/*
	 * If the target maps fewer sequential zones than the limit values,
	 * then we do not have limits for this target.
	 */
	max_active_zones = disk->queue->limits.max_active_zones;
	if (max_active_zones >= zc.target_nr_seq_zones)
		max_active_zones = 0;
	zlim->lim->max_active_zones =
		min_not_zero(max_active_zones, zlim->lim->max_active_zones);

	max_open_zones = disk->queue->limits.max_open_zones;
	if (max_open_zones >= zc.target_nr_seq_zones)
		max_open_zones = 0;
	zlim->lim->max_open_zones =
		min_not_zero(max_open_zones, zlim->lim->max_open_zones);

	/*
	 * Also count the total number of sequential zones for the mapped
	 * device so that when we are done inspecting all its targets, we are
	 * able to check if the mapped device actually has any sequential zones.
	 */
	zlim->mapped_nr_seq_zones += zc.target_nr_seq_zones;

	return 0;
}

int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
			      struct queue_limits *lim)
{
	struct mapped_device *md = t->md;
	struct gendisk *disk = md->disk;
	struct dm_zone_resource_limits zlim = {
		.reliable_limits = true,
		.lim = lim,
	};

	/*
	 * Check if zone append is natively supported, and if not, set the
	 * mapped device queue as needing zone append emulation.
	 * If zone append is natively supported, make sure that
	 * max_hw_zone_append_sectors is not set to 0.
	 */
	WARN_ON_ONCE(queue_is_mq(q));
	if (!dm_table_supports_zone_append(t))
		lim->max_hw_zone_append_sectors = 0;
	else if (lim->max_hw_zone_append_sectors == 0)
		lim->max_hw_zone_append_sectors = lim->max_zone_append_sectors;

	/*
	 * Determine the max open and max active zone limits for the mapped
	 * device by inspecting the zone resource limits and the zones mapped
	 * by each target.
	 */
	for (unsigned int i = 0; i < t->num_targets; i++) {
		struct dm_target *ti = dm_table_get_target(t, i);

		/*
		 * Assume that the target can accept REQ_OP_ZONE_RESET_ALL.
		 * device_get_zone_resource_limits() may adjust this if one of
		 * the devices used by the target does not have all its
		 * sequential write required zones mapped.
		 */
		ti->zone_reset_all_supported = true;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti,
				device_get_zone_resource_limits, &zlim)) {
			DMERR("Could not determine %s zone resource limits",
			      disk->disk_name);
			return -ENODEV;
		}
	}

	/*
	 * If we only have conventional zones mapped, expose the mapped device
	 * as a regular device.
	 */
	if (!zlim.mapped_nr_seq_zones) {
		lim->max_open_zones = 0;
		lim->max_active_zones = 0;
		lim->max_hw_zone_append_sectors = 0;
		lim->max_zone_append_sectors = 0;
		lim->zone_write_granularity = 0;
		lim->chunk_sectors = 0;
		lim->features &= ~BLK_FEAT_ZONED;
		return 0;
	}

	if (get_capacity(disk) && dm_has_zone_plugs(t->md)) {
		if (q->limits.chunk_sectors != lim->chunk_sectors) {
			DMWARN("%s: device has zone write plug resources. "
			       "Cannot change zone size",
			       disk->disk_name);
			return -EINVAL;
		}
		if (lim->max_hw_zone_append_sectors != 0 &&
		    !dm_table_is_wildcard(t)) {
			DMWARN("%s: device has zone write plug resources. "
			       "New table must emulate zone append",
			       disk->disk_name);
			return -EINVAL;
		}
	}

	/*
	 * Warn once (when the capacity is not yet set) if the mapped device is
	 * partially using zone resources of the target devices as that leads
	 * to unreliable limits, i.e. if another mapped device uses the same
	 * underlying devices, we cannot enforce zone limits to guarantee that
	 * writing will not lead to errors. Note that we really should return
	 * an error for such a case, but there is no easy way to find out if
	 * another mapped device uses the same underlying zoned devices.
	 */
	if (!get_capacity(disk) && !zlim.reliable_limits)
		DMWARN("%s zone resource limits may be unreliable",
		       disk->disk_name);

	if (lim->features & BLK_FEAT_ZONED &&
	    !static_key_enabled(&zoned_enabled.key))
		static_branch_enable(&zoned_enabled);

	return 0;
}

void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim)
{
	struct mapped_device *md = t->md;

	if (lim->features & BLK_FEAT_ZONED) {
		if (dm_table_supports_zone_append(t))
			clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
		else
			set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
	} else {
		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
		md->nr_zones = 0;
		md->disk->nr_zones = 0;
	}
}

/*
 * IO completion callback called from clone_endio().
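 *
 * For successfully completed REQ_OP_ZONE_APPEND clones, this propagates the
 * sector actually written by the device back into the original BIO.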
 */
void dm_zone_endio(struct dm_io *io, struct bio *clone)
{
	struct mapped_device *md = io->md;
	struct gendisk *disk = md->disk;
	struct bio *orig_bio = io->orig_bio;

	/*
	 * Get the offset within the zone of the written sector
	 * and add that to the original bio sector position.
	 */
	if (clone->bi_status == BLK_STS_OK &&
	    bio_op(clone) == REQ_OP_ZONE_APPEND) {
		orig_bio->bi_iter.bi_sector +=
			bdev_offset_from_zone_start(disk->part0,
						    clone->bi_iter.bi_sector);
	}
}

static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	/*
	 * For an all-zones reset, ignore conventional, empty, read-only
	 * and offline zones.
	 */
	switch (zone->cond) {
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_READONLY:
	case BLK_ZONE_COND_OFFLINE:
		return 0;
	default:
		set_bit(idx, (unsigned long *)data);
		return 0;
	}
}

int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
			     sector_t sector, unsigned int nr_zones,
			     unsigned long *need_reset)
{
	int ret;

	ret = dm_blk_do_report_zones(md, t, sector, nr_zones,
				     dm_zone_need_reset_cb, need_reset);
	if (ret != nr_zones) {
		DMERR("Get %s zone reset bitmap failed",
		      md->disk->disk_name);
		return -EIO;
	}

	return 0;
}