1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio.h> 32 33 /* 34 * Virtual device read-ahead caching. 35 * 36 * This file implements a simple LRU read-ahead cache. When the DMU reads 37 * a given block, it will often want other, nearby blocks soon thereafter. 38 * We take advantage of this by reading a larger disk region and caching 39 * the result. In the best case, this can turn 256 back-to-back 512-byte 40 * reads into a single 128k read followed by 255 cache hits; this reduces 41 * latency dramatically. In the worst case, it can turn an isolated 512-byte 42 * read into a 128k read, which doesn't affect latency all that much but is 43 * terribly wasteful of bandwidth. A more intelligent version of the cache 44 * could keep track of access patterns and not do read-ahead unless it sees 45 * at least two temporally close I/Os to the same region. Currently, only 46 * metadata I/O is inflated. A futher enhancement could take advantage of 47 * more semantic information about the I/O. And it could use something 48 * faster than an AVL tree; that was chosen solely for convenience. 49 * 50 * There are five cache operations: allocate, fill, read, write, evict. 51 * 52 * (1) Allocate. This reserves a cache entry for the specified region. 53 * We separate the allocate and fill operations so that multiple threads 54 * don't generate I/O for the same cache miss. 55 * 56 * (2) Fill. When the I/O for a cache miss completes, the fill routine 57 * places the data in the previously allocated cache entry. 58 * 59 * (3) Read. Read data from the cache. 60 * 61 * (4) Write. Update cache contents after write completion. 62 * 63 * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry 64 * if the total cache size exceeds zfs_vdev_cache_size. 65 */ 66 67 /* 68 * These tunables are for performance analysis. 69 */ 70 /* 71 * All i/os smaller than zfs_vdev_cache_max will be turned into 72 * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software 73 * track buffer. At most zfs_vdev_cache_size bytes will be kept in each 74 * vdev's vdev_cache. 75 */ 76 int zfs_vdev_cache_max = 1<<14; 77 int zfs_vdev_cache_size = 10ULL << 20; 78 int zfs_vdev_cache_bshift = 16; 79 80 #define VCBS (1 << zfs_vdev_cache_bshift) 81 82 static int 83 vdev_cache_offset_compare(const void *a1, const void *a2) 84 { 85 const vdev_cache_entry_t *ve1 = a1; 86 const vdev_cache_entry_t *ve2 = a2; 87 88 if (ve1->ve_offset < ve2->ve_offset) 89 return (-1); 90 if (ve1->ve_offset > ve2->ve_offset) 91 return (1); 92 return (0); 93 } 94 95 static int 96 vdev_cache_lastused_compare(const void *a1, const void *a2) 97 { 98 const vdev_cache_entry_t *ve1 = a1; 99 const vdev_cache_entry_t *ve2 = a2; 100 101 if (ve1->ve_lastused < ve2->ve_lastused) 102 return (-1); 103 if (ve1->ve_lastused > ve2->ve_lastused) 104 return (1); 105 106 /* 107 * Among equally old entries, sort by offset to ensure uniqueness. 108 */ 109 return (vdev_cache_offset_compare(a1, a2)); 110 } 111 112 /* 113 * Evict the specified entry from the cache. 114 */ 115 static void 116 vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) 117 { 118 ASSERT(MUTEX_HELD(&vc->vc_lock)); 119 ASSERT(ve->ve_fill_io == NULL); 120 ASSERT(ve->ve_data != NULL); 121 122 dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n", 123 vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused, 124 ve->ve_hits, ve->ve_missed_update); 125 126 avl_remove(&vc->vc_lastused_tree, ve); 127 avl_remove(&vc->vc_offset_tree, ve); 128 zio_buf_free(ve->ve_data, VCBS); 129 kmem_free(ve, sizeof (vdev_cache_entry_t)); 130 } 131 132 /* 133 * Allocate an entry in the cache. At the point we don't have the data, 134 * we're just creating a placeholder so that multiple threads don't all 135 * go off and read the same blocks. 136 */ 137 static vdev_cache_entry_t * 138 vdev_cache_allocate(zio_t *zio) 139 { 140 vdev_cache_t *vc = &zio->io_vd->vdev_cache; 141 uint64_t offset = P2ALIGN(zio->io_offset, VCBS); 142 vdev_cache_entry_t *ve; 143 144 ASSERT(MUTEX_HELD(&vc->vc_lock)); 145 146 if (zfs_vdev_cache_size == 0) 147 return (NULL); 148 149 /* 150 * If adding a new entry would exceed the cache size, 151 * evict the oldest entry (LRU). 152 */ 153 if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > 154 zfs_vdev_cache_size) { 155 ve = avl_first(&vc->vc_lastused_tree); 156 if (ve->ve_fill_io != NULL) { 157 dprintf("can't evict in %p, still filling\n", vc); 158 return (NULL); 159 } 160 ASSERT(ve->ve_hits != 0); 161 vdev_cache_evict(vc, ve); 162 } 163 164 ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); 165 ve->ve_offset = offset; 166 ve->ve_lastused = lbolt; 167 ve->ve_data = zio_buf_alloc(VCBS); 168 169 avl_add(&vc->vc_offset_tree, ve); 170 avl_add(&vc->vc_lastused_tree, ve); 171 172 return (ve); 173 } 174 175 static void 176 vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) 177 { 178 uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 179 180 ASSERT(MUTEX_HELD(&vc->vc_lock)); 181 ASSERT(ve->ve_fill_io == NULL); 182 183 if (ve->ve_lastused != lbolt) { 184 avl_remove(&vc->vc_lastused_tree, ve); 185 ve->ve_lastused = lbolt; 186 avl_add(&vc->vc_lastused_tree, ve); 187 } 188 189 ve->ve_hits++; 190 bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); 191 } 192 193 /* 194 * Fill a previously allocated cache entry with data. 195 */ 196 static void 197 vdev_cache_fill(zio_t *zio) 198 { 199 vdev_t *vd = zio->io_vd; 200 vdev_cache_t *vc = &vd->vdev_cache; 201 vdev_cache_entry_t *ve = zio->io_private; 202 zio_t *dio; 203 204 ASSERT(zio->io_size == VCBS); 205 206 /* 207 * Add data to the cache. 208 */ 209 mutex_enter(&vc->vc_lock); 210 211 ASSERT(ve->ve_fill_io == zio); 212 ASSERT(ve->ve_offset == zio->io_offset); 213 ASSERT(ve->ve_data == zio->io_data); 214 215 ve->ve_fill_io = NULL; 216 217 /* 218 * Even if this cache line was invalidated by a missed write update, 219 * any reads that were queued up before the missed update are still 220 * valid, so we can satisfy them from this line before we evict it. 221 */ 222 for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) 223 vdev_cache_hit(vc, ve, dio); 224 225 if (zio->io_error || ve->ve_missed_update) 226 vdev_cache_evict(vc, ve); 227 228 mutex_exit(&vc->vc_lock); 229 230 while ((dio = zio->io_delegate_list) != NULL) { 231 zio->io_delegate_list = dio->io_delegate_next; 232 dio->io_delegate_next = NULL; 233 dio->io_error = zio->io_error; 234 zio_next_stage(dio); 235 } 236 } 237 238 /* 239 * Read data from the cache. Returns 0 on cache hit, errno on a miss. 240 */ 241 int 242 vdev_cache_read(zio_t *zio) 243 { 244 vdev_cache_t *vc = &zio->io_vd->vdev_cache; 245 vdev_cache_entry_t *ve, ve_search; 246 uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); 247 uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 248 zio_t *fio; 249 250 ASSERT(zio->io_type == ZIO_TYPE_READ); 251 252 if (zio->io_flags & ZIO_FLAG_DONT_CACHE) 253 return (EINVAL); 254 255 if (zio->io_size > zfs_vdev_cache_max) 256 return (EOVERFLOW); 257 258 /* 259 * If the I/O straddles two or more cache blocks, don't cache it. 260 */ 261 if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS)) 262 return (EXDEV); 263 264 ASSERT(cache_phase + zio->io_size <= VCBS); 265 266 mutex_enter(&vc->vc_lock); 267 268 ve_search.ve_offset = cache_offset; 269 ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); 270 271 if (ve != NULL) { 272 if (ve->ve_missed_update) { 273 mutex_exit(&vc->vc_lock); 274 return (ESTALE); 275 } 276 277 if ((fio = ve->ve_fill_io) != NULL) { 278 zio->io_delegate_next = fio->io_delegate_list; 279 fio->io_delegate_list = zio; 280 zio_vdev_io_bypass(zio); 281 mutex_exit(&vc->vc_lock); 282 return (0); 283 } 284 285 vdev_cache_hit(vc, ve, zio); 286 zio_vdev_io_bypass(zio); 287 288 mutex_exit(&vc->vc_lock); 289 zio_next_stage(zio); 290 return (0); 291 } 292 293 if (!(zio->io_flags & ZIO_FLAG_METADATA)) { 294 mutex_exit(&vc->vc_lock); 295 return (EINVAL); 296 } 297 298 ve = vdev_cache_allocate(zio); 299 300 if (ve == NULL) { 301 mutex_exit(&vc->vc_lock); 302 return (ENOMEM); 303 } 304 305 fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, 306 ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, 307 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | 308 ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, 309 vdev_cache_fill, ve); 310 311 ve->ve_fill_io = fio; 312 fio->io_delegate_list = zio; 313 zio_vdev_io_bypass(zio); 314 315 mutex_exit(&vc->vc_lock); 316 zio_nowait(fio); 317 318 return (0); 319 } 320 321 /* 322 * Update cache contents upon write completion. 323 */ 324 void 325 vdev_cache_write(zio_t *zio) 326 { 327 vdev_cache_t *vc = &zio->io_vd->vdev_cache; 328 vdev_cache_entry_t *ve, ve_search; 329 uint64_t io_start = zio->io_offset; 330 uint64_t io_end = io_start + zio->io_size; 331 uint64_t min_offset = P2ALIGN(io_start, VCBS); 332 uint64_t max_offset = P2ROUNDUP(io_end, VCBS); 333 avl_index_t where; 334 335 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 336 337 mutex_enter(&vc->vc_lock); 338 339 ve_search.ve_offset = min_offset; 340 ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); 341 342 if (ve == NULL) 343 ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); 344 345 while (ve != NULL && ve->ve_offset < max_offset) { 346 uint64_t start = MAX(ve->ve_offset, io_start); 347 uint64_t end = MIN(ve->ve_offset + VCBS, io_end); 348 349 if (ve->ve_fill_io != NULL) { 350 ve->ve_missed_update = 1; 351 } else { 352 bcopy((char *)zio->io_data + start - io_start, 353 ve->ve_data + start - ve->ve_offset, end - start); 354 } 355 ve = AVL_NEXT(&vc->vc_offset_tree, ve); 356 } 357 mutex_exit(&vc->vc_lock); 358 } 359 360 void 361 vdev_cache_purge(vdev_t *vd) 362 { 363 vdev_cache_t *vc = &vd->vdev_cache; 364 vdev_cache_entry_t *ve; 365 366 mutex_enter(&vc->vc_lock); 367 while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) 368 vdev_cache_evict(vc, ve); 369 mutex_exit(&vc->vc_lock); 370 } 371 372 void 373 vdev_cache_init(vdev_t *vd) 374 { 375 vdev_cache_t *vc = &vd->vdev_cache; 376 377 mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); 378 379 avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, 380 sizeof (vdev_cache_entry_t), 381 offsetof(struct vdev_cache_entry, ve_offset_node)); 382 383 avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, 384 sizeof (vdev_cache_entry_t), 385 offsetof(struct vdev_cache_entry, ve_lastused_node)); 386 } 387 388 void 389 vdev_cache_fini(vdev_t *vd) 390 { 391 vdev_cache_t *vc = &vd->vdev_cache; 392 393 vdev_cache_purge(vd); 394 395 avl_destroy(&vc->vc_offset_tree); 396 avl_destroy(&vc->vc_lastused_tree); 397 398 mutex_destroy(&vc->vc_lock); 399 } 400