/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

kmem_cache_t *range_seg_cache;

void
range_tree_init(void)
{
	ASSERT(range_seg_cache == NULL);
	range_seg_cache = kmem_cache_create("range_seg_cache",
	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
range_tree_fini(void)
{
	kmem_cache_destroy(range_seg_cache);
	range_seg_cache = NULL;
}

void
range_tree_stat_verify(range_tree_t *rt)
{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
			    i, hist, hist[i], rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT(MUTEX_HELD(rt->rt_lock));
	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT(MUTEX_HELD(rt->rt_lock));
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

/*
 * NOTE: caller is responsible for all locking.
 */
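/*
 * Overlapping segments compare as equal, so an avl_find() with a search
 * segment that overlaps an existing entry locates that entry rather than
 * an insertion point. Disjoint segments are ordered by rs_start.
 */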
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;

	if (r1->rs_start < r2->rs_start) {
		if (r1->rs_end > r2->rs_start)
			return (0);
		return (-1);
	}
	if (r1->rs_start > r2->rs_start) {
		if (r1->rs_start < r2->rs_end)
			return (0);
		return (1);
	}
	return (0);
}

range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
{
	range_tree_t *rt;

	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	rt->rt_lock = lp;
	rt->rt_ops = ops;
	rt->rt_arg = arg;

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

void
range_tree_destroy(range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size;
	boolean_t merge_before, merge_after;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/* Make sure we don't overlap with either of our neighbors */
	VERIFY(rs == NULL);

	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL && rs_before->rs_end == start);
	merge_after = (rs_after != NULL && rs_after->rs_start == end);

	if (merge_before && merge_after) {
		/* New range bridges the gap; coalesce all three segments. */
		avl_remove(&rt->rt_root, rs_before);
		if (rt->rt_ops != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		range_tree_stat_decr(rt, rs_before);
		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = rs_before->rs_start;
		kmem_cache_free(range_seg_cache, rs_before);
		rs = rs_after;
	} else if (merge_before) {
		/* Extend the preceding segment. */
		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		range_tree_stat_decr(rt, rs_before);

		rs_before->rs_end = end;
		rs = rs_before;
	} else if (merge_after) {
		/* Extend the following segment. */
		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = start;
		rs = rs_after;
	} else {
		/* Not adjacent to anything; insert a new segment. */
		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
		rs->rs_start = start;
		rs->rs_end = end;
		avl_insert(&rt->rt_root, rs, where);
	}

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size;
}
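/*
 * Remove [start, start + size) from the tree. The range must lie entirely
 * within a single existing segment; removing it from the middle of a
 * segment splits that segment in two.
 */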
void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: freeing free segment "
		    "(offset=%llu size=%llu)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}
	VERIFY3U(rs->rs_start, <=, start);
	VERIFY3U(rs->rs_end, >=, end);

	left_over = (rs->rs_start != start);
	right_over = (rs->rs_end != end);

	range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
		newseg->rs_start = end;
		newseg->rs_end = rs->rs_end;
		range_tree_stat_incr(rt, newseg);

		rs->rs_end = start;

		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
	} else if (left_over) {
		rs->rs_end = start;
	} else if (right_over) {
		rs->rs_start = end;
	} else {
		avl_remove(&rt->rt_root, rs);
		kmem_cache_free(range_seg_cache, rs);
		rs = NULL;
	}

	if (rs != NULL) {
		range_tree_stat_incr(rt, rs);

		if (rt->rt_ops != NULL)
			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
	}

	rt->rt_space -= size;
}
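/*
 * range_tree_find_impl() returns any segment that overlaps the search
 * range (overlap compares as equal in the AVL tree); range_tree_find()
 * additionally requires that the segment contain the entire range.
 */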
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
	avl_index_t where;
	range_seg_t rsearch;
	uint64_t end = start + size;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, &where));
}

static range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs = range_tree_find_impl(rt, start, size);
	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
		return (rs);
	return (NULL);
}

void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
	range_seg_t *rs;

	mutex_enter(rt->rt_lock);
	rs = range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("freeing free block; rs=%p", (void *)rs);
	mutex_exit(rt->rt_lock);
}

boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (range_tree_find(rt, start, size) != NULL);
}

/*
 * Ensure that this range is not in the tree, regardless of whether
 * it is currently in the tree.
 */
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;

	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(rs->rs_start, start);
		uint64_t free_end = MIN(rs->rs_end, start + size);
		range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
	range_tree_t *rt;

	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;
	void *cookie = NULL;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	if (rt->rt_ops != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
		if (func != NULL)
			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
		kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

uint64_t
range_tree_space(range_tree_t *rt)
{
	return (rt->rt_space);
}
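/*
 * Example usage (a minimal sketch, not part of this file's interfaces):
 * the consumer supplies the lock, holds it around every operation except
 * range_tree_verify() (which takes it itself), and vacates the tree
 * before destroying it, since range_tree_destroy() verifies that
 * rt_space is zero.
 *
 *	kmutex_t lock;
 *	range_tree_t *rt;
 *
 *	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
 *	rt = range_tree_create(NULL, NULL, &lock);
 *
 *	mutex_enter(&lock);
 *	range_tree_add(rt, 0x1000, 0x2000);
 *	ASSERT(range_tree_contains(rt, 0x1800, 0x100));
 *	range_tree_remove(rt, 0x1800, 0x100);
 *	range_tree_vacate(rt, NULL, NULL);
 *	mutex_exit(&lock);
 *
 *	range_tree_destroy(rt);
 *	mutex_destroy(&lock);
 */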