1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 2013, 2014 by Delphix. All rights reserved. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/spa.h> 31 #include <sys/dmu.h> 32 #include <sys/dnode.h> 33 #include <sys/zio.h> 34 #include <sys/range_tree.h> 35 36 static kmem_cache_t *range_seg_cache; 37 38 void 39 range_tree_init(void) 40 { 41 ASSERT(range_seg_cache == NULL); 42 range_seg_cache = kmem_cache_create("range_seg_cache", 43 sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 44 } 45 46 void 47 range_tree_fini(void) 48 { 49 kmem_cache_destroy(range_seg_cache); 50 range_seg_cache = NULL; 51 } 52 53 void 54 range_tree_stat_verify(range_tree_t *rt) 55 { 56 range_seg_t *rs; 57 uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; 58 int i; 59 60 for (rs = avl_first(&rt->rt_root); rs != NULL; 61 rs = AVL_NEXT(&rt->rt_root, rs)) { 62 uint64_t size = rs->rs_end - rs->rs_start; 63 int idx = highbit64(size) - 1; 64 65 hist[idx]++; 66 ASSERT3U(hist[idx], !=, 0); 67 } 68 69 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { 70 if (hist[i] != rt->rt_histogram[i]) { 71 zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu", 72 i, hist, hist[i], rt->rt_histogram[i]); 73 } 74 VERIFY3U(hist[i], ==, rt->rt_histogram[i]); 75 } 76 } 77 78 static void 79 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) 80 { 81 uint64_t size = rs->rs_end - rs->rs_start; 82 int idx = highbit64(size) - 1; 83 84 ASSERT3U(idx, <, 85 sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); 86 87 ASSERT(MUTEX_HELD(rt->rt_lock)); 88 rt->rt_histogram[idx]++; 89 ASSERT3U(rt->rt_histogram[idx], !=, 0); 90 } 91 92 static void 93 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) 94 { 95 uint64_t size = rs->rs_end - rs->rs_start; 96 int idx = highbit64(size) - 1; 97 98 ASSERT3U(idx, <, 99 sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); 100 101 ASSERT(MUTEX_HELD(rt->rt_lock)); 102 ASSERT3U(rt->rt_histogram[idx], !=, 0); 103 rt->rt_histogram[idx]--; 104 } 105 106 /* 107 * NOTE: caller is responsible for all locking. 108 */ 109 static int 110 range_tree_seg_compare(const void *x1, const void *x2) 111 { 112 const range_seg_t *r1 = x1; 113 const range_seg_t *r2 = x2; 114 115 if (r1->rs_start < r2->rs_start) { 116 if (r1->rs_end > r2->rs_start) 117 return (0); 118 return (-1); 119 } 120 if (r1->rs_start > r2->rs_start) { 121 if (r1->rs_start < r2->rs_end) 122 return (0); 123 return (1); 124 } 125 return (0); 126 } 127 128 range_tree_t * 129 range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) 130 { 131 range_tree_t *rt; 132 133 rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); 134 135 avl_create(&rt->rt_root, range_tree_seg_compare, 136 sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); 137 138 rt->rt_lock = lp; 139 rt->rt_ops = ops; 140 rt->rt_arg = arg; 141 142 if (rt->rt_ops != NULL) 143 rt->rt_ops->rtop_create(rt, rt->rt_arg); 144 145 return (rt); 146 } 147 148 void 149 range_tree_destroy(range_tree_t *rt) 150 { 151 VERIFY0(rt->rt_space); 152 153 if (rt->rt_ops != NULL) 154 rt->rt_ops->rtop_destroy(rt, rt->rt_arg); 155 156 avl_destroy(&rt->rt_root); 157 kmem_free(rt, sizeof (*rt)); 158 } 159 160 void 161 range_tree_add(void *arg, uint64_t start, uint64_t size) 162 { 163 range_tree_t *rt = arg; 164 avl_index_t where; 165 range_seg_t rsearch, *rs_before, *rs_after, *rs; 166 uint64_t end = start + size; 167 boolean_t merge_before, merge_after; 168 169 ASSERT(MUTEX_HELD(rt->rt_lock)); 170 VERIFY(size != 0); 171 172 rsearch.rs_start = start; 173 rsearch.rs_end = end; 174 rs = avl_find(&rt->rt_root, &rsearch, &where); 175 176 if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { 177 zfs_panic_recover("zfs: allocating allocated segment" 178 "(offset=%llu size=%llu)\n", 179 (longlong_t)start, (longlong_t)size); 180 return; 181 } 182 183 /* Make sure we don't overlap with either of our neighbors */ 184 VERIFY(rs == NULL); 185 186 rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); 187 rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); 188 189 merge_before = (rs_before != NULL && rs_before->rs_end == start); 190 merge_after = (rs_after != NULL && rs_after->rs_start == end); 191 192 if (merge_before && merge_after) { 193 avl_remove(&rt->rt_root, rs_before); 194 if (rt->rt_ops != NULL) { 195 rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); 196 rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); 197 } 198 199 range_tree_stat_decr(rt, rs_before); 200 range_tree_stat_decr(rt, rs_after); 201 202 rs_after->rs_start = rs_before->rs_start; 203 kmem_cache_free(range_seg_cache, rs_before); 204 rs = rs_after; 205 } else if (merge_before) { 206 if (rt->rt_ops != NULL) 207 rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); 208 209 range_tree_stat_decr(rt, rs_before); 210 211 rs_before->rs_end = end; 212 rs = rs_before; 213 } else if (merge_after) { 214 if (rt->rt_ops != NULL) 215 rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); 216 217 range_tree_stat_decr(rt, rs_after); 218 219 rs_after->rs_start = start; 220 rs = rs_after; 221 } else { 222 rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); 223 rs->rs_start = start; 224 rs->rs_end = end; 225 avl_insert(&rt->rt_root, rs, where); 226 } 227 228 if (rt->rt_ops != NULL) 229 rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); 230 231 range_tree_stat_incr(rt, rs); 232 rt->rt_space += size; 233 } 234 235 void 236 range_tree_remove(void *arg, uint64_t start, uint64_t size) 237 { 238 range_tree_t *rt = arg; 239 avl_index_t where; 240 range_seg_t rsearch, *rs, *newseg; 241 uint64_t end = start + size; 242 boolean_t left_over, right_over; 243 244 ASSERT(MUTEX_HELD(rt->rt_lock)); 245 VERIFY3U(size, !=, 0); 246 VERIFY3U(size, <=, rt->rt_space); 247 248 rsearch.rs_start = start; 249 rsearch.rs_end = end; 250 rs = avl_find(&rt->rt_root, &rsearch, &where); 251 252 /* Make sure we completely overlap with someone */ 253 if (rs == NULL) { 254 zfs_panic_recover("zfs: freeing free segment " 255 "(offset=%llu size=%llu)", 256 (longlong_t)start, (longlong_t)size); 257 return; 258 } 259 VERIFY3U(rs->rs_start, <=, start); 260 VERIFY3U(rs->rs_end, >=, end); 261 262 left_over = (rs->rs_start != start); 263 right_over = (rs->rs_end != end); 264 265 range_tree_stat_decr(rt, rs); 266 267 if (rt->rt_ops != NULL) 268 rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); 269 270 if (left_over && right_over) { 271 newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); 272 newseg->rs_start = end; 273 newseg->rs_end = rs->rs_end; 274 range_tree_stat_incr(rt, newseg); 275 276 rs->rs_end = start; 277 278 avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); 279 if (rt->rt_ops != NULL) 280 rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); 281 } else if (left_over) { 282 rs->rs_end = start; 283 } else if (right_over) { 284 rs->rs_start = end; 285 } else { 286 avl_remove(&rt->rt_root, rs); 287 kmem_cache_free(range_seg_cache, rs); 288 rs = NULL; 289 } 290 291 if (rs != NULL) { 292 range_tree_stat_incr(rt, rs); 293 294 if (rt->rt_ops != NULL) 295 rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); 296 } 297 298 rt->rt_space -= size; 299 } 300 301 static range_seg_t * 302 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) 303 { 304 avl_index_t where; 305 range_seg_t rsearch; 306 uint64_t end = start + size; 307 308 ASSERT(MUTEX_HELD(rt->rt_lock)); 309 VERIFY(size != 0); 310 311 rsearch.rs_start = start; 312 rsearch.rs_end = end; 313 return (avl_find(&rt->rt_root, &rsearch, &where)); 314 } 315 316 static range_seg_t * 317 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) 318 { 319 range_seg_t *rs = range_tree_find_impl(rt, start, size); 320 if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) 321 return (rs); 322 return (NULL); 323 } 324 325 void 326 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) 327 { 328 range_seg_t *rs; 329 330 mutex_enter(rt->rt_lock); 331 rs = range_tree_find(rt, off, size); 332 if (rs != NULL) 333 panic("freeing free block; rs=%p", (void *)rs); 334 mutex_exit(rt->rt_lock); 335 } 336 337 boolean_t 338 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) 339 { 340 return (range_tree_find(rt, start, size) != NULL); 341 } 342 343 /* 344 * Ensure that this range is not in the tree, regardless of whether 345 * it is currently in the tree. 346 */ 347 void 348 range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) 349 { 350 range_seg_t *rs; 351 352 while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { 353 uint64_t free_start = MAX(rs->rs_start, start); 354 uint64_t free_end = MIN(rs->rs_end, start + size); 355 range_tree_remove(rt, free_start, free_end - free_start); 356 } 357 } 358 359 void 360 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) 361 { 362 range_tree_t *rt; 363 364 ASSERT(MUTEX_HELD((*rtsrc)->rt_lock)); 365 ASSERT0(range_tree_space(*rtdst)); 366 ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); 367 368 rt = *rtsrc; 369 *rtsrc = *rtdst; 370 *rtdst = rt; 371 } 372 373 void 374 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) 375 { 376 range_seg_t *rs; 377 void *cookie = NULL; 378 379 ASSERT(MUTEX_HELD(rt->rt_lock)); 380 381 if (rt->rt_ops != NULL) 382 rt->rt_ops->rtop_vacate(rt, rt->rt_arg); 383 384 while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { 385 if (func != NULL) 386 func(arg, rs->rs_start, rs->rs_end - rs->rs_start); 387 kmem_cache_free(range_seg_cache, rs); 388 } 389 390 bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); 391 rt->rt_space = 0; 392 } 393 394 void 395 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) 396 { 397 range_seg_t *rs; 398 399 ASSERT(MUTEX_HELD(rt->rt_lock)); 400 401 for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) 402 func(arg, rs->rs_start, rs->rs_end - rs->rs_start); 403 } 404 405 uint64_t 406 range_tree_space(range_tree_t *rt) 407 { 408 return (rt->rt_space); 409 } 410