xref: /titanic_41/usr/src/uts/common/avs/ns/dsw/ii_tree.c (revision 3270659f55e0928d6edec3d26217cc29398a8149)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ksynch.h>
28 #include <sys/kmem.h>
29 #include <sys/errno.h>
30 #include <sys/cmn_err.h>
31 #include <sys/debug.h>
32 #include <sys/cred.h>
33 #include <sys/file.h>
34 #include <sys/ddi.h>
35 #include <sys/nsctl/nsctl.h>
36 #include <sys/unistat/spcs_s.h>
37 #include <sys/unistat/spcs_errors.h>
38 
39 #include <sys/unistat/spcs_s_k.h>
40 #include "dsw.h"
41 #include "dsw_dev.h"
42 
43 #ifdef DS_DDICT
44 #include "../contract.h"
45 #endif
46 
47 #include <sys/sdt.h>		/* dtrace is S10 or later */
48 
49 /*
50  * Instant Image.
51  *
52  * This file contains the chunk map lookup functions of II.
53  *
54  */
/*
 * Shorthand for DSW_CHK2FBA(); the functions in this file use it to turn a
 * chunk id into the FBA handed to _ii_nsc_io().
 */
55 #define	CHUNK_FBA(chunk) DSW_CHK2FBA(chunk)
56 
57 extern int ii_debug;	/* debug level switch */
/* NOTE(review): not referenced by the functions in this part of the file */
58 int ii_map_debug = 0;
59 
/*
 * nodeid_t indexes a NODE in the on-disk map; node_io() divides it by
 * nodes_per_fba to locate the FBA holding the node.
 */
60 #ifdef II_MULTIMULTI_TERABYTE
61 typedef	int64_t	nodeid_t;
62 typedef	int32_t	nodeid32_t;
63 #else
64 typedef	int32_t	nodeid_t;
65 #endif
66 
/* One map entry, as read from and written to the bitmap volume by node_io() */
67 typedef struct	ii_node {
68 	chunkid_t	vchunk_id;		/* virtual chunk id */
69 } NODE;
70 
/*
 * Free-list link: written at the start of a chunk when it is freed and read
 * back when the chunk is reallocated (see free_node() and alloc_chunk()).
 */
71 typedef struct ii_nodelink_s {
72 	chunkid_t	next_chunk;
73 } ii_nodelink_t;
74 
/* Number of NODE entries that fit in a single FBA */
75 static	int	nodes_per_fba = FBA_SIZE(1) / sizeof (NODE);
76 
77 ii_header_t *_ii_bm_header_get(_ii_info_t *ip, nsc_buf_t **tmp);
78 int _ii_bm_header_put(ii_header_t *hdr, _ii_info_t *ip,
79     nsc_buf_t *tmp);
80 void _ii_rlse_devs(_ii_info_t *, int);
81 int _ii_rsrv_devs(_ii_info_t *, int, int);
82 void _ii_error(_ii_info_t *, int);
83 /*
84  * Private functions for use in this file.
85  */
86 static void free_node(_ii_info_t *ip, NODE *np, nodeid_t ni);
87 static chunkid_t ii_alloc_overflow(_ii_info_t *ip);
88 void ii_free_overflow(_ii_info_t *, chunkid_t);
89 extern int _ii_nsc_io(_ii_info_t *, int, nsc_fd_t *, int, nsc_off_t,
90     unsigned char *, nsc_size_t);
91 
92 static int
update_tree_header(_ii_info_t * ip)93 update_tree_header(_ii_info_t *ip)
94 {
95 	ii_header_t *header;
96 	nsc_buf_t	*tmp = NULL;
97 
98 	mutex_enter(&ip->bi_mutex);
99 	header = _ii_bm_header_get(ip, &tmp);
100 	if (header == NULL) {
101 		/* bitmap is probably offline */
102 		mutex_exit(&ip->bi_mutex);
103 		DTRACE_PROBE(_iit_update_tree_header_end);
104 		return (1);
105 	}
106 	header->ii_mstchks = ip->bi_mstchks;
107 	header->ii_shdchks = ip->bi_shdchks;
108 	header->ii_shdchkused = ip->bi_shdchkused;
109 	header->ii_shdfchk = ip->bi_shdfchk;
110 	(void) _ii_bm_header_put(header, ip, tmp);
111 	mutex_exit(&ip->bi_mutex);
112 
113 	return (0);
114 }
115 
116 static int
update_overflow_header(_ii_info_t * ip,_ii_overflow_t * op)117 update_overflow_header(_ii_info_t *ip, _ii_overflow_t *op)
118 {
119 	(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF,
120 	    II_OHEADER_FBA, (unsigned char *)&(op->ii_do),
121 	    sizeof (_ii_doverflow_t));
122 
123 	return (0);
124 }
125 
126 static int
node_io(_ii_info_t * ip,NODE * np,nodeid_t node,int flag)127 node_io(_ii_info_t *ip, NODE *np, nodeid_t node, int flag)
128 {
129 	int	rc;
130 	int	node_fba;
131 	int	tree_fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
132 	int	offset;
133 	nsc_buf_t *tmp = NULL;
134 
135 	/*
136 	 * Don't use _ii_nsc_io() as _ii_nsc_io() requires io to start at
137 	 * an fba boundary.
138 	 */
139 
140 	/* calculate location of node on bitmap file */
141 	offset = (node % nodes_per_fba) * sizeof (NODE);
142 	node_fba = tree_fba + node / nodes_per_fba;
143 
144 	/* read disk block containing node */
145 	rc = nsc_alloc_buf(ip->bi_bmpfd, node_fba, 1, NSC_RDBUF|flag, &tmp);
146 	if (!II_SUCCESS(rc)) {
147 		_ii_error(ip, DSW_BMPOFFLINE);
148 		if (tmp)
149 			(void) nsc_free_buf(tmp);
150 
151 		DTRACE_PROBE(_iit_node_io_end);
152 		return (1);
153 	}
154 
155 	/* copy node and update bitmap file if needed */
156 	rc = 0;
157 	if (flag == NSC_RDBUF)
158 		bcopy(tmp->sb_vec->sv_addr+offset, np, sizeof (NODE));
159 	else {
160 		bcopy(np, tmp->sb_vec->sv_addr+offset, sizeof (NODE));
161 		II_NSC_WRITE(ip, bitmap, rc, tmp, node_fba, 1, 0);
162 		if (!II_SUCCESS(rc)) {
163 			_ii_error(ip, DSW_BMPOFFLINE);
164 			rc = EIO;
165 		}
166 	}
167 	if (tmp)
168 		(void) nsc_free_buf(tmp);
169 
170 	return (0);
171 }
172 
173 static int
node_fba_fill(_ii_info_t * ip,nsc_size_t nchunks,chunkid_t vchunk_id)174 node_fba_fill(_ii_info_t *ip, nsc_size_t nchunks, chunkid_t vchunk_id)
175 {
176 	int	rc;
177 	nsc_off_t	fba;
178 	nsc_size_t	fbas;
179 	nsc_size_t	maxfbas;
180 	nsc_buf_t *bp;
181 	nsc_vec_t *vp;
182 
183 	/* Determine maximum number of FBAs to allocate */
184 	rc =  nsc_maxfbas(ip->bi_bmpfd, 0, &maxfbas);
185 	if (!II_SUCCESS(rc))
186 		maxfbas = DSW_CBLK_FBA;
187 
188 	/* Write out blocks of initialied NODEs */
189 	fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
190 	fbas = FBA_LEN(nchunks * sizeof (NODE));
191 	while (fbas > 0) {
192 
193 		/* Determine number of FBA to allocate this time */
194 		if (fbas < maxfbas) maxfbas = fbas;
195 
196 		/* Allocate buffer which map to FBAs containing NODEs */
197 		bp = NULL;
198 		rc = nsc_alloc_buf(ip->bi_bmpfd, fba, maxfbas, NSC_WRBUF, &bp);
199 		if (!II_SUCCESS(rc)) {
200 			_ii_error(ip, DSW_BMPOFFLINE);
201 			DTRACE_PROBE(alloc_buf_failed);
202 			return (EIO);
203 		}
204 
205 		/* traverse vector list, filling wth initialized NODEs */
206 		for (vp = bp->sb_vec; vp->sv_addr && vp->sv_len; vp++) {
207 			NODE *pnode = (NODE *)vp->sv_addr;
208 			NODE *enode = (NODE *)(vp->sv_addr +  vp->sv_len);
209 			while (pnode < enode) {
210 				pnode->vchunk_id = vchunk_id;
211 				pnode++;
212 			}
213 		}
214 
215 		/* write FBAs containing initialized NODEs */
216 		II_NSC_WRITE(ip, bitmap, rc, bp, fba, maxfbas, 0);
217 		if (!II_SUCCESS(rc)) {
218 			_ii_error(ip, DSW_BMPOFFLINE);
219 			(void) nsc_free_buf(bp);
220 			DTRACE_PROBE(write_failed);
221 			return (EIO);
222 		}
223 
224 		/* free the buffer */
225 		(void) nsc_free_buf(bp);
226 
227 		/* Adjust nsc buffer values */
228 		fba += maxfbas;
229 		fbas -= maxfbas;
230 	}
231 
232 	return (0);
233 }
234 
235 /*
236  * Reads the node into core and returns a pointer to it.
237  */
238 
239 static NODE *
read_node(_ii_info_t * ip,nodeid_t node)240 read_node(_ii_info_t *ip, nodeid_t node)
241 {
242 	NODE *new;
243 
244 	new = (NODE *)kmem_alloc(sizeof (NODE), KM_SLEEP);
245 
246 	if (node_io(ip, new, node, NSC_RDBUF)) {
247 		kmem_free(new, sizeof (NODE));
248 		new = NULL;
249 	}
250 
251 	return (new);
252 }
253 
254 
255 static chunkid_t
alloc_chunk(_ii_info_t * ip)256 alloc_chunk(_ii_info_t *ip)
257 {
258 	ii_nodelink_t nl;
259 	int fba;
260 	chunkid_t rc = II_NULLCHUNK;
261 
262 	mutex_enter(&ip->bi_chksmutex);
263 	if (ip->bi_shdchkused < ip->bi_shdchks) {
264 		rc = ip->bi_shdchkused++;
265 	} else if (ip->bi_shdfchk != II_NULLCHUNK) {
266 		ASSERT(ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks);
267 		rc = ip->bi_shdfchk;
268 		fba = CHUNK_FBA(rc);
269 		(void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
270 		(void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_RDBUF, fba,
271 		    (unsigned char *)&nl, sizeof (nl));
272 		_ii_rlse_devs(ip, SHDR);
273 		ip->bi_shdfchk = nl.next_chunk;
274 		ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
275 		    (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
276 	} else {
277 
278 		/* into overflow */
279 		rc = ii_alloc_overflow(ip);
280 	}
281 	mutex_exit(&ip->bi_chksmutex);
282 	(void) update_tree_header(ip);
283 
284 	return (rc);
285 }
286 
287 /*
288  * releases memory for node
289  */
290 static void	/*ARGSUSED*/
release_node(_ii_info_t * ip,NODE * np,nodeid_t ni)291 release_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
292 {
293 	kmem_free(np, sizeof (NODE));
294 
295 }
296 
297 static void
write_node(_ii_info_t * ip,NODE * np,nodeid_t ni)298 write_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
299 {
300 	(void) node_io(ip, np, ni, NSC_WRBUF);
301 	release_node(ip, np, ni);
302 
303 }
304 
305 static void
free_node(_ii_info_t * ip,NODE * np,nodeid_t ni)306 free_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
307 {
308 	ii_nodelink_t nl;
309 	int	fba;
310 
311 	if (np == NULL) {
312 		DTRACE_PROBE(_iit_free_node_end);
313 		return;
314 	}
315 
316 	mutex_enter(&ip->bi_chksmutex);
317 	if (II_ISOVERFLOW(np->vchunk_id)) {
318 		/* link chunk onto overflow free list */
319 		ii_free_overflow(ip, np->vchunk_id);
320 	} else {
321 		/* write old free list head into chunk */
322 		nl.next_chunk = ip->bi_shdfchk;
323 		ip->bi_shdfchk = np->vchunk_id;
324 		ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
325 		    (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
326 		fba = CHUNK_FBA(np->vchunk_id);
327 		(void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
328 		(void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_WRBUF, fba,
329 		    (unsigned char *)&nl, sizeof (nl));
330 		_ii_rlse_devs(ip, SHDR);
331 		/* update free counts */
332 		/* ip->bi_unused++; */
333 	}
334 	np->vchunk_id = II_NULLCHUNK;
335 	(void) node_io(ip, np, ni, NSC_WRBUF);
336 	(void) update_tree_header(ip);
337 	mutex_exit(&ip->bi_chksmutex);
338 
339 }
340 
341 /*
342  * Public functions for dsw_dev to use.
343  */
344 
345 /*
346  * Overflow volume functions.
347  */
348 
349 /* put overflow chunk on the overflow volume free list */
350 void
ii_free_overflow(_ii_info_t * ip,chunkid_t chunk)351 ii_free_overflow(_ii_info_t *ip, chunkid_t chunk)
352 {
353 	ii_nodelink_t nl;
354 	_ii_overflow_t *op;
355 	int fba;
356 
357 	if (!II_ISOVERFLOW(chunk)) {
358 		DTRACE_PROBE(_iit_free_overflow_end_1);
359 		return;
360 	}
361 	chunk = II_2OVERFLOW(chunk);
362 
363 	op = ip->bi_overflow;
364 	if (op == NULL) {
365 #ifdef DEBUG
366 		cmn_err(CE_PANIC, "overflow used, but not attached ip %p",
367 		    (void *) ip);
368 #endif
369 		DTRACE_PROBE(_iit_free_overflow_end_2);
370 		return;
371 	}
372 	mutex_enter(&(op->ii_mutex));
373 
374 	DTRACE_PROBE(_iit_free_overflow);
375 
376 	/* write old free list head into chunk */
377 	nl.next_chunk = op->ii_freehead;
378 	fba = CHUNK_FBA(chunk);
379 	(void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
380 	(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF, fba,
381 	    (unsigned char *)&nl, sizeof (nl));
382 	/* update free counts */
383 	op->ii_unused++;
384 	ASSERT(op->ii_used > 0);		/* always use 1 for header */
385 
386 	/* write chunk id into header freelist start */
387 	op->ii_freehead =  chunk;
388 
389 	(void) update_overflow_header(ip, op);
390 	nsc_release(op->ii_dev->bi_fd);
391 	mutex_exit(&(op->ii_mutex));
392 
393 }
394 
395 /* reclaim any overflow storage used by the volume */
396 void
ii_reclaim_overflow(_ii_info_t * ip)397 ii_reclaim_overflow(_ii_info_t *ip)
398 {
399 	NODE	*node;
400 	nodeid_t node_id;
401 	_ii_overflow_t *op;
402 
403 	if ((ip->bi_flags & (DSW_VOVERFLOW | DSW_FRECLAIM)) == 0) {
404 		DTRACE_PROBE(_iit_reclaim_overflow_end);
405 		return;
406 	}
407 
408 	/*
409 	 * Determine whether overflow should be reclaimed:
410 	 * 1/ If we're not doing a group volume update
411 	 * OR
412 	 * 2/ If the number of detaches != number of attached vols
413 	 */
414 	op = ip->bi_overflow;
415 	if (op && (((op->ii_flags & IIO_VOL_UPDATE) == 0) ||
416 	    (op->ii_detachcnt != op->ii_drefcnt))) {
417 #ifndef II_MULTIMULTI_TERABYTE
418 		/* assert volume size fits into node_id */
419 		ASSERT(ip->bi_mstchks <= INT32_MAX);
420 #endif
421 		for (node_id = 0; node_id < ip->bi_mstchks; node_id++) {
422 			if ((node = read_node(ip, node_id)) == NULL) {
423 				DTRACE_PROBE(_iit_reclaim_overflow_end);
424 				return;
425 			}
426 			ii_free_overflow(ip, node->vchunk_id);
427 			release_node(ip, node, node_id);
428 		}
429 	} else {
430 		/* need to reset the overflow volume header */
431 		op->ii_freehead = II_NULLNODE;
432 		op->ii_used = 1;		/* we have used the header */
433 		op->ii_unused = op->ii_nchunks - op->ii_used;
434 		(void) update_overflow_header(ip, op);
435 	}
436 
437 	DTRACE_PROBE(_iit_reclaim_overflow);
438 
439 	if ((ip->bi_flags & DSW_VOVERFLOW) == DSW_VOVERFLOW) {
440 		mutex_enter(&ip->bi_mutex);
441 		II_FLAG_CLR(DSW_VOVERFLOW, ip);
442 		mutex_exit(&ip->bi_mutex);
443 	}
444 	--iigkstat.spilled_over.value.ul;
445 
446 }
447 
448 static chunkid_t
ii_alloc_overflow(_ii_info_t * ip)449 ii_alloc_overflow(_ii_info_t *ip)
450 {
451 	chunkid_t chunk;
452 	ii_nodelink_t nl;
453 	_ii_overflow_t *op;
454 	int fba;
455 
456 	if ((op = ip->bi_overflow) == NULL) {
457 		DTRACE_PROBE(_iit_alloc_overflow_end);
458 		return (II_NULLCHUNK);	/* no overflow volume attached */
459 	}
460 
461 	mutex_enter(&(op->ii_mutex));
462 
463 	DTRACE_PROBE(_iit_alloc_overflow);
464 
465 	if (op->ii_unused < 1) {
466 		mutex_exit(&(op->ii_mutex));
467 		DTRACE_PROBE(_iit_alloc_overflow_end);
468 		return (II_NULLCHUNK);
469 	}
470 	(void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
471 	if (op->ii_freehead != II_NULLCHUNK) {
472 		/* pick first from free list */
473 		chunk = op->ii_freehead;
474 		fba = CHUNK_FBA(chunk);
475 		(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_RDBUF, fba,
476 		    (unsigned char *)&nl, sizeof (nl));
477 		op->ii_freehead = nl.next_chunk;
478 		/* decrease unused count, fix bug 4419956 */
479 		op->ii_unused--;
480 	} else {
481 		/* otherwise pick first unused */
482 		if (op->ii_used > op->ii_nchunks)
483 			chunk = II_NULLCHUNK;
484 		else {
485 			chunk = op->ii_used++;
486 			op->ii_unused--;
487 		}
488 	}
489 	if (chunk != II_NULLCHUNK) {
490 		chunk = II_2OVERFLOW(chunk);
491 		if ((ip->bi_flags&DSW_VOVERFLOW) == 0) {
492 			mutex_enter(&ip->bi_mutex);
493 			II_FLAG_SET(DSW_VOVERFLOW, ip);
494 			mutex_exit(&ip->bi_mutex);
495 			++iigkstat.spilled_over.value.ul;
496 		}
497 	}
498 	(void) update_overflow_header(ip, op);
499 	nsc_release(op->ii_dev->bi_fd);
500 	mutex_exit(&(op->ii_mutex));
501 
502 	return (chunk);
503 }
504 /*
505  * Find or insert key into search tree.
506  */
507 
508 chunkid_t
ii_tsearch(_ii_info_t * ip,chunkid_t chunk_id)509 ii_tsearch(_ii_info_t *ip, chunkid_t chunk_id)
510 			/* Address of the root of the tree */
511 {
512 	NODE	*rootp = NULL;
513 	chunkid_t n;	/* New node id if key not found */
514 
515 	if ((rootp = read_node(ip, chunk_id)) == NULL) {
516 		DTRACE_PROBE(_iit_tsearch_end);
517 		return (II_NULLNODE);
518 	}
519 	n = rootp->vchunk_id;
520 	if (n != II_NULLCHUNK) { /* chunk allocated, return location */
521 		release_node(ip, rootp, 0);
522 		DTRACE_PROBE(_iit_tsearch_end);
523 		return (n);
524 	}
525 	n = alloc_chunk(ip);
526 	if (n != II_NULLCHUNK) {
527 		rootp->vchunk_id = n;
528 		write_node(ip, rootp, chunk_id);
529 	} else
530 		release_node(ip, rootp, 0);
531 
532 	return (n);
533 }
534 
535 /* Delete node with key chunkid */
536 void
ii_tdelete(_ii_info_t * ip,chunkid_t chunkid)537 ii_tdelete(_ii_info_t *ip,
538 	chunkid_t chunkid)	/* Key to be deleted */
539 {
540 	NODE *np = NULL;
541 
542 	if ((np = read_node(ip, chunkid)) == NULL) {
543 		DTRACE_PROBE(_iit_tdelete_end);
544 		return;
545 	}
546 
547 	ASSERT(np->vchunk_id != II_NULLCHUNK);
548 	free_node(ip, np, chunkid);
549 	np->vchunk_id = II_NULLCHUNK;
550 	write_node(ip, np, chunkid);
551 
552 }
553 
554 /*
555  * initialise an empty map for ip
556  */
557 
558 int
ii_tinit(_ii_info_t * ip)559 ii_tinit(_ii_info_t *ip)
560 {
561 	int rc = 0;
562 
563 	/* overflow can't be attached before first call to this function */
564 	if (ip->bi_overflow)
565 		ii_reclaim_overflow(ip);
566 
567 	mutex_enter(&ip->bi_chksmutex);
568 	ip->bi_shdfchk = II_NULLCHUNK;	/* set freelist to empty chain */
569 	ip->bi_shdchkused = 0;
570 
571 	/* fill index (bi_mstchks size) with II_NULLCHUNK */
572 	rc = node_fba_fill(ip, ip->bi_mstchks, II_NULLCHUNK);
573 	if (rc == 0)
574 		rc = update_tree_header(ip);
575 	mutex_exit(&ip->bi_chksmutex);
576 
577 	return (rc);
578 }
579 
580 /*
581  * Calculate the size of map space provided by a bitmap volume with
582  * tree_len fba's spare for the tree.
583  */
584 
585 nsc_size_t
ii_btsize(nsc_size_t tree_len)586 ii_btsize(nsc_size_t tree_len)
587 {
588 	nsc_size_t nchunks;
589 
590 	nchunks = tree_len * nodes_per_fba;
591 
592 	if (ii_debug > 1)
593 		cmn_err(CE_NOTE,
594 		    "!ii_btsize: bitmap with %" NSC_SZFMT
595 		    " spare fba's will map %" NSC_SZFMT " chunks",
596 		    tree_len, nchunks);
597 
598 	return (nchunks);
599 }
600