1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/types.h>
27 #include <sys/ksynch.h>
28 #include <sys/kmem.h>
29 #include <sys/errno.h>
30 #include <sys/cmn_err.h>
31 #include <sys/debug.h>
32 #include <sys/cred.h>
33 #include <sys/file.h>
34 #include <sys/ddi.h>
35 #include <sys/nsctl/nsctl.h>
36 #include <sys/unistat/spcs_s.h>
37 #include <sys/unistat/spcs_errors.h>
38
39 #include <sys/unistat/spcs_s_k.h>
40 #include "dsw.h"
41 #include "dsw_dev.h"
42
43 #ifdef DS_DDICT
44 #include "../contract.h"
45 #endif
46
47 #include <sys/sdt.h> /* dtrace is S10 or later */
48
49 /*
50 * Instant Image.
51 *
52 * This file contains the chunk map lookup functions of II.
53 *
54 */
/* Local shorthand for DSW_CHK2FBA(); used below to locate a chunk by FBA. */
#define	CHUNK_FBA(chunk)	DSW_CHK2FBA(chunk)

extern int ii_debug;		/* debug level switch, defined elsewhere */
int ii_map_debug = 0;		/* map debug switch; unused in this file's code */
59
60 #ifdef II_MULTIMULTI_TERABYTE
61 typedef int64_t nodeid_t;
62 typedef int32_t nodeid32_t;
63 #else
64 typedef int32_t nodeid_t;
65 #endif
66
67 typedef struct ii_node {
68 chunkid_t vchunk_id; /* virtual chunk id */
69 } NODE;
70
71 typedef struct ii_nodelink_s {
72 chunkid_t next_chunk;
73 } ii_nodelink_t;
74
75 static int nodes_per_fba = FBA_SIZE(1) / sizeof (NODE);
76
77 ii_header_t *_ii_bm_header_get(_ii_info_t *ip, nsc_buf_t **tmp);
78 int _ii_bm_header_put(ii_header_t *hdr, _ii_info_t *ip,
79 nsc_buf_t *tmp);
80 void _ii_rlse_devs(_ii_info_t *, int);
81 int _ii_rsrv_devs(_ii_info_t *, int, int);
82 void _ii_error(_ii_info_t *, int);
83 /*
84 * Private functions for use in this file.
85 */
86 static void free_node(_ii_info_t *ip, NODE *np, nodeid_t ni);
87 static chunkid_t ii_alloc_overflow(_ii_info_t *ip);
88 void ii_free_overflow(_ii_info_t *, chunkid_t);
89 extern int _ii_nsc_io(_ii_info_t *, int, nsc_fd_t *, int, nsc_off_t,
90 unsigned char *, nsc_size_t);
91
92 static int
update_tree_header(_ii_info_t * ip)93 update_tree_header(_ii_info_t *ip)
94 {
95 ii_header_t *header;
96 nsc_buf_t *tmp = NULL;
97
98 mutex_enter(&ip->bi_mutex);
99 header = _ii_bm_header_get(ip, &tmp);
100 if (header == NULL) {
101 /* bitmap is probably offline */
102 mutex_exit(&ip->bi_mutex);
103 DTRACE_PROBE(_iit_update_tree_header_end);
104 return (1);
105 }
106 header->ii_mstchks = ip->bi_mstchks;
107 header->ii_shdchks = ip->bi_shdchks;
108 header->ii_shdchkused = ip->bi_shdchkused;
109 header->ii_shdfchk = ip->bi_shdfchk;
110 (void) _ii_bm_header_put(header, ip, tmp);
111 mutex_exit(&ip->bi_mutex);
112
113 return (0);
114 }
115
116 static int
update_overflow_header(_ii_info_t * ip,_ii_overflow_t * op)117 update_overflow_header(_ii_info_t *ip, _ii_overflow_t *op)
118 {
119 (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF,
120 II_OHEADER_FBA, (unsigned char *)&(op->ii_do),
121 sizeof (_ii_doverflow_t));
122
123 return (0);
124 }
125
126 static int
node_io(_ii_info_t * ip,NODE * np,nodeid_t node,int flag)127 node_io(_ii_info_t *ip, NODE *np, nodeid_t node, int flag)
128 {
129 int rc;
130 int node_fba;
131 int tree_fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
132 int offset;
133 nsc_buf_t *tmp = NULL;
134
135 /*
136 * Don't use _ii_nsc_io() as _ii_nsc_io() requires io to start at
137 * an fba boundary.
138 */
139
140 /* calculate location of node on bitmap file */
141 offset = (node % nodes_per_fba) * sizeof (NODE);
142 node_fba = tree_fba + node / nodes_per_fba;
143
144 /* read disk block containing node */
145 rc = nsc_alloc_buf(ip->bi_bmpfd, node_fba, 1, NSC_RDBUF|flag, &tmp);
146 if (!II_SUCCESS(rc)) {
147 _ii_error(ip, DSW_BMPOFFLINE);
148 if (tmp)
149 (void) nsc_free_buf(tmp);
150
151 DTRACE_PROBE(_iit_node_io_end);
152 return (1);
153 }
154
155 /* copy node and update bitmap file if needed */
156 rc = 0;
157 if (flag == NSC_RDBUF)
158 bcopy(tmp->sb_vec->sv_addr+offset, np, sizeof (NODE));
159 else {
160 bcopy(np, tmp->sb_vec->sv_addr+offset, sizeof (NODE));
161 II_NSC_WRITE(ip, bitmap, rc, tmp, node_fba, 1, 0);
162 if (!II_SUCCESS(rc)) {
163 _ii_error(ip, DSW_BMPOFFLINE);
164 rc = EIO;
165 }
166 }
167 if (tmp)
168 (void) nsc_free_buf(tmp);
169
170 return (0);
171 }
172
173 static int
node_fba_fill(_ii_info_t * ip,nsc_size_t nchunks,chunkid_t vchunk_id)174 node_fba_fill(_ii_info_t *ip, nsc_size_t nchunks, chunkid_t vchunk_id)
175 {
176 int rc;
177 nsc_off_t fba;
178 nsc_size_t fbas;
179 nsc_size_t maxfbas;
180 nsc_buf_t *bp;
181 nsc_vec_t *vp;
182
183 /* Determine maximum number of FBAs to allocate */
184 rc = nsc_maxfbas(ip->bi_bmpfd, 0, &maxfbas);
185 if (!II_SUCCESS(rc))
186 maxfbas = DSW_CBLK_FBA;
187
188 /* Write out blocks of initialied NODEs */
189 fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
190 fbas = FBA_LEN(nchunks * sizeof (NODE));
191 while (fbas > 0) {
192
193 /* Determine number of FBA to allocate this time */
194 if (fbas < maxfbas) maxfbas = fbas;
195
196 /* Allocate buffer which map to FBAs containing NODEs */
197 bp = NULL;
198 rc = nsc_alloc_buf(ip->bi_bmpfd, fba, maxfbas, NSC_WRBUF, &bp);
199 if (!II_SUCCESS(rc)) {
200 _ii_error(ip, DSW_BMPOFFLINE);
201 DTRACE_PROBE(alloc_buf_failed);
202 return (EIO);
203 }
204
205 /* traverse vector list, filling wth initialized NODEs */
206 for (vp = bp->sb_vec; vp->sv_addr && vp->sv_len; vp++) {
207 NODE *pnode = (NODE *)vp->sv_addr;
208 NODE *enode = (NODE *)(vp->sv_addr + vp->sv_len);
209 while (pnode < enode) {
210 pnode->vchunk_id = vchunk_id;
211 pnode++;
212 }
213 }
214
215 /* write FBAs containing initialized NODEs */
216 II_NSC_WRITE(ip, bitmap, rc, bp, fba, maxfbas, 0);
217 if (!II_SUCCESS(rc)) {
218 _ii_error(ip, DSW_BMPOFFLINE);
219 (void) nsc_free_buf(bp);
220 DTRACE_PROBE(write_failed);
221 return (EIO);
222 }
223
224 /* free the buffer */
225 (void) nsc_free_buf(bp);
226
227 /* Adjust nsc buffer values */
228 fba += maxfbas;
229 fbas -= maxfbas;
230 }
231
232 return (0);
233 }
234
235 /*
236 * Reads the node into core and returns a pointer to it.
237 */
238
239 static NODE *
read_node(_ii_info_t * ip,nodeid_t node)240 read_node(_ii_info_t *ip, nodeid_t node)
241 {
242 NODE *new;
243
244 new = (NODE *)kmem_alloc(sizeof (NODE), KM_SLEEP);
245
246 if (node_io(ip, new, node, NSC_RDBUF)) {
247 kmem_free(new, sizeof (NODE));
248 new = NULL;
249 }
250
251 return (new);
252 }
253
254
255 static chunkid_t
alloc_chunk(_ii_info_t * ip)256 alloc_chunk(_ii_info_t *ip)
257 {
258 ii_nodelink_t nl;
259 int fba;
260 chunkid_t rc = II_NULLCHUNK;
261
262 mutex_enter(&ip->bi_chksmutex);
263 if (ip->bi_shdchkused < ip->bi_shdchks) {
264 rc = ip->bi_shdchkused++;
265 } else if (ip->bi_shdfchk != II_NULLCHUNK) {
266 ASSERT(ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks);
267 rc = ip->bi_shdfchk;
268 fba = CHUNK_FBA(rc);
269 (void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
270 (void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_RDBUF, fba,
271 (unsigned char *)&nl, sizeof (nl));
272 _ii_rlse_devs(ip, SHDR);
273 ip->bi_shdfchk = nl.next_chunk;
274 ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
275 (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
276 } else {
277
278 /* into overflow */
279 rc = ii_alloc_overflow(ip);
280 }
281 mutex_exit(&ip->bi_chksmutex);
282 (void) update_tree_header(ip);
283
284 return (rc);
285 }
286
287 /*
288 * releases memory for node
289 */
290 static void /*ARGSUSED*/
release_node(_ii_info_t * ip,NODE * np,nodeid_t ni)291 release_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
292 {
293 kmem_free(np, sizeof (NODE));
294
295 }
296
297 static void
write_node(_ii_info_t * ip,NODE * np,nodeid_t ni)298 write_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
299 {
300 (void) node_io(ip, np, ni, NSC_WRBUF);
301 release_node(ip, np, ni);
302
303 }
304
305 static void
free_node(_ii_info_t * ip,NODE * np,nodeid_t ni)306 free_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
307 {
308 ii_nodelink_t nl;
309 int fba;
310
311 if (np == NULL) {
312 DTRACE_PROBE(_iit_free_node_end);
313 return;
314 }
315
316 mutex_enter(&ip->bi_chksmutex);
317 if (II_ISOVERFLOW(np->vchunk_id)) {
318 /* link chunk onto overflow free list */
319 ii_free_overflow(ip, np->vchunk_id);
320 } else {
321 /* write old free list head into chunk */
322 nl.next_chunk = ip->bi_shdfchk;
323 ip->bi_shdfchk = np->vchunk_id;
324 ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
325 (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
326 fba = CHUNK_FBA(np->vchunk_id);
327 (void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
328 (void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_WRBUF, fba,
329 (unsigned char *)&nl, sizeof (nl));
330 _ii_rlse_devs(ip, SHDR);
331 /* update free counts */
332 /* ip->bi_unused++; */
333 }
334 np->vchunk_id = II_NULLCHUNK;
335 (void) node_io(ip, np, ni, NSC_WRBUF);
336 (void) update_tree_header(ip);
337 mutex_exit(&ip->bi_chksmutex);
338
339 }
340
341 /*
342 * Public functions for dsw_dev to use.
343 */
344
345 /*
346 * Overflow volume functions.
347 */
348
349 /* put overflow chunk on the overflow volume free list */
350 void
ii_free_overflow(_ii_info_t * ip,chunkid_t chunk)351 ii_free_overflow(_ii_info_t *ip, chunkid_t chunk)
352 {
353 ii_nodelink_t nl;
354 _ii_overflow_t *op;
355 int fba;
356
357 if (!II_ISOVERFLOW(chunk)) {
358 DTRACE_PROBE(_iit_free_overflow_end_1);
359 return;
360 }
361 chunk = II_2OVERFLOW(chunk);
362
363 op = ip->bi_overflow;
364 if (op == NULL) {
365 #ifdef DEBUG
366 cmn_err(CE_PANIC, "overflow used, but not attached ip %p",
367 (void *) ip);
368 #endif
369 DTRACE_PROBE(_iit_free_overflow_end_2);
370 return;
371 }
372 mutex_enter(&(op->ii_mutex));
373
374 DTRACE_PROBE(_iit_free_overflow);
375
376 /* write old free list head into chunk */
377 nl.next_chunk = op->ii_freehead;
378 fba = CHUNK_FBA(chunk);
379 (void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
380 (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF, fba,
381 (unsigned char *)&nl, sizeof (nl));
382 /* update free counts */
383 op->ii_unused++;
384 ASSERT(op->ii_used > 0); /* always use 1 for header */
385
386 /* write chunk id into header freelist start */
387 op->ii_freehead = chunk;
388
389 (void) update_overflow_header(ip, op);
390 nsc_release(op->ii_dev->bi_fd);
391 mutex_exit(&(op->ii_mutex));
392
393 }
394
395 /* reclaim any overflow storage used by the volume */
396 void
ii_reclaim_overflow(_ii_info_t * ip)397 ii_reclaim_overflow(_ii_info_t *ip)
398 {
399 NODE *node;
400 nodeid_t node_id;
401 _ii_overflow_t *op;
402
403 if ((ip->bi_flags & (DSW_VOVERFLOW | DSW_FRECLAIM)) == 0) {
404 DTRACE_PROBE(_iit_reclaim_overflow_end);
405 return;
406 }
407
408 /*
409 * Determine whether overflow should be reclaimed:
410 * 1/ If we're not doing a group volume update
411 * OR
412 * 2/ If the number of detaches != number of attached vols
413 */
414 op = ip->bi_overflow;
415 if (op && (((op->ii_flags & IIO_VOL_UPDATE) == 0) ||
416 (op->ii_detachcnt != op->ii_drefcnt))) {
417 #ifndef II_MULTIMULTI_TERABYTE
418 /* assert volume size fits into node_id */
419 ASSERT(ip->bi_mstchks <= INT32_MAX);
420 #endif
421 for (node_id = 0; node_id < ip->bi_mstchks; node_id++) {
422 if ((node = read_node(ip, node_id)) == NULL) {
423 DTRACE_PROBE(_iit_reclaim_overflow_end);
424 return;
425 }
426 ii_free_overflow(ip, node->vchunk_id);
427 release_node(ip, node, node_id);
428 }
429 } else {
430 /* need to reset the overflow volume header */
431 op->ii_freehead = II_NULLNODE;
432 op->ii_used = 1; /* we have used the header */
433 op->ii_unused = op->ii_nchunks - op->ii_used;
434 (void) update_overflow_header(ip, op);
435 }
436
437 DTRACE_PROBE(_iit_reclaim_overflow);
438
439 if ((ip->bi_flags & DSW_VOVERFLOW) == DSW_VOVERFLOW) {
440 mutex_enter(&ip->bi_mutex);
441 II_FLAG_CLR(DSW_VOVERFLOW, ip);
442 mutex_exit(&ip->bi_mutex);
443 }
444 --iigkstat.spilled_over.value.ul;
445
446 }
447
448 static chunkid_t
ii_alloc_overflow(_ii_info_t * ip)449 ii_alloc_overflow(_ii_info_t *ip)
450 {
451 chunkid_t chunk;
452 ii_nodelink_t nl;
453 _ii_overflow_t *op;
454 int fba;
455
456 if ((op = ip->bi_overflow) == NULL) {
457 DTRACE_PROBE(_iit_alloc_overflow_end);
458 return (II_NULLCHUNK); /* no overflow volume attached */
459 }
460
461 mutex_enter(&(op->ii_mutex));
462
463 DTRACE_PROBE(_iit_alloc_overflow);
464
465 if (op->ii_unused < 1) {
466 mutex_exit(&(op->ii_mutex));
467 DTRACE_PROBE(_iit_alloc_overflow_end);
468 return (II_NULLCHUNK);
469 }
470 (void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
471 if (op->ii_freehead != II_NULLCHUNK) {
472 /* pick first from free list */
473 chunk = op->ii_freehead;
474 fba = CHUNK_FBA(chunk);
475 (void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_RDBUF, fba,
476 (unsigned char *)&nl, sizeof (nl));
477 op->ii_freehead = nl.next_chunk;
478 /* decrease unused count, fix bug 4419956 */
479 op->ii_unused--;
480 } else {
481 /* otherwise pick first unused */
482 if (op->ii_used > op->ii_nchunks)
483 chunk = II_NULLCHUNK;
484 else {
485 chunk = op->ii_used++;
486 op->ii_unused--;
487 }
488 }
489 if (chunk != II_NULLCHUNK) {
490 chunk = II_2OVERFLOW(chunk);
491 if ((ip->bi_flags&DSW_VOVERFLOW) == 0) {
492 mutex_enter(&ip->bi_mutex);
493 II_FLAG_SET(DSW_VOVERFLOW, ip);
494 mutex_exit(&ip->bi_mutex);
495 ++iigkstat.spilled_over.value.ul;
496 }
497 }
498 (void) update_overflow_header(ip, op);
499 nsc_release(op->ii_dev->bi_fd);
500 mutex_exit(&(op->ii_mutex));
501
502 return (chunk);
503 }
504 /*
505 * Find or insert key into search tree.
506 */
507
508 chunkid_t
ii_tsearch(_ii_info_t * ip,chunkid_t chunk_id)509 ii_tsearch(_ii_info_t *ip, chunkid_t chunk_id)
510 /* Address of the root of the tree */
511 {
512 NODE *rootp = NULL;
513 chunkid_t n; /* New node id if key not found */
514
515 if ((rootp = read_node(ip, chunk_id)) == NULL) {
516 DTRACE_PROBE(_iit_tsearch_end);
517 return (II_NULLNODE);
518 }
519 n = rootp->vchunk_id;
520 if (n != II_NULLCHUNK) { /* chunk allocated, return location */
521 release_node(ip, rootp, 0);
522 DTRACE_PROBE(_iit_tsearch_end);
523 return (n);
524 }
525 n = alloc_chunk(ip);
526 if (n != II_NULLCHUNK) {
527 rootp->vchunk_id = n;
528 write_node(ip, rootp, chunk_id);
529 } else
530 release_node(ip, rootp, 0);
531
532 return (n);
533 }
534
535 /* Delete node with key chunkid */
536 void
ii_tdelete(_ii_info_t * ip,chunkid_t chunkid)537 ii_tdelete(_ii_info_t *ip,
538 chunkid_t chunkid) /* Key to be deleted */
539 {
540 NODE *np = NULL;
541
542 if ((np = read_node(ip, chunkid)) == NULL) {
543 DTRACE_PROBE(_iit_tdelete_end);
544 return;
545 }
546
547 ASSERT(np->vchunk_id != II_NULLCHUNK);
548 free_node(ip, np, chunkid);
549 np->vchunk_id = II_NULLCHUNK;
550 write_node(ip, np, chunkid);
551
552 }
553
554 /*
555 * initialise an empty map for ip
556 */
557
558 int
ii_tinit(_ii_info_t * ip)559 ii_tinit(_ii_info_t *ip)
560 {
561 int rc = 0;
562
563 /* overflow can't be attached before first call to this function */
564 if (ip->bi_overflow)
565 ii_reclaim_overflow(ip);
566
567 mutex_enter(&ip->bi_chksmutex);
568 ip->bi_shdfchk = II_NULLCHUNK; /* set freelist to empty chain */
569 ip->bi_shdchkused = 0;
570
571 /* fill index (bi_mstchks size) with II_NULLCHUNK */
572 rc = node_fba_fill(ip, ip->bi_mstchks, II_NULLCHUNK);
573 if (rc == 0)
574 rc = update_tree_header(ip);
575 mutex_exit(&ip->bi_chksmutex);
576
577 return (rc);
578 }
579
580 /*
581 * Calculate the size of map space provided by a bitmap volume with
582 * tree_len fba's spare for the tree.
583 */
584
585 nsc_size_t
ii_btsize(nsc_size_t tree_len)586 ii_btsize(nsc_size_t tree_len)
587 {
588 nsc_size_t nchunks;
589
590 nchunks = tree_len * nodes_per_fba;
591
592 if (ii_debug > 1)
593 cmn_err(CE_NOTE,
594 "!ii_btsize: bitmap with %" NSC_SZFMT
595 " spare fba's will map %" NSC_SZFMT " chunks",
596 tree_len, nchunks);
597
598 return (nchunks);
599 }
600