xref: /freebsd/sys/geom/raid/md_intel.c (revision 95d45410b5100e07f6f98450bcd841a8945d4726)
1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include <sys/param.h>
32 #include <sys/bio.h>
33 #include <sys/endian.h>
34 #include <sys/kernel.h>
35 #include <sys/kobj.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41 #include <sys/taskqueue.h>
42 #include <geom/geom.h>
43 #include "geom/raid/g_raid.h"
44 #include "g_raid_md_if.h"
45 
/* Malloc type passed to every malloc()/free() call made in this file. */
static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata");
47 
/*
 * One volume map as laid out in the metadata.  The structure is variable
 * length: disk_idx[] really holds total_disks entries, and whatever follows
 * the map starts right behind the last entry (see intel_get_map() and
 * intel_get_volume()).  64-bit values are split into a low word and a
 * matching *_hi word.
 */
struct intel_raid_map {
	uint32_t	offset;		/* Low 32 bits, see offset_hi. */
	uint32_t	disk_sectors;	/* Low 32 bits, see disk_sectors_hi. */
	uint32_t	stripe_count;	/* Low 32 bits, see stripe_count_hi. */
	uint16_t	strip_sectors;
	uint8_t		status;		/* One of INTEL_S_*. */
#define INTEL_S_READY           0x00
#define INTEL_S_UNINITIALIZED   0x01
#define INTEL_S_DEGRADED        0x02
#define INTEL_S_FAILURE         0x03

	uint8_t		type;		/* One of INTEL_T_*. */
#define INTEL_T_RAID0           0x00
#define INTEL_T_RAID1           0x01
#define INTEL_T_RAID5           0x05

	uint8_t		total_disks;	/* Number of disk_idx[] entries. */
	uint8_t		total_domains;
	uint8_t		failed_disk_num;
	uint8_t		ddf;
	uint32_t	offset_hi;	/* High 32 bits of offset. */
	uint32_t	disk_sectors_hi; /* High 32 bits of disk_sectors. */
	uint32_t	stripe_count_hi; /* High 32 bits of stripe_count. */
	uint32_t	filler_2[4];
	uint32_t	disk_idx[1];	/* total_disks entries. */
#define INTEL_DI_IDX	0x00ffffff	/* Mask of the disk index bits. */
#define INTEL_DI_RBLD	0x01000000	/* Flag bit stored next to the index. */
} __packed;
76 
/*
 * One volume record as laid out in the metadata.  It ends with one map, or
 * two maps when migr_state is non-zero; because maps are variable length the
 * next volume record begins after the last map's disk_idx[] array (see
 * intel_get_volume()).
 */
struct intel_raid_vol {
	uint8_t		name[16];
	u_int64_t	total_sectors __packed;
	uint32_t	state;		/* Bit mask of INTEL_ST_*. */
#define INTEL_ST_BOOTABLE		0x00000001
#define INTEL_ST_BOOT_DEVICE		0x00000002
#define INTEL_ST_READ_COALESCING	0x00000004
#define INTEL_ST_WRITE_COALESCING	0x00000008
#define INTEL_ST_LAST_SHUTDOWN_DIRTY	0x00000010
#define INTEL_ST_HIDDEN_AT_BOOT		0x00000020
#define INTEL_ST_CURRENTLY_HIDDEN	0x00000040
#define INTEL_ST_VERIFY_AND_FIX		0x00000080
#define INTEL_ST_MAP_STATE_UNINIT	0x00000100
#define INTEL_ST_NO_AUTO_RECOVERY	0x00000200
#define INTEL_ST_CLONE_N_GO		0x00000400
#define INTEL_ST_CLONE_MAN_SYNC		0x00000800
#define INTEL_ST_CNG_MASTER_DISK_NUM	0x00001000
	uint32_t	reserved;
	uint8_t		migr_priority;
	uint8_t		num_sub_vols;
	uint8_t		tid;
	uint8_t		cng_master_disk;
	uint16_t	cache_policy;
	uint8_t		cng_state;	/* One of INTEL_CNGST_*. */
#define INTEL_CNGST_UPDATED		0
#define INTEL_CNGST_NEEDS_UPDATE	1
#define INTEL_CNGST_MASTER_MISSING	2
	uint8_t		cng_sub_state;
	uint32_t	filler_0[10];

	uint32_t	curr_migr_unit;	/* Low 32 bits, see curr_migr_unit_hi. */
	uint32_t	checkpoint_id;
	uint8_t		migr_state;	/* Non-zero: a second map is present. */
	uint8_t		migr_type;	/* One of INTEL_MT_*. */
#define INTEL_MT_INIT		0
#define INTEL_MT_REBUILD	1
#define INTEL_MT_VERIFY		2
#define INTEL_MT_GEN_MIGR	3
#define INTEL_MT_STATE_CHANGE	4
#define INTEL_MT_REPAIR		5
	uint8_t		dirty;
	uint8_t		fs_state;
	uint16_t	verify_errors;
	uint16_t	bad_blocks;
	uint32_t	curr_migr_unit_hi; /* High 32 bits of curr_migr_unit. */
	uint32_t	filler_1[3];
	struct intel_raid_map map[1];	/* 2 entries if migr_state != 0. */
} __packed;
125 
/*
 * One disk record of the metadata disk table.  Records are matched to
 * physical disks by comparing the serial field (see intel_meta_find_disk()).
 */
struct intel_raid_disk {
#define INTEL_SERIAL_LEN	16
	uint8_t		serial[INTEL_SERIAL_LEN];
	uint32_t	sectors;	/* Low 32 bits, see sectors_hi. */
	uint32_t	id;
	uint32_t	flags;		/* Bit mask of INTEL_F_*. */
#define INTEL_F_SPARE		0x01
#define INTEL_F_ASSIGNED	0x02
#define INTEL_F_FAILED		0x04
#define INTEL_F_ONLINE		0x08
#define INTEL_F_DISABLED	0x80
	uint32_t	owner_cfg_num;
	uint32_t	sectors_hi;	/* High 32 bits of sectors. */
	uint32_t	filler[3];
} __packed;
141 
/*
 * Metadata header (anchor).  It is followed by total_disks disk records and
 * then total_volumes variable-length volume records; config_size gives the
 * size in bytes of the whole thing.
 */
struct intel_raid_conf {
	uint8_t		intel_id[24];	/* Must match INTEL_MAGIC. */
#define INTEL_MAGIC             "Intel Raid ISM Cfg Sig. "

	uint8_t		version[6];	/* One of INTEL_VERSION_*, no NUL. */
#define INTEL_VERSION_1000	"1.0.00"	/* RAID0 */
#define INTEL_VERSION_1100	"1.1.00"	/* RAID1 */
#define INTEL_VERSION_1200	"1.2.00"	/* Many volumes */
#define INTEL_VERSION_1201	"1.2.01"	/* 3 or 4 disks */
#define INTEL_VERSION_1202	"1.2.02"	/* RAID5 */
#define INTEL_VERSION_1204	"1.2.04"	/* 5 or 6 disks */
#define INTEL_VERSION_1206	"1.2.06"	/* CNG */
#define INTEL_VERSION_1300	"1.3.00"	/* Attributes */

	uint8_t		dummy_0[2];
	/*
	 * Sum of all 32-bit words of the config_size bytes, taken with this
	 * field counted as zero (see intel_meta_read()/intel_meta_write()).
	 */
	uint32_t	checksum;
	uint32_t	config_size;	/* Total metadata size in bytes. */
	uint32_t	config_id;
	uint32_t	generation;
	uint32_t	error_log_size;
	uint32_t	attributes;	/* Bit mask of INTEL_ATTR_*. */
#define INTEL_ATTR_RAID0	0x00000001
#define INTEL_ATTR_RAID1	0x00000002
#define INTEL_ATTR_RAID10	0x00000004
#define INTEL_ATTR_RAID1E	0x00000008
#define INTEL_ATTR_RAID5	0x00000010
#define INTEL_ATTR_RAIDCNG	0x00000020
#define INTEL_ATTR_EXT_STRIP	0x00000040
#define INTEL_ATTR_NVM_CACHE	0x02000000
#define INTEL_ATTR_2TB_DISK	0x04000000
#define INTEL_ATTR_BBM		0x08000000
#define INTEL_ATTR_NVM_CACHE2	0x10000000
#define INTEL_ATTR_2TB		0x20000000
#define INTEL_ATTR_PM		0x40000000
#define INTEL_ATTR_CHECKSUM	0x80000000

	uint8_t		total_disks;	/* Number of disk[] entries. */
	uint8_t		total_volumes;	/* Number of volume records. */
	uint8_t		error_log_pos;
	uint8_t		dummy_2[1];
	uint32_t	cache_size;
	uint32_t	orig_config_id;
	uint32_t	pwr_cycle_count;
	uint32_t	bbm_log_size;
	uint32_t	filler_0[35];
	struct intel_raid_disk	disk[1];	/* total_disks entries. */
	/* Here goes total_volumes of struct intel_raid_vol. */
} __packed;
190 
/*
 * Attribute bits accepted by this module.  intel_meta_read() refuses
 * version 1.3.00 metadata that has any attribute bit outside this mask.
 */
#define INTEL_ATTR_SUPPORTED	( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 |	\
    INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 |		\
    INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK |	\
    INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM )
195 
/*
 * Upper bound, in bytes, of the metadata for an array of ndisks disks:
 * the header (which already contains one disk record), the remaining
 * disk records, two volume records (each already containing one map with
 * one disk_idx entry), two additional maps for volumes under migration,
 * and the remaining disk_idx entries of all four maps.
 *
 * The argument is parenthesized so that any expression may be passed.
 */
#define INTEL_MAX_MD_SIZE(ndisks)				\
    (sizeof(struct intel_raid_conf) +				\
     sizeof(struct intel_raid_disk) * ((ndisks) - 1) +		\
     sizeof(struct intel_raid_vol) * 2 +			\
     sizeof(struct intel_raid_map) * 2 +			\
     sizeof(uint32_t) * ((ndisks) - 1) * 4)
202 
/* Per-disk private data, reached through the disk's d_md_data pointer. */
struct g_raid_md_intel_perdisk {
	struct intel_raid_conf	*pd_meta;	/* Metadata tied to this disk. */
	int			 pd_disk_pos;	/* Position looked up by
						   g_raid_md_intel_get_disk(). */
	struct intel_raid_disk	 pd_disk_meta;	/* This disk's own record. */
};
208 
/* Per-volume private data, reached through the volume's v_md_data pointer. */
struct g_raid_md_intel_pervolume {
	int			 pv_volume_pos;	/* Index of the volume record,
						   as used by
						   intel_get_volume(). */
	int			 pv_cng;	/* NOTE(review): presumably
						   mirrors the on-disk CNG
						   (INTEL_ST_CLONE_N_GO)
						   settings -- confirm. */
	int			 pv_cng_man_sync;
	int			 pv_cng_master_disk; /* Compared against a disk
						   position. */
};
215 
/*
 * Per-array metadata object.  mdio_base has to stay the first member: code
 * in this file casts a struct g_raid_md_object pointer to this type.
 */
struct g_raid_md_intel_object {
	struct g_raid_md_object	 mdio_base;
	uint32_t		 mdio_config_id;
	uint32_t		 mdio_orig_config_id;
	uint32_t		 mdio_generation;
	struct intel_raid_conf	*mdio_meta;	/* Array-wide metadata copy. */
	struct callout		 mdio_start_co;	/* STARTING state timer. */
	int			 mdio_disks_present;
	int			 mdio_started;	/* Non-zero once started. */
	int			 mdio_incomplete;
	struct root_hold_token	*mdio_rootmount; /* Root mount delay token. */
};
228 
/*
 * Forward declarations of the method implementations that are registered
 * in g_raid_md_intel_methods[].
 */
static g_raid_md_create_t g_raid_md_create_intel;
static g_raid_md_taste_t g_raid_md_taste_intel;
static g_raid_md_event_t g_raid_md_event_intel;
static g_raid_md_ctl_t g_raid_md_ctl_intel;
static g_raid_md_write_t g_raid_md_write_intel;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel;
static g_raid_md_free_disk_t g_raid_md_free_disk_intel;
static g_raid_md_free_volume_t g_raid_md_free_volume_intel;
static g_raid_md_free_t g_raid_md_free_intel;
238 
/*
 * kobj method table binding each g_raid_md interface method to its Intel
 * implementation; the all-zero entry terminates the list.
 */
static kobj_method_t g_raid_md_intel_methods[] = {
	KOBJMETHOD(g_raid_md_create,	g_raid_md_create_intel),
	KOBJMETHOD(g_raid_md_taste,	g_raid_md_taste_intel),
	KOBJMETHOD(g_raid_md_event,	g_raid_md_event_intel),
	KOBJMETHOD(g_raid_md_ctl,	g_raid_md_ctl_intel),
	KOBJMETHOD(g_raid_md_write,	g_raid_md_write_intel),
	KOBJMETHOD(g_raid_md_fail_disk,	g_raid_md_fail_disk_intel),
	KOBJMETHOD(g_raid_md_free_disk,	g_raid_md_free_disk_intel),
	KOBJMETHOD(g_raid_md_free_volume,	g_raid_md_free_volume_intel),
	KOBJMETHOD(g_raid_md_free,	g_raid_md_free_intel),
	{ 0, 0 }
};
251 
/*
 * Class descriptor for the "Intel" metadata format: the method table above
 * and the size of the per-array object, with the class enabled
 * (mdc_enable = 1) and mdc_priority set to 100.
 */
static struct g_raid_md_class g_raid_md_intel_class = {
	"Intel",
	g_raid_md_intel_methods,
	sizeof(struct g_raid_md_intel_object),
	.mdc_enable = 1,
	.mdc_priority = 100
};
259 
260 
261 static struct intel_raid_map *
262 intel_get_map(struct intel_raid_vol *mvol, int i)
263 {
264 	struct intel_raid_map *mmap;
265 
266 	if (i > (mvol->migr_state ? 1 : 0))
267 		return (NULL);
268 	mmap = &mvol->map[0];
269 	for (; i > 0; i--) {
270 		mmap = (struct intel_raid_map *)
271 		    &mmap->disk_idx[mmap->total_disks];
272 	}
273 	return ((struct intel_raid_map *)mmap);
274 }
275 
276 static struct intel_raid_vol *
277 intel_get_volume(struct intel_raid_conf *meta, int i)
278 {
279 	struct intel_raid_vol *mvol;
280 	struct intel_raid_map *mmap;
281 
282 	if (i > 1)
283 		return (NULL);
284 	mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks];
285 	for (; i > 0; i--) {
286 		mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0);
287 		mvol = (struct intel_raid_vol *)
288 		    &mmap->disk_idx[mmap->total_disks];
289 	}
290 	return (mvol);
291 }
292 
293 static off_t
294 intel_get_map_offset(struct intel_raid_map *mmap)
295 {
296 	off_t offset = (off_t)mmap->offset_hi << 32;
297 
298 	offset += mmap->offset;
299 	return (offset);
300 }
301 
302 static void
303 intel_set_map_offset(struct intel_raid_map *mmap, off_t offset)
304 {
305 
306 	mmap->offset = offset & 0xffffffff;
307 	mmap->offset_hi = offset >> 32;
308 }
309 
310 static off_t
311 intel_get_map_disk_sectors(struct intel_raid_map *mmap)
312 {
313 	off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32;
314 
315 	disk_sectors += mmap->disk_sectors;
316 	return (disk_sectors);
317 }
318 
319 static void
320 intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors)
321 {
322 
323 	mmap->disk_sectors = disk_sectors & 0xffffffff;
324 	mmap->disk_sectors_hi = disk_sectors >> 32;
325 }
326 
327 static void
328 intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count)
329 {
330 
331 	mmap->stripe_count = stripe_count & 0xffffffff;
332 	mmap->stripe_count_hi = stripe_count >> 32;
333 }
334 
335 static off_t
336 intel_get_disk_sectors(struct intel_raid_disk *disk)
337 {
338 	off_t sectors = (off_t)disk->sectors_hi << 32;
339 
340 	sectors += disk->sectors;
341 	return (sectors);
342 }
343 
344 static void
345 intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors)
346 {
347 
348 	disk->sectors = sectors & 0xffffffff;
349 	disk->sectors_hi = sectors >> 32;
350 }
351 
352 static off_t
353 intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol)
354 {
355 	off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32;
356 
357 	curr_migr_unit += vol->curr_migr_unit;
358 	return (curr_migr_unit);
359 }
360 
361 static void
362 intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit)
363 {
364 
365 	vol->curr_migr_unit = curr_migr_unit & 0xffffffff;
366 	vol->curr_migr_unit_hi = curr_migr_unit >> 32;
367 }
368 
369 static void
370 g_raid_md_intel_print(struct intel_raid_conf *meta)
371 {
372 	struct intel_raid_vol *mvol;
373 	struct intel_raid_map *mmap;
374 	int i, j, k;
375 
376 	if (g_raid_debug < 1)
377 		return;
378 
379 	printf("********* ATA Intel MatrixRAID Metadata *********\n");
380 	printf("intel_id            <%.24s>\n", meta->intel_id);
381 	printf("version             <%.6s>\n", meta->version);
382 	printf("checksum            0x%08x\n", meta->checksum);
383 	printf("config_size         0x%08x\n", meta->config_size);
384 	printf("config_id           0x%08x\n", meta->config_id);
385 	printf("generation          0x%08x\n", meta->generation);
386 	printf("error_log_size      %d\n", meta->error_log_size);
387 	printf("attributes          0x%08x\n", meta->attributes);
388 	printf("total_disks         %u\n", meta->total_disks);
389 	printf("total_volumes       %u\n", meta->total_volumes);
390 	printf("error_log_pos       %u\n", meta->error_log_pos);
391 	printf("cache_size          %u\n", meta->cache_size);
392 	printf("orig_config_id      0x%08x\n", meta->orig_config_id);
393 	printf("pwr_cycle_count     %u\n", meta->pwr_cycle_count);
394 	printf("bbm_log_size        %u\n", meta->bbm_log_size);
395 	printf("DISK#   serial disk_sectors disk_sectors_hi disk_id flags owner\n");
396 	for (i = 0; i < meta->total_disks; i++ ) {
397 		printf("    %d   <%.16s> %u %u 0x%08x 0x%08x %08x\n", i,
398 		    meta->disk[i].serial, meta->disk[i].sectors,
399 		    meta->disk[i].sectors_hi, meta->disk[i].id,
400 		    meta->disk[i].flags, meta->disk[i].owner_cfg_num);
401 	}
402 	for (i = 0; i < meta->total_volumes; i++) {
403 		mvol = intel_get_volume(meta, i);
404 		printf(" ****** Volume %d ******\n", i);
405 		printf(" name               %.16s\n", mvol->name);
406 		printf(" total_sectors      %ju\n", mvol->total_sectors);
407 		printf(" state              0x%08x\n", mvol->state);
408 		printf(" reserved           %u\n", mvol->reserved);
409 		printf(" migr_priority      %u\n", mvol->migr_priority);
410 		printf(" num_sub_vols       %u\n", mvol->num_sub_vols);
411 		printf(" tid                %u\n", mvol->tid);
412 		printf(" cng_master_disk    %u\n", mvol->cng_master_disk);
413 		printf(" cache_policy       %u\n", mvol->cache_policy);
414 		printf(" cng_state          %u\n", mvol->cng_state);
415 		printf(" cng_sub_state      %u\n", mvol->cng_sub_state);
416 		printf(" curr_migr_unit     %u\n", mvol->curr_migr_unit);
417 		printf(" curr_migr_unit_hi  %u\n", mvol->curr_migr_unit_hi);
418 		printf(" checkpoint_id      %u\n", mvol->checkpoint_id);
419 		printf(" migr_state         %u\n", mvol->migr_state);
420 		printf(" migr_type          %u\n", mvol->migr_type);
421 		printf(" dirty              %u\n", mvol->dirty);
422 		printf(" fs_state           %u\n", mvol->fs_state);
423 		printf(" verify_errors      %u\n", mvol->verify_errors);
424 		printf(" bad_blocks         %u\n", mvol->bad_blocks);
425 
426 		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
427 			printf("  *** Map %d ***\n", j);
428 			mmap = intel_get_map(mvol, j);
429 			printf("  offset            %u\n", mmap->offset);
430 			printf("  offset_hi         %u\n", mmap->offset_hi);
431 			printf("  disk_sectors      %u\n", mmap->disk_sectors);
432 			printf("  disk_sectors_hi   %u\n", mmap->disk_sectors_hi);
433 			printf("  stripe_count      %u\n", mmap->stripe_count);
434 			printf("  stripe_count_hi   %u\n", mmap->stripe_count_hi);
435 			printf("  strip_sectors     %u\n", mmap->strip_sectors);
436 			printf("  status            %u\n", mmap->status);
437 			printf("  type              %u\n", mmap->type);
438 			printf("  total_disks       %u\n", mmap->total_disks);
439 			printf("  total_domains     %u\n", mmap->total_domains);
440 			printf("  failed_disk_num   %u\n", mmap->failed_disk_num);
441 			printf("  ddf               %u\n", mmap->ddf);
442 			printf("  disk_idx         ");
443 			for (k = 0; k < mmap->total_disks; k++)
444 				printf(" 0x%08x", mmap->disk_idx[k]);
445 			printf("\n");
446 		}
447 	}
448 	printf("=================================================\n");
449 }
450 
451 static struct intel_raid_conf *
452 intel_meta_copy(struct intel_raid_conf *meta)
453 {
454 	struct intel_raid_conf *nmeta;
455 
456 	nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK);
457 	memcpy(nmeta, meta, meta->config_size);
458 	return (nmeta);
459 }
460 
461 static int
462 intel_meta_find_disk(struct intel_raid_conf *meta, char *serial)
463 {
464 	int pos;
465 
466 	for (pos = 0; pos < meta->total_disks; pos++) {
467 		if (strncmp(meta->disk[pos].serial,
468 		    serial, INTEL_SERIAL_LEN) == 0)
469 			return (pos);
470 	}
471 	return (-1);
472 }
473 
474 static struct intel_raid_conf *
475 intel_meta_read(struct g_consumer *cp)
476 {
477 	struct g_provider *pp;
478 	struct intel_raid_conf *meta;
479 	struct intel_raid_vol *mvol;
480 	struct intel_raid_map *mmap, *mmap1;
481 	char *buf;
482 	int error, i, j, k, left, size;
483 	uint32_t checksum, *ptr;
484 
485 	pp = cp->provider;
486 
487 	/* Read the anchor sector. */
488 	buf = g_read_data(cp,
489 	    pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error);
490 	if (buf == NULL) {
491 		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
492 		    pp->name, error);
493 		return (NULL);
494 	}
495 	meta = (struct intel_raid_conf *)buf;
496 
497 	/* Check if this is an Intel RAID struct */
498 	if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) {
499 		G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name);
500 		g_free(buf);
501 		return (NULL);
502 	}
503 	if (meta->config_size > 65536 ||
504 	    meta->config_size < sizeof(struct intel_raid_conf)) {
505 		G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d",
506 		    meta->config_size);
507 		g_free(buf);
508 		return (NULL);
509 	}
510 	size = meta->config_size;
511 	meta = malloc(size, M_MD_INTEL, M_WAITOK);
512 	memcpy(meta, buf, min(size, pp->sectorsize));
513 	g_free(buf);
514 
515 	/* Read all the rest, if needed. */
516 	if (meta->config_size > pp->sectorsize) {
517 		left = (meta->config_size - 1) / pp->sectorsize;
518 		buf = g_read_data(cp,
519 		    pp->mediasize - pp->sectorsize * (2 + left),
520 		    pp->sectorsize * left, &error);
521 		if (buf == NULL) {
522 			G_RAID_DEBUG(1, "Cannot read remaining metadata"
523 			    " part from %s (error=%d).",
524 			    pp->name, error);
525 			free(meta, M_MD_INTEL);
526 			return (NULL);
527 		}
528 		memcpy(((char *)meta) + pp->sectorsize, buf,
529 		    pp->sectorsize * left);
530 		g_free(buf);
531 	}
532 
533 	/* Check metadata checksum. */
534 	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
535 	    i < (meta->config_size / sizeof(uint32_t)); i++) {
536 		checksum += *ptr++;
537 	}
538 	checksum -= meta->checksum;
539 	if (checksum != meta->checksum) {
540 		G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name);
541 		free(meta, M_MD_INTEL);
542 		return (NULL);
543 	}
544 
545 	/* Validate metadata size. */
546 	size = sizeof(struct intel_raid_conf) +
547 	    sizeof(struct intel_raid_disk) * (meta->total_disks - 1) +
548 	    sizeof(struct intel_raid_vol) * meta->total_volumes;
549 	if (size > meta->config_size) {
550 badsize:
551 		G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d",
552 		    meta->config_size, size);
553 		free(meta, M_MD_INTEL);
554 		return (NULL);
555 	}
556 	for (i = 0; i < meta->total_volumes; i++) {
557 		mvol = intel_get_volume(meta, i);
558 		mmap = intel_get_map(mvol, 0);
559 		size += 4 * (mmap->total_disks - 1);
560 		if (size > meta->config_size)
561 			goto badsize;
562 		if (mvol->migr_state) {
563 			size += sizeof(struct intel_raid_map);
564 			if (size > meta->config_size)
565 				goto badsize;
566 			mmap = intel_get_map(mvol, 1);
567 			size += 4 * (mmap->total_disks - 1);
568 			if (size > meta->config_size)
569 				goto badsize;
570 		}
571 	}
572 
573 	g_raid_md_intel_print(meta);
574 
575 	if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) {
576 		G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'",
577 		    meta->version);
578 		free(meta, M_MD_INTEL);
579 		return (NULL);
580 	}
581 
582 	if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 &&
583 	    (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) {
584 		G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x",
585 		    meta->attributes & ~INTEL_ATTR_SUPPORTED);
586 		free(meta, M_MD_INTEL);
587 		return (NULL);
588 	}
589 
590 	/* Validate disk indexes. */
591 	for (i = 0; i < meta->total_volumes; i++) {
592 		mvol = intel_get_volume(meta, i);
593 		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
594 			mmap = intel_get_map(mvol, j);
595 			for (k = 0; k < mmap->total_disks; k++) {
596 				if ((mmap->disk_idx[k] & INTEL_DI_IDX) >
597 				    meta->total_disks) {
598 					G_RAID_DEBUG(1, "Intel metadata disk"
599 					    " index %d too big (>%d)",
600 					    mmap->disk_idx[k] & INTEL_DI_IDX,
601 					    meta->total_disks);
602 					free(meta, M_MD_INTEL);
603 					return (NULL);
604 				}
605 			}
606 		}
607 	}
608 
609 	/* Validate migration types. */
610 	for (i = 0; i < meta->total_volumes; i++) {
611 		mvol = intel_get_volume(meta, i);
612 		/* Deny unknown migration types. */
613 		if (mvol->migr_state &&
614 		    mvol->migr_type != INTEL_MT_INIT &&
615 		    mvol->migr_type != INTEL_MT_REBUILD &&
616 		    mvol->migr_type != INTEL_MT_VERIFY &&
617 		    mvol->migr_type != INTEL_MT_GEN_MIGR &&
618 		    mvol->migr_type != INTEL_MT_REPAIR) {
619 			G_RAID_DEBUG(1, "Intel metadata has unsupported"
620 			    " migration type %d", mvol->migr_type);
621 			free(meta, M_MD_INTEL);
622 			return (NULL);
623 		}
624 		/* Deny general migrations except SINGLE->RAID1. */
625 		if (mvol->migr_state &&
626 		    mvol->migr_type == INTEL_MT_GEN_MIGR) {
627 			mmap = intel_get_map(mvol, 0);
628 			mmap1 = intel_get_map(mvol, 1);
629 			if (mmap1->total_disks != 1 ||
630 			    mmap->type != INTEL_T_RAID1 ||
631 			    mmap->total_disks != 2 ||
632 			    mmap->offset != mmap1->offset ||
633 			    mmap->disk_sectors != mmap1->disk_sectors ||
634 			    mmap->total_domains != mmap->total_disks ||
635 			    mmap->offset_hi != mmap1->offset_hi ||
636 			    mmap->disk_sectors_hi != mmap1->disk_sectors_hi ||
637 			    (mmap->disk_idx[0] != mmap1->disk_idx[0] &&
638 			     mmap->disk_idx[0] != mmap1->disk_idx[1])) {
639 				G_RAID_DEBUG(1, "Intel metadata has unsupported"
640 				    " variant of general migration");
641 				free(meta, M_MD_INTEL);
642 				return (NULL);
643 			}
644 		}
645 	}
646 
647 	return (meta);
648 }
649 
650 static int
651 intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta)
652 {
653 	struct g_provider *pp;
654 	char *buf;
655 	int error, i, sectors;
656 	uint32_t checksum, *ptr;
657 
658 	pp = cp->provider;
659 
660 	/* Recalculate checksum for case if metadata were changed. */
661 	meta->checksum = 0;
662 	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
663 	    i < (meta->config_size / sizeof(uint32_t)); i++) {
664 		checksum += *ptr++;
665 	}
666 	meta->checksum = checksum;
667 
668 	/* Create and fill buffer. */
669 	sectors = (meta->config_size + pp->sectorsize - 1) / pp->sectorsize;
670 	buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
671 	if (sectors > 1) {
672 		memcpy(buf, ((char *)meta) + pp->sectorsize,
673 		    (sectors - 1) * pp->sectorsize);
674 	}
675 	memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize);
676 
677 	error = g_write_data(cp,
678 	    pp->mediasize - pp->sectorsize * (1 + sectors),
679 	    buf, pp->sectorsize * sectors);
680 	if (error != 0) {
681 		G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
682 		    pp->name, error);
683 	}
684 
685 	free(buf, M_MD_INTEL);
686 	return (error);
687 }
688 
689 static int
690 intel_meta_erase(struct g_consumer *cp)
691 {
692 	struct g_provider *pp;
693 	char *buf;
694 	int error;
695 
696 	pp = cp->provider;
697 	buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
698 	error = g_write_data(cp,
699 	    pp->mediasize - 2 * pp->sectorsize,
700 	    buf, pp->sectorsize);
701 	if (error != 0) {
702 		G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
703 		    pp->name, error);
704 	}
705 	free(buf, M_MD_INTEL);
706 	return (error);
707 }
708 
709 static int
710 intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d)
711 {
712 	struct intel_raid_conf *meta;
713 	int error;
714 
715 	/* Fill anchor and single disk. */
716 	meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO);
717 	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
718 	memcpy(&meta->version[0], INTEL_VERSION_1000,
719 	    sizeof(INTEL_VERSION_1000) - 1);
720 	meta->config_size = INTEL_MAX_MD_SIZE(1);
721 	meta->config_id = meta->orig_config_id = arc4random();
722 	meta->generation = 1;
723 	meta->total_disks = 1;
724 	meta->disk[0] = *d;
725 	error = intel_meta_write(cp, meta);
726 	free(meta, M_MD_INTEL);
727 	return (error);
728 }
729 
730 static struct g_raid_disk *
731 g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id)
732 {
733 	struct g_raid_disk	*disk;
734 	struct g_raid_md_intel_perdisk *pd;
735 
736 	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
737 		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
738 		if (pd->pd_disk_pos == id)
739 			break;
740 	}
741 	return (disk);
742 }
743 
744 static int
745 g_raid_md_intel_supported(int level, int qual, int disks, int force)
746 {
747 
748 	switch (level) {
749 	case G_RAID_VOLUME_RL_RAID0:
750 		if (disks < 1)
751 			return (0);
752 		if (!force && (disks < 2 || disks > 6))
753 			return (0);
754 		break;
755 	case G_RAID_VOLUME_RL_RAID1:
756 		if (disks < 1)
757 			return (0);
758 		if (!force && (disks != 2))
759 			return (0);
760 		break;
761 	case G_RAID_VOLUME_RL_RAID1E:
762 		if (disks < 2)
763 			return (0);
764 		if (!force && (disks != 4))
765 			return (0);
766 		break;
767 	case G_RAID_VOLUME_RL_RAID5:
768 		if (disks < 3)
769 			return (0);
770 		if (!force && disks > 6)
771 			return (0);
772 		if (qual != G_RAID_VOLUME_RLQ_R5LA)
773 			return (0);
774 		break;
775 	default:
776 		return (0);
777 	}
778 	if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
779 		return (0);
780 	return (1);
781 }
782 
783 static struct g_raid_volume *
784 g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id)
785 {
786 	struct g_raid_volume	*mvol;
787 	struct g_raid_md_intel_pervolume *pv;
788 
789 	TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) {
790 		pv = mvol->v_md_data;
791 		if (pv->pv_volume_pos == id)
792 			break;
793 	}
794 	return (mvol);
795 }
796 
797 static int
798 g_raid_md_intel_start_disk(struct g_raid_disk *disk)
799 {
800 	struct g_raid_softc *sc;
801 	struct g_raid_subdisk *sd, *tmpsd;
802 	struct g_raid_disk *olddisk, *tmpdisk;
803 	struct g_raid_md_object *md;
804 	struct g_raid_md_intel_object *mdi;
805 	struct g_raid_md_intel_pervolume *pv;
806 	struct g_raid_md_intel_perdisk *pd, *oldpd;
807 	struct intel_raid_conf *meta;
808 	struct intel_raid_vol *mvol;
809 	struct intel_raid_map *mmap0, *mmap1;
810 	int disk_pos, resurrection = 0, migr_global, i;
811 
812 	sc = disk->d_softc;
813 	md = sc->sc_md;
814 	mdi = (struct g_raid_md_intel_object *)md;
815 	meta = mdi->mdio_meta;
816 	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
817 	olddisk = NULL;
818 
819 	/* Find disk position in metadata by it's serial. */
820 	disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial);
821 	if (disk_pos < 0) {
822 		G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk");
823 		/* Failed stale disk is useless for us. */
824 		if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) &&
825 		    !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) {
826 			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
827 			return (0);
828 		}
829 		/* If we are in the start process, that's all for now. */
830 		if (!mdi->mdio_started)
831 			goto nofit;
832 		/*
833 		 * If we have already started - try to get use of the disk.
834 		 * Try to replace OFFLINE disks first, then FAILED.
835 		 */
836 		TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) {
837 			if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE &&
838 			    tmpdisk->d_state != G_RAID_DISK_S_FAILED)
839 				continue;
840 			/* Make sure this disk is big enough. */
841 			TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) {
842 				off_t disk_sectors =
843 				    intel_get_disk_sectors(&pd->pd_disk_meta);
844 
845 				if (sd->sd_offset + sd->sd_size + 4096 >
846 				    disk_sectors * 512) {
847 					G_RAID_DEBUG1(1, sc,
848 					    "Disk too small (%llu < %llu)",
849 					    (unsigned long long)
850 					    disk_sectors * 512,
851 					    (unsigned long long)
852 					    sd->sd_offset + sd->sd_size + 4096);
853 					break;
854 				}
855 			}
856 			if (sd != NULL)
857 				continue;
858 			if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) {
859 				olddisk = tmpdisk;
860 				break;
861 			} else if (olddisk == NULL)
862 				olddisk = tmpdisk;
863 		}
864 		if (olddisk == NULL) {
865 nofit:
866 			if (pd->pd_disk_meta.flags & INTEL_F_SPARE) {
867 				g_raid_change_disk_state(disk,
868 				    G_RAID_DISK_S_SPARE);
869 				return (1);
870 			} else {
871 				g_raid_change_disk_state(disk,
872 				    G_RAID_DISK_S_STALE);
873 				return (0);
874 			}
875 		}
876 		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
877 		disk_pos = oldpd->pd_disk_pos;
878 		resurrection = 1;
879 	}
880 
881 	if (olddisk == NULL) {
882 		/* Find placeholder by position. */
883 		olddisk = g_raid_md_intel_get_disk(sc, disk_pos);
884 		if (olddisk == NULL)
885 			panic("No disk at position %d!", disk_pos);
886 		if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
887 			G_RAID_DEBUG1(1, sc, "More then one disk for pos %d",
888 			    disk_pos);
889 			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
890 			return (0);
891 		}
892 		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
893 	}
894 
895 	/* Replace failed disk or placeholder with new disk. */
896 	TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) {
897 		TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next);
898 		TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
899 		sd->sd_disk = disk;
900 	}
901 	oldpd->pd_disk_pos = -2;
902 	pd->pd_disk_pos = disk_pos;
903 
904 	/* If it was placeholder -- destroy it. */
905 	if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) {
906 		g_raid_destroy_disk(olddisk);
907 	} else {
908 		/* Otherwise, make it STALE_FAILED. */
909 		g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED);
910 		/* Update global metadata just in case. */
911 		memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta,
912 		    sizeof(struct intel_raid_disk));
913 	}
914 
915 	/* Welcome the new disk. */
916 	if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
917 	    !(pd->pd_disk_meta.flags & INTEL_F_SPARE))
918 		g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED);
919 	else if (resurrection)
920 		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
921 	else if (meta->disk[disk_pos].flags & INTEL_F_FAILED)
922 		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
923 	else if (meta->disk[disk_pos].flags & INTEL_F_SPARE)
924 		g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
925 	else
926 		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
927 	TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
928 		pv = sd->sd_volume->v_md_data;
929 		mvol = intel_get_volume(meta, pv->pv_volume_pos);
930 		mmap0 = intel_get_map(mvol, 0);
931 		if (mvol->migr_state)
932 			mmap1 = intel_get_map(mvol, 1);
933 		else
934 			mmap1 = mmap0;
935 
936 		migr_global = 1;
937 		for (i = 0; i < mmap0->total_disks; i++) {
938 			if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 &&
939 			    (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0)
940 				migr_global = 0;
941 		}
942 
943 		if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
944 		    !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) {
945 			/* Disabled disk, useless. */
946 			g_raid_change_subdisk_state(sd,
947 			    G_RAID_SUBDISK_S_NONE);
948 		} else if (resurrection) {
949 			/* Stale disk, almost same as new. */
950 			g_raid_change_subdisk_state(sd,
951 			    G_RAID_SUBDISK_S_NEW);
952 		} else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) {
953 			/* Failed disk, almost useless. */
954 			g_raid_change_subdisk_state(sd,
955 			    G_RAID_SUBDISK_S_FAILED);
956 		} else if (mvol->migr_state == 0) {
957 			if (mmap0->status == INTEL_S_UNINITIALIZED &&
958 			    (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) {
959 				/* Freshly created uninitialized volume. */
960 				g_raid_change_subdisk_state(sd,
961 				    G_RAID_SUBDISK_S_UNINITIALIZED);
962 			} else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
963 				/* Freshly inserted disk. */
964 				g_raid_change_subdisk_state(sd,
965 				    G_RAID_SUBDISK_S_NEW);
966 			} else if (mvol->dirty && (!pv->pv_cng ||
967 			    pv->pv_cng_master_disk != disk_pos)) {
968 				/* Dirty volume (unclean shutdown). */
969 				g_raid_change_subdisk_state(sd,
970 				    G_RAID_SUBDISK_S_STALE);
971 			} else {
972 				/* Up to date disk. */
973 				g_raid_change_subdisk_state(sd,
974 				    G_RAID_SUBDISK_S_ACTIVE);
975 			}
976 		} else if (mvol->migr_type == INTEL_MT_INIT ||
977 			   mvol->migr_type == INTEL_MT_REBUILD) {
978 			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
979 				/* Freshly inserted disk. */
980 				g_raid_change_subdisk_state(sd,
981 				    G_RAID_SUBDISK_S_NEW);
982 			} else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
983 				/* Rebuilding disk. */
984 				g_raid_change_subdisk_state(sd,
985 				    G_RAID_SUBDISK_S_REBUILD);
986 				if (mvol->dirty) {
987 					sd->sd_rebuild_pos = 0;
988 				} else {
989 					sd->sd_rebuild_pos =
990 					    intel_get_vol_curr_migr_unit(mvol) *
991 					    sd->sd_volume->v_strip_size *
992 					    mmap0->total_domains;
993 				}
994 			} else if (mvol->migr_type == INTEL_MT_INIT &&
995 			    migr_global) {
996 				/* Freshly created uninitialized volume. */
997 				g_raid_change_subdisk_state(sd,
998 				    G_RAID_SUBDISK_S_UNINITIALIZED);
999 			} else if (mvol->dirty && (!pv->pv_cng ||
1000 			    pv->pv_cng_master_disk != disk_pos)) {
1001 				/* Dirty volume (unclean shutdown). */
1002 				g_raid_change_subdisk_state(sd,
1003 				    G_RAID_SUBDISK_S_STALE);
1004 			} else {
1005 				/* Up to date disk. */
1006 				g_raid_change_subdisk_state(sd,
1007 				    G_RAID_SUBDISK_S_ACTIVE);
1008 			}
1009 		} else if (mvol->migr_type == INTEL_MT_VERIFY ||
1010 			   mvol->migr_type == INTEL_MT_REPAIR) {
1011 			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1012 				/* Freshly inserted disk. */
1013 				g_raid_change_subdisk_state(sd,
1014 				    G_RAID_SUBDISK_S_NEW);
1015 			} else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) ||
1016 			    migr_global) {
1017 				/* Resyncing disk. */
1018 				g_raid_change_subdisk_state(sd,
1019 				    G_RAID_SUBDISK_S_RESYNC);
1020 				if (mvol->dirty) {
1021 					sd->sd_rebuild_pos = 0;
1022 				} else {
1023 					sd->sd_rebuild_pos =
1024 					    intel_get_vol_curr_migr_unit(mvol) *
1025 					    sd->sd_volume->v_strip_size *
1026 					    mmap0->total_domains;
1027 				}
1028 			} else if (mvol->dirty) {
1029 				/* Dirty volume (unclean shutdown). */
1030 				g_raid_change_subdisk_state(sd,
1031 				    G_RAID_SUBDISK_S_STALE);
1032 			} else {
1033 				/* Up to date disk. */
1034 				g_raid_change_subdisk_state(sd,
1035 				    G_RAID_SUBDISK_S_ACTIVE);
1036 			}
1037 		} else if (mvol->migr_type == INTEL_MT_GEN_MIGR) {
1038 			if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) {
1039 				/* Freshly inserted disk. */
1040 				g_raid_change_subdisk_state(sd,
1041 				    G_RAID_SUBDISK_S_NEW);
1042 			} else {
1043 				/* Up to date disk. */
1044 				g_raid_change_subdisk_state(sd,
1045 				    G_RAID_SUBDISK_S_ACTIVE);
1046 			}
1047 		}
1048 		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1049 		    G_RAID_EVENT_SUBDISK);
1050 	}
1051 
1052 	/* Update status of our need for spare. */
1053 	if (mdi->mdio_started) {
1054 		mdi->mdio_incomplete =
1055 		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1056 		     g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) <
1057 		     meta->total_disks);
1058 	}
1059 
1060 	return (resurrection);
1061 }
1062 
1063 static void
1064 g_disk_md_intel_retaste(void *arg, int pending)
1065 {
1066 
1067 	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
1068 	g_retaste(&g_raid_class);
1069 	free(arg, M_MD_INTEL);
1070 }
1071 
1072 static void
1073 g_raid_md_intel_refill(struct g_raid_softc *sc)
1074 {
1075 	struct g_raid_md_object *md;
1076 	struct g_raid_md_intel_object *mdi;
1077 	struct intel_raid_conf *meta;
1078 	struct g_raid_disk *disk;
1079 	struct task *task;
1080 	int update, na;
1081 
1082 	md = sc->sc_md;
1083 	mdi = (struct g_raid_md_intel_object *)md;
1084 	meta = mdi->mdio_meta;
1085 	update = 0;
1086 	do {
1087 		/* Make sure we miss anything. */
1088 		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1089 		    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED);
1090 		if (na == meta->total_disks)
1091 			break;
1092 
1093 		G_RAID_DEBUG1(1, md->mdo_softc,
1094 		    "Array is not complete (%d of %d), "
1095 		    "trying to refill.", na, meta->total_disks);
1096 
1097 		/* Try to get use some of STALE disks. */
1098 		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1099 			if (disk->d_state == G_RAID_DISK_S_STALE) {
1100 				update += g_raid_md_intel_start_disk(disk);
1101 				if (disk->d_state == G_RAID_DISK_S_ACTIVE ||
1102 				    disk->d_state == G_RAID_DISK_S_DISABLED)
1103 					break;
1104 			}
1105 		}
1106 		if (disk != NULL)
1107 			continue;
1108 
1109 		/* Try to get use some of SPARE disks. */
1110 		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1111 			if (disk->d_state == G_RAID_DISK_S_SPARE) {
1112 				update += g_raid_md_intel_start_disk(disk);
1113 				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
1114 					break;
1115 			}
1116 		}
1117 	} while (disk != NULL);
1118 
1119 	/* Write new metadata if we changed something. */
1120 	if (update) {
1121 		g_raid_md_write_intel(md, NULL, NULL, NULL);
1122 		meta = mdi->mdio_meta;
1123 	}
1124 
1125 	/* Update status of our need for spare. */
1126 	mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1127 	    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks);
1128 
1129 	/* Request retaste hoping to find spare. */
1130 	if (mdi->mdio_incomplete) {
1131 		task = malloc(sizeof(struct task),
1132 		    M_MD_INTEL, M_WAITOK | M_ZERO);
1133 		TASK_INIT(task, 0, g_disk_md_intel_retaste, task);
1134 		taskqueue_enqueue(taskqueue_swi, task);
1135 	}
1136 }
1137 
1138 static void
1139 g_raid_md_intel_start(struct g_raid_softc *sc)
1140 {
1141 	struct g_raid_md_object *md;
1142 	struct g_raid_md_intel_object *mdi;
1143 	struct g_raid_md_intel_pervolume *pv;
1144 	struct g_raid_md_intel_perdisk *pd;
1145 	struct intel_raid_conf *meta;
1146 	struct intel_raid_vol *mvol;
1147 	struct intel_raid_map *mmap;
1148 	struct g_raid_volume *vol;
1149 	struct g_raid_subdisk *sd;
1150 	struct g_raid_disk *disk;
1151 	int i, j, disk_pos;
1152 
1153 	md = sc->sc_md;
1154 	mdi = (struct g_raid_md_intel_object *)md;
1155 	meta = mdi->mdio_meta;
1156 
1157 	/* Create volumes and subdisks. */
1158 	for (i = 0; i < meta->total_volumes; i++) {
1159 		mvol = intel_get_volume(meta, i);
1160 		mmap = intel_get_map(mvol, 0);
1161 		vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1);
1162 		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1163 		pv->pv_volume_pos = i;
1164 		pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0;
1165 		pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0;
1166 		if (mvol->cng_master_disk < mmap->total_disks)
1167 			pv->pv_cng_master_disk = mvol->cng_master_disk;
1168 		vol->v_md_data = pv;
1169 		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
1170 		if (mmap->type == INTEL_T_RAID0)
1171 			vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
1172 		else if (mmap->type == INTEL_T_RAID1 &&
1173 		    mmap->total_domains >= 2 &&
1174 		    mmap->total_domains <= mmap->total_disks) {
1175 			/* Assume total_domains is correct. */
1176 			if (mmap->total_domains == mmap->total_disks)
1177 				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1178 			else
1179 				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1180 		} else if (mmap->type == INTEL_T_RAID1) {
1181 			/* total_domains looks wrong. */
1182 			if (mmap->total_disks <= 2)
1183 				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1184 			else
1185 				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1186 		} else if (mmap->type == INTEL_T_RAID5) {
1187 			vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
1188 			vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
1189 		} else
1190 			vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1191 		vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ
1192 		vol->v_disks_count = mmap->total_disks;
1193 		vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ
1194 		vol->v_sectorsize = 512; //ZZZ
1195 		for (j = 0; j < vol->v_disks_count; j++) {
1196 			sd = &vol->v_subdisks[j];
1197 			sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ
1198 			sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ
1199 		}
1200 		g_raid_start_volume(vol);
1201 	}
1202 
1203 	/* Create disk placeholders to store data for later writing. */
1204 	for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) {
1205 		pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1206 		pd->pd_disk_pos = disk_pos;
1207 		pd->pd_disk_meta = meta->disk[disk_pos];
1208 		disk = g_raid_create_disk(sc);
1209 		disk->d_md_data = (void *)pd;
1210 		disk->d_state = G_RAID_DISK_S_OFFLINE;
1211 		for (i = 0; i < meta->total_volumes; i++) {
1212 			mvol = intel_get_volume(meta, i);
1213 			mmap = intel_get_map(mvol, 0);
1214 			for (j = 0; j < mmap->total_disks; j++) {
1215 				if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos)
1216 					break;
1217 			}
1218 			if (j == mmap->total_disks)
1219 				continue;
1220 			vol = g_raid_md_intel_get_volume(sc, i);
1221 			sd = &vol->v_subdisks[j];
1222 			sd->sd_disk = disk;
1223 			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1224 		}
1225 	}
1226 
1227 	/* Make all disks found till the moment take their places. */
1228 	do {
1229 		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1230 			if (disk->d_state == G_RAID_DISK_S_NONE) {
1231 				g_raid_md_intel_start_disk(disk);
1232 				break;
1233 			}
1234 		}
1235 	} while (disk != NULL);
1236 
1237 	mdi->mdio_started = 1;
1238 	G_RAID_DEBUG1(0, sc, "Array started.");
1239 	g_raid_md_write_intel(md, NULL, NULL, NULL);
1240 
1241 	/* Pickup any STALE/SPARE disks to refill array if needed. */
1242 	g_raid_md_intel_refill(sc);
1243 
1244 	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1245 		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1246 		    G_RAID_EVENT_VOLUME);
1247 	}
1248 
1249 	callout_stop(&mdi->mdio_start_co);
1250 	G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount);
1251 	root_mount_rel(mdi->mdio_rootmount);
1252 	mdi->mdio_rootmount = NULL;
1253 }
1254 
1255 static void
1256 g_raid_md_intel_new_disk(struct g_raid_disk *disk)
1257 {
1258 	struct g_raid_softc *sc;
1259 	struct g_raid_md_object *md;
1260 	struct g_raid_md_intel_object *mdi;
1261 	struct intel_raid_conf *pdmeta;
1262 	struct g_raid_md_intel_perdisk *pd;
1263 
1264 	sc = disk->d_softc;
1265 	md = sc->sc_md;
1266 	mdi = (struct g_raid_md_intel_object *)md;
1267 	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1268 	pdmeta = pd->pd_meta;
1269 
1270 	if (mdi->mdio_started) {
1271 		if (g_raid_md_intel_start_disk(disk))
1272 			g_raid_md_write_intel(md, NULL, NULL, NULL);
1273 	} else {
1274 		/* If we haven't started yet - check metadata freshness. */
1275 		if (mdi->mdio_meta == NULL ||
1276 		    ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) {
1277 			G_RAID_DEBUG1(1, sc, "Newer disk");
1278 			if (mdi->mdio_meta != NULL)
1279 				free(mdi->mdio_meta, M_MD_INTEL);
1280 			mdi->mdio_meta = intel_meta_copy(pdmeta);
1281 			mdi->mdio_generation = mdi->mdio_meta->generation;
1282 			mdi->mdio_disks_present = 1;
1283 		} else if (pdmeta->generation == mdi->mdio_generation) {
1284 			mdi->mdio_disks_present++;
1285 			G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
1286 			    mdi->mdio_disks_present,
1287 			    mdi->mdio_meta->total_disks);
1288 		} else {
1289 			G_RAID_DEBUG1(1, sc, "Older disk");
1290 		}
1291 		/* If we collected all needed disks - start array. */
1292 		if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
1293 			g_raid_md_intel_start(sc);
1294 	}
1295 }
1296 
1297 static void
1298 g_raid_intel_go(void *arg)
1299 {
1300 	struct g_raid_softc *sc;
1301 	struct g_raid_md_object *md;
1302 	struct g_raid_md_intel_object *mdi;
1303 
1304 	sc = arg;
1305 	md = sc->sc_md;
1306 	mdi = (struct g_raid_md_intel_object *)md;
1307 	if (!mdi->mdio_started) {
1308 		G_RAID_DEBUG1(0, sc, "Force array start due to timeout.");
1309 		g_raid_event_send(sc, G_RAID_NODE_E_START, 0);
1310 	}
1311 }
1312 
1313 static int
1314 g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp,
1315     struct g_geom **gp)
1316 {
1317 	struct g_raid_softc *sc;
1318 	struct g_raid_md_intel_object *mdi;
1319 	char name[16];
1320 
1321 	mdi = (struct g_raid_md_intel_object *)md;
1322 	mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random();
1323 	mdi->mdio_generation = 0;
1324 	snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
1325 	sc = g_raid_create_node(mp, name, md);
1326 	if (sc == NULL)
1327 		return (G_RAID_MD_TASTE_FAIL);
1328 	md->mdo_softc = sc;
1329 	*gp = sc->sc_geom;
1330 	return (G_RAID_MD_TASTE_NEW);
1331 }
1332 
/*
 * Fetch the provider's "GEOM::ident" string and copy its last 'serlen'
 * characters into 'serial'.
 *
 * The Linux and ataraid(7) code always stores the last 16 characters of
 * the label in the Intel metadata; this routine generalizes that to N
 * characters.  Labels can be up to 20 characters for SATA drives and up to
 * 251 characters for SAS drives.  Since Intel controllers don't support
 * SAS drives, the local buffer sticks with the SATA limit to stay stack
 * friendly.
 *
 * The copy is done with strncpy(), so 'serial' is zero padded when the
 * label is shorter than 'serlen' and is NOT NUL-terminated when the label
 * has 'serlen' or more characters.
 *
 * Returns 0 on success or the error reported by g_io_getattr().
 */
static int
g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen)
{
	char ident[24];
	int identlen, skip, error;

	identlen = sizeof(ident);
	error = g_io_getattr("GEOM::ident", cp, &identlen, ident);
	if (error != 0)
		return (error);

	/* Skip the leading part when the label is longer than requested. */
	identlen = strlen(ident);
	skip = (identlen > serlen) ? identlen - serlen : 0;

	strncpy(serial, ident + skip, serlen);
	return (0);
}
1359 
1360 static int
1361 g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp,
1362                               struct g_consumer *cp, struct g_geom **gp)
1363 {
1364 	struct g_consumer *rcp;
1365 	struct g_provider *pp;
1366 	struct g_raid_md_intel_object *mdi, *mdi1;
1367 	struct g_raid_softc *sc;
1368 	struct g_raid_disk *disk;
1369 	struct intel_raid_conf *meta;
1370 	struct g_raid_md_intel_perdisk *pd;
1371 	struct g_geom *geom;
1372 	int error, disk_pos, result, spare, len;
1373 	char serial[INTEL_SERIAL_LEN];
1374 	char name[16];
1375 	uint16_t vendor;
1376 
1377 	G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name);
1378 	mdi = (struct g_raid_md_intel_object *)md;
1379 	pp = cp->provider;
1380 
1381 	/* Read metadata from device. */
1382 	meta = NULL;
1383 	vendor = 0xffff;
1384 	disk_pos = 0;
1385 	g_topology_unlock();
1386 	error = g_raid_md_get_label(cp, serial, sizeof(serial));
1387 	if (error != 0) {
1388 		G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).",
1389 		    pp->name, error);
1390 		goto fail2;
1391 	}
1392 	len = 2;
1393 	if (pp->geom->rank == 1)
1394 		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1395 	meta = intel_meta_read(cp);
1396 	g_topology_lock();
1397 	if (meta == NULL) {
1398 		if (g_raid_aggressive_spare) {
1399 			if (vendor != 0x8086) {
1400 				G_RAID_DEBUG(1,
1401 				    "Intel vendor mismatch 0x%04x != 0x8086",
1402 				    vendor);
1403 			} else {
1404 				G_RAID_DEBUG(1,
1405 				    "No Intel metadata, forcing spare.");
1406 				spare = 2;
1407 				goto search;
1408 			}
1409 		}
1410 		return (G_RAID_MD_TASTE_FAIL);
1411 	}
1412 
1413 	/* Check this disk position in obtained metadata. */
1414 	disk_pos = intel_meta_find_disk(meta, serial);
1415 	if (disk_pos < 0) {
1416 		G_RAID_DEBUG(1, "Intel serial '%s' not found", serial);
1417 		goto fail1;
1418 	}
1419 	if (intel_get_disk_sectors(&meta->disk[disk_pos]) !=
1420 	    (pp->mediasize / pp->sectorsize)) {
1421 		G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju",
1422 		    intel_get_disk_sectors(&meta->disk[disk_pos]),
1423 		    (off_t)(pp->mediasize / pp->sectorsize));
1424 		goto fail1;
1425 	}
1426 
1427 	G_RAID_DEBUG(1, "Intel disk position %d", disk_pos);
1428 	spare = meta->disk[disk_pos].flags & INTEL_F_SPARE;
1429 
1430 search:
1431 	/* Search for matching node. */
1432 	sc = NULL;
1433 	mdi1 = NULL;
1434 	LIST_FOREACH(geom, &mp->geom, geom) {
1435 		sc = geom->softc;
1436 		if (sc == NULL)
1437 			continue;
1438 		if (sc->sc_stopping != 0)
1439 			continue;
1440 		if (sc->sc_md->mdo_class != md->mdo_class)
1441 			continue;
1442 		mdi1 = (struct g_raid_md_intel_object *)sc->sc_md;
1443 		if (spare) {
1444 			if (mdi1->mdio_incomplete)
1445 				break;
1446 		} else {
1447 			if (mdi1->mdio_config_id == meta->config_id)
1448 				break;
1449 		}
1450 	}
1451 
1452 	/* Found matching node. */
1453 	if (geom != NULL) {
1454 		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1455 		result = G_RAID_MD_TASTE_EXISTING;
1456 
1457 	} else if (spare) { /* Not found needy node -- left for later. */
1458 		G_RAID_DEBUG(1, "Spare is not needed at this time");
1459 		goto fail1;
1460 
1461 	} else { /* Not found matching node -- create one. */
1462 		result = G_RAID_MD_TASTE_NEW;
1463 		mdi->mdio_config_id = meta->config_id;
1464 		mdi->mdio_orig_config_id = meta->orig_config_id;
1465 		snprintf(name, sizeof(name), "Intel-%08x", meta->config_id);
1466 		sc = g_raid_create_node(mp, name, md);
1467 		md->mdo_softc = sc;
1468 		geom = sc->sc_geom;
1469 		callout_init(&mdi->mdio_start_co, 1);
1470 		callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz,
1471 		    g_raid_intel_go, sc);
1472 		mdi->mdio_rootmount = root_mount_hold("GRAID-Intel");
1473 		G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount);
1474 	}
1475 
1476 	/* There is no return after this point, so we close passed consumer. */
1477 	g_access(cp, -1, 0, 0);
1478 
1479 	rcp = g_new_consumer(geom);
1480 	rcp->flags |= G_CF_DIRECT_RECEIVE;
1481 	g_attach(rcp, pp);
1482 	if (g_access(rcp, 1, 1, 1) != 0)
1483 		; //goto fail1;
1484 
1485 	g_topology_unlock();
1486 	sx_xlock(&sc->sc_lock);
1487 
1488 	pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1489 	pd->pd_meta = meta;
1490 	pd->pd_disk_pos = -1;
1491 	if (spare == 2) {
1492 		memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN);
1493 		intel_set_disk_sectors(&pd->pd_disk_meta,
1494 		    pp->mediasize / pp->sectorsize);
1495 		pd->pd_disk_meta.id = 0;
1496 		pd->pd_disk_meta.flags = INTEL_F_SPARE;
1497 	} else {
1498 		pd->pd_disk_meta = meta->disk[disk_pos];
1499 	}
1500 	disk = g_raid_create_disk(sc);
1501 	disk->d_md_data = (void *)pd;
1502 	disk->d_consumer = rcp;
1503 	rcp->private = disk;
1504 
1505 	g_raid_get_disk_info(disk);
1506 
1507 	g_raid_md_intel_new_disk(disk);
1508 
1509 	sx_xunlock(&sc->sc_lock);
1510 	g_topology_lock();
1511 	*gp = geom;
1512 	return (result);
1513 fail2:
1514 	g_topology_lock();
1515 fail1:
1516 	free(meta, M_MD_INTEL);
1517 	return (G_RAID_MD_TASTE_FAIL);
1518 }
1519 
1520 static int
1521 g_raid_md_event_intel(struct g_raid_md_object *md,
1522     struct g_raid_disk *disk, u_int event)
1523 {
1524 	struct g_raid_softc *sc;
1525 	struct g_raid_subdisk *sd;
1526 	struct g_raid_md_intel_object *mdi;
1527 	struct g_raid_md_intel_perdisk *pd;
1528 
1529 	sc = md->mdo_softc;
1530 	mdi = (struct g_raid_md_intel_object *)md;
1531 	if (disk == NULL) {
1532 		switch (event) {
1533 		case G_RAID_NODE_E_START:
1534 			if (!mdi->mdio_started)
1535 				g_raid_md_intel_start(sc);
1536 			return (0);
1537 		}
1538 		return (-1);
1539 	}
1540 	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1541 	switch (event) {
1542 	case G_RAID_DISK_E_DISCONNECTED:
1543 		/* If disk was assigned, just update statuses. */
1544 		if (pd->pd_disk_pos >= 0) {
1545 			g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1546 			if (disk->d_consumer) {
1547 				g_raid_kill_consumer(sc, disk->d_consumer);
1548 				disk->d_consumer = NULL;
1549 			}
1550 			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
1551 				g_raid_change_subdisk_state(sd,
1552 				    G_RAID_SUBDISK_S_NONE);
1553 				g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
1554 				    G_RAID_EVENT_SUBDISK);
1555 			}
1556 		} else {
1557 			/* Otherwise -- delete. */
1558 			g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1559 			g_raid_destroy_disk(disk);
1560 		}
1561 
1562 		/* Write updated metadata to all disks. */
1563 		g_raid_md_write_intel(md, NULL, NULL, NULL);
1564 
1565 		/* Check if anything left except placeholders. */
1566 		if (g_raid_ndisks(sc, -1) ==
1567 		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
1568 			g_raid_destroy_node(sc, 0);
1569 		else
1570 			g_raid_md_intel_refill(sc);
1571 		return (0);
1572 	}
1573 	return (-2);
1574 }
1575 
1576 static int
1577 g_raid_md_ctl_intel(struct g_raid_md_object *md,
1578     struct gctl_req *req)
1579 {
1580 	struct g_raid_softc *sc;
1581 	struct g_raid_volume *vol, *vol1;
1582 	struct g_raid_subdisk *sd;
1583 	struct g_raid_disk *disk;
1584 	struct g_raid_md_intel_object *mdi;
1585 	struct g_raid_md_intel_pervolume *pv;
1586 	struct g_raid_md_intel_perdisk *pd;
1587 	struct g_consumer *cp;
1588 	struct g_provider *pp;
1589 	char arg[16], serial[INTEL_SERIAL_LEN];
1590 	const char *nodename, *verb, *volname, *levelname, *diskname;
1591 	char *tmp;
1592 	int *nargs, *force;
1593 	off_t off, size, sectorsize, strip, disk_sectors;
1594 	intmax_t *sizearg, *striparg;
1595 	int numdisks, i, len, level, qual, update;
1596 	int error;
1597 
1598 	sc = md->mdo_softc;
1599 	mdi = (struct g_raid_md_intel_object *)md;
1600 	verb = gctl_get_param(req, "verb", NULL);
1601 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1602 	error = 0;
1603 	if (strcmp(verb, "label") == 0) {
1604 
1605 		if (*nargs < 4) {
1606 			gctl_error(req, "Invalid number of arguments.");
1607 			return (-1);
1608 		}
1609 		volname = gctl_get_asciiparam(req, "arg1");
1610 		if (volname == NULL) {
1611 			gctl_error(req, "No volume name.");
1612 			return (-2);
1613 		}
1614 		levelname = gctl_get_asciiparam(req, "arg2");
1615 		if (levelname == NULL) {
1616 			gctl_error(req, "No RAID level.");
1617 			return (-3);
1618 		}
1619 		if (strcasecmp(levelname, "RAID5") == 0)
1620 			levelname = "RAID5-LA";
1621 		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1622 			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1623 			return (-4);
1624 		}
1625 		numdisks = *nargs - 3;
1626 		force = gctl_get_paraml(req, "force", sizeof(*force));
1627 		if (!g_raid_md_intel_supported(level, qual, numdisks,
1628 		    force ? *force : 0)) {
1629 			gctl_error(req, "Unsupported RAID level "
1630 			    "(0x%02x/0x%02x), or number of disks (%d).",
1631 			    level, qual, numdisks);
1632 			return (-5);
1633 		}
1634 
1635 		/* Search for disks, connect them and probe. */
1636 		size = 0x7fffffffffffffffllu;
1637 		sectorsize = 0;
1638 		for (i = 0; i < numdisks; i++) {
1639 			snprintf(arg, sizeof(arg), "arg%d", i + 3);
1640 			diskname = gctl_get_asciiparam(req, arg);
1641 			if (diskname == NULL) {
1642 				gctl_error(req, "No disk name (%s).", arg);
1643 				error = -6;
1644 				break;
1645 			}
1646 			if (strcmp(diskname, "NONE") == 0) {
1647 				cp = NULL;
1648 				pp = NULL;
1649 			} else {
1650 				g_topology_lock();
1651 				cp = g_raid_open_consumer(sc, diskname);
1652 				if (cp == NULL) {
1653 					gctl_error(req, "Can't open disk '%s'.",
1654 					    diskname);
1655 					g_topology_unlock();
1656 					error = -7;
1657 					break;
1658 				}
1659 				pp = cp->provider;
1660 			}
1661 			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1662 			pd->pd_disk_pos = i;
1663 			disk = g_raid_create_disk(sc);
1664 			disk->d_md_data = (void *)pd;
1665 			disk->d_consumer = cp;
1666 			if (cp == NULL) {
1667 				strcpy(&pd->pd_disk_meta.serial[0], "NONE");
1668 				pd->pd_disk_meta.id = 0xffffffff;
1669 				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
1670 				continue;
1671 			}
1672 			cp->private = disk;
1673 			g_topology_unlock();
1674 
1675 			error = g_raid_md_get_label(cp,
1676 			    &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN);
1677 			if (error != 0) {
1678 				gctl_error(req,
1679 				    "Can't get serial for provider '%s'.",
1680 				    diskname);
1681 				error = -8;
1682 				break;
1683 			}
1684 
1685 			g_raid_get_disk_info(disk);
1686 
1687 			intel_set_disk_sectors(&pd->pd_disk_meta,
1688 			    pp->mediasize / pp->sectorsize);
1689 			if (size > pp->mediasize)
1690 				size = pp->mediasize;
1691 			if (sectorsize < pp->sectorsize)
1692 				sectorsize = pp->sectorsize;
1693 			pd->pd_disk_meta.id = 0;
1694 			pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE;
1695 		}
1696 		if (error != 0)
1697 			return (error);
1698 
1699 		if (sectorsize <= 0) {
1700 			gctl_error(req, "Can't get sector size.");
1701 			return (-8);
1702 		}
1703 
1704 		/* Reserve some space for metadata. */
1705 		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1706 
1707 		/* Handle size argument. */
1708 		len = sizeof(*sizearg);
1709 		sizearg = gctl_get_param(req, "size", &len);
1710 		if (sizearg != NULL && len == sizeof(*sizearg) &&
1711 		    *sizearg > 0) {
1712 			if (*sizearg > size) {
1713 				gctl_error(req, "Size too big %lld > %lld.",
1714 				    (long long)*sizearg, (long long)size);
1715 				return (-9);
1716 			}
1717 			size = *sizearg;
1718 		}
1719 
1720 		/* Handle strip argument. */
1721 		strip = 131072;
1722 		len = sizeof(*striparg);
1723 		striparg = gctl_get_param(req, "strip", &len);
1724 		if (striparg != NULL && len == sizeof(*striparg) &&
1725 		    *striparg > 0) {
1726 			if (*striparg < sectorsize) {
1727 				gctl_error(req, "Strip size too small.");
1728 				return (-10);
1729 			}
1730 			if (*striparg % sectorsize != 0) {
1731 				gctl_error(req, "Incorrect strip size.");
1732 				return (-11);
1733 			}
1734 			if (strip > 65535 * sectorsize) {
1735 				gctl_error(req, "Strip size too big.");
1736 				return (-12);
1737 			}
1738 			strip = *striparg;
1739 		}
1740 
1741 		/* Round size down to strip or sector. */
1742 		if (level == G_RAID_VOLUME_RL_RAID1)
1743 			size -= (size % sectorsize);
1744 		else if (level == G_RAID_VOLUME_RL_RAID1E &&
1745 		    (numdisks & 1) != 0)
1746 			size -= (size % (2 * strip));
1747 		else
1748 			size -= (size % strip);
1749 		if (size <= 0) {
1750 			gctl_error(req, "Size too small.");
1751 			return (-13);
1752 		}
1753 
1754 		/* We have all we need, create things: volume, ... */
1755 		mdi->mdio_started = 1;
1756 		vol = g_raid_create_volume(sc, volname, -1);
1757 		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1758 		pv->pv_volume_pos = 0;
1759 		vol->v_md_data = pv;
1760 		vol->v_raid_level = level;
1761 		vol->v_raid_level_qualifier = qual;
1762 		vol->v_strip_size = strip;
1763 		vol->v_disks_count = numdisks;
1764 		if (level == G_RAID_VOLUME_RL_RAID0)
1765 			vol->v_mediasize = size * numdisks;
1766 		else if (level == G_RAID_VOLUME_RL_RAID1)
1767 			vol->v_mediasize = size;
1768 		else if (level == G_RAID_VOLUME_RL_RAID5)
1769 			vol->v_mediasize = size * (numdisks - 1);
1770 		else { /* RAID1E */
1771 			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1772 			    strip;
1773 		}
1774 		vol->v_sectorsize = sectorsize;
1775 		g_raid_start_volume(vol);
1776 
1777 		/* , and subdisks. */
1778 		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1779 			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1780 			sd = &vol->v_subdisks[pd->pd_disk_pos];
1781 			sd->sd_disk = disk;
1782 			sd->sd_offset = 0;
1783 			sd->sd_size = size;
1784 			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1785 			if (sd->sd_disk->d_consumer != NULL) {
1786 				g_raid_change_disk_state(disk,
1787 				    G_RAID_DISK_S_ACTIVE);
1788 				if (level == G_RAID_VOLUME_RL_RAID5)
1789 					g_raid_change_subdisk_state(sd,
1790 					    G_RAID_SUBDISK_S_UNINITIALIZED);
1791 				else
1792 					g_raid_change_subdisk_state(sd,
1793 					    G_RAID_SUBDISK_S_ACTIVE);
1794 				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1795 				    G_RAID_EVENT_SUBDISK);
1796 			} else {
1797 				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1798 			}
1799 		}
1800 
1801 		/* Write metadata based on created entities. */
1802 		G_RAID_DEBUG1(0, sc, "Array started.");
1803 		g_raid_md_write_intel(md, NULL, NULL, NULL);
1804 
1805 		/* Pickup any STALE/SPARE disks to refill array if needed. */
1806 		g_raid_md_intel_refill(sc);
1807 
1808 		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1809 		    G_RAID_EVENT_VOLUME);
1810 		return (0);
1811 	}
1812 	if (strcmp(verb, "add") == 0) {
1813 
1814 		if (*nargs != 3) {
1815 			gctl_error(req, "Invalid number of arguments.");
1816 			return (-1);
1817 		}
1818 		volname = gctl_get_asciiparam(req, "arg1");
1819 		if (volname == NULL) {
1820 			gctl_error(req, "No volume name.");
1821 			return (-2);
1822 		}
1823 		levelname = gctl_get_asciiparam(req, "arg2");
1824 		if (levelname == NULL) {
1825 			gctl_error(req, "No RAID level.");
1826 			return (-3);
1827 		}
1828 		if (strcasecmp(levelname, "RAID5") == 0)
1829 			levelname = "RAID5-LA";
1830 		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1831 			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1832 			return (-4);
1833 		}
1834 
1835 		/* Look for existing volumes. */
1836 		i = 0;
1837 		vol1 = NULL;
1838 		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1839 			vol1 = vol;
1840 			i++;
1841 		}
1842 		if (i > 1) {
1843 			gctl_error(req, "Maximum two volumes supported.");
1844 			return (-6);
1845 		}
1846 		if (vol1 == NULL) {
1847 			gctl_error(req, "At least one volume must exist.");
1848 			return (-7);
1849 		}
1850 
1851 		numdisks = vol1->v_disks_count;
1852 		force = gctl_get_paraml(req, "force", sizeof(*force));
1853 		if (!g_raid_md_intel_supported(level, qual, numdisks,
1854 		    force ? *force : 0)) {
1855 			gctl_error(req, "Unsupported RAID level "
1856 			    "(0x%02x/0x%02x), or number of disks (%d).",
1857 			    level, qual, numdisks);
1858 			return (-5);
1859 		}
1860 
1861 		/* Collect info about present disks. */
1862 		size = 0x7fffffffffffffffllu;
1863 		sectorsize = 512;
1864 		for (i = 0; i < numdisks; i++) {
1865 			disk = vol1->v_subdisks[i].sd_disk;
1866 			pd = (struct g_raid_md_intel_perdisk *)
1867 			    disk->d_md_data;
1868 			disk_sectors =
1869 			    intel_get_disk_sectors(&pd->pd_disk_meta);
1870 
1871 			if (disk_sectors * 512 < size)
1872 				size = disk_sectors * 512;
1873 			if (disk->d_consumer != NULL &&
1874 			    disk->d_consumer->provider != NULL &&
1875 			    disk->d_consumer->provider->sectorsize >
1876 			     sectorsize) {
1877 				sectorsize =
1878 				    disk->d_consumer->provider->sectorsize;
1879 			}
1880 		}
1881 
1882 		/* Reserve some space for metadata. */
1883 		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1884 
1885 		/* Decide insert before or after. */
1886 		sd = &vol1->v_subdisks[0];
1887 		if (sd->sd_offset >
1888 		    size - (sd->sd_offset + sd->sd_size)) {
1889 			off = 0;
1890 			size = sd->sd_offset;
1891 		} else {
1892 			off = sd->sd_offset + sd->sd_size;
1893 			size = size - (sd->sd_offset + sd->sd_size);
1894 		}
1895 
1896 		/* Handle strip argument. */
1897 		strip = 131072;
1898 		len = sizeof(*striparg);
1899 		striparg = gctl_get_param(req, "strip", &len);
1900 		if (striparg != NULL && len == sizeof(*striparg) &&
1901 		    *striparg > 0) {
1902 			if (*striparg < sectorsize) {
1903 				gctl_error(req, "Strip size too small.");
1904 				return (-10);
1905 			}
1906 			if (*striparg % sectorsize != 0) {
1907 				gctl_error(req, "Incorrect strip size.");
1908 				return (-11);
1909 			}
1910 			if (strip > 65535 * sectorsize) {
1911 				gctl_error(req, "Strip size too big.");
1912 				return (-12);
1913 			}
1914 			strip = *striparg;
1915 		}
1916 
1917 		/* Round offset up to strip. */
1918 		if (off % strip != 0) {
1919 			size -= strip - off % strip;
1920 			off += strip - off % strip;
1921 		}
1922 
1923 		/* Handle size argument. */
1924 		len = sizeof(*sizearg);
1925 		sizearg = gctl_get_param(req, "size", &len);
1926 		if (sizearg != NULL && len == sizeof(*sizearg) &&
1927 		    *sizearg > 0) {
1928 			if (*sizearg > size) {
1929 				gctl_error(req, "Size too big %lld > %lld.",
1930 				    (long long)*sizearg, (long long)size);
1931 				return (-9);
1932 			}
1933 			size = *sizearg;
1934 		}
1935 
1936 		/* Round size down to strip or sector. */
1937 		if (level == G_RAID_VOLUME_RL_RAID1)
1938 			size -= (size % sectorsize);
1939 		else
1940 			size -= (size % strip);
1941 		if (size <= 0) {
1942 			gctl_error(req, "Size too small.");
1943 			return (-13);
1944 		}
1945 		if (size > 0xffffffffllu * sectorsize) {
1946 			gctl_error(req, "Size too big.");
1947 			return (-14);
1948 		}
1949 
1950 		/* We have all we need, create things: volume, ... */
1951 		vol = g_raid_create_volume(sc, volname, -1);
1952 		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1953 		pv->pv_volume_pos = i;
1954 		vol->v_md_data = pv;
1955 		vol->v_raid_level = level;
1956 		vol->v_raid_level_qualifier = qual;
1957 		vol->v_strip_size = strip;
1958 		vol->v_disks_count = numdisks;
1959 		if (level == G_RAID_VOLUME_RL_RAID0)
1960 			vol->v_mediasize = size * numdisks;
1961 		else if (level == G_RAID_VOLUME_RL_RAID1)
1962 			vol->v_mediasize = size;
1963 		else if (level == G_RAID_VOLUME_RL_RAID5)
1964 			vol->v_mediasize = size * (numdisks - 1);
1965 		else { /* RAID1E */
1966 			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1967 			    strip;
1968 		}
1969 		vol->v_sectorsize = sectorsize;
1970 		g_raid_start_volume(vol);
1971 
1972 		/* , and subdisks. */
1973 		for (i = 0; i < numdisks; i++) {
1974 			disk = vol1->v_subdisks[i].sd_disk;
1975 			sd = &vol->v_subdisks[i];
1976 			sd->sd_disk = disk;
1977 			sd->sd_offset = off;
1978 			sd->sd_size = size;
1979 			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1980 			if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
1981 				if (level == G_RAID_VOLUME_RL_RAID5)
1982 					g_raid_change_subdisk_state(sd,
1983 					    G_RAID_SUBDISK_S_UNINITIALIZED);
1984 				else
1985 					g_raid_change_subdisk_state(sd,
1986 					    G_RAID_SUBDISK_S_ACTIVE);
1987 				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1988 				    G_RAID_EVENT_SUBDISK);
1989 			}
1990 		}
1991 
1992 		/* Write metadata based on created entities. */
1993 		g_raid_md_write_intel(md, NULL, NULL, NULL);
1994 
1995 		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1996 		    G_RAID_EVENT_VOLUME);
1997 		return (0);
1998 	}
1999 	if (strcmp(verb, "delete") == 0) {
2000 
2001 		nodename = gctl_get_asciiparam(req, "arg0");
2002 		if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
2003 			nodename = NULL;
2004 
2005 		/* Full node destruction. */
2006 		if (*nargs == 1 && nodename != NULL) {
2007 			/* Check if some volume is still open. */
2008 			force = gctl_get_paraml(req, "force", sizeof(*force));
2009 			if (force != NULL && *force == 0 &&
2010 			    g_raid_nopens(sc) != 0) {
2011 				gctl_error(req, "Some volume is still open.");
2012 				return (-4);
2013 			}
2014 
2015 			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2016 				if (disk->d_consumer)
2017 					intel_meta_erase(disk->d_consumer);
2018 			}
2019 			g_raid_destroy_node(sc, 0);
2020 			return (0);
2021 		}
2022 
2023 		/* Destroy specified volume. If it was last - all node. */
2024 		if (*nargs > 2) {
2025 			gctl_error(req, "Invalid number of arguments.");
2026 			return (-1);
2027 		}
2028 		volname = gctl_get_asciiparam(req,
2029 		    nodename != NULL ? "arg1" : "arg0");
2030 		if (volname == NULL) {
2031 			gctl_error(req, "No volume name.");
2032 			return (-2);
2033 		}
2034 
2035 		/* Search for volume. */
2036 		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2037 			if (strcmp(vol->v_name, volname) == 0)
2038 				break;
2039 			pp = vol->v_provider;
2040 			if (pp == NULL)
2041 				continue;
2042 			if (strcmp(pp->name, volname) == 0)
2043 				break;
2044 			if (strncmp(pp->name, "raid/", 5) == 0 &&
2045 			    strcmp(pp->name + 5, volname) == 0)
2046 				break;
2047 		}
2048 		if (vol == NULL) {
2049 			i = strtol(volname, &tmp, 10);
2050 			if (verb != volname && tmp[0] == 0) {
2051 				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2052 					if (vol->v_global_id == i)
2053 						break;
2054 				}
2055 			}
2056 		}
2057 		if (vol == NULL) {
2058 			gctl_error(req, "Volume '%s' not found.", volname);
2059 			return (-3);
2060 		}
2061 
2062 		/* Check if volume is still open. */
2063 		force = gctl_get_paraml(req, "force", sizeof(*force));
2064 		if (force != NULL && *force == 0 &&
2065 		    vol->v_provider_open != 0) {
2066 			gctl_error(req, "Volume is still open.");
2067 			return (-4);
2068 		}
2069 
2070 		/* Destroy volume and potentially node. */
2071 		i = 0;
2072 		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
2073 			i++;
2074 		if (i >= 2) {
2075 			g_raid_destroy_volume(vol);
2076 			g_raid_md_write_intel(md, NULL, NULL, NULL);
2077 		} else {
2078 			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2079 				if (disk->d_consumer)
2080 					intel_meta_erase(disk->d_consumer);
2081 			}
2082 			g_raid_destroy_node(sc, 0);
2083 		}
2084 		return (0);
2085 	}
2086 	if (strcmp(verb, "remove") == 0 ||
2087 	    strcmp(verb, "fail") == 0) {
2088 		if (*nargs < 2) {
2089 			gctl_error(req, "Invalid number of arguments.");
2090 			return (-1);
2091 		}
2092 		for (i = 1; i < *nargs; i++) {
2093 			snprintf(arg, sizeof(arg), "arg%d", i);
2094 			diskname = gctl_get_asciiparam(req, arg);
2095 			if (diskname == NULL) {
2096 				gctl_error(req, "No disk name (%s).", arg);
2097 				error = -2;
2098 				break;
2099 			}
2100 			if (strncmp(diskname, "/dev/", 5) == 0)
2101 				diskname += 5;
2102 
2103 			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2104 				if (disk->d_consumer != NULL &&
2105 				    disk->d_consumer->provider != NULL &&
2106 				    strcmp(disk->d_consumer->provider->name,
2107 				     diskname) == 0)
2108 					break;
2109 			}
2110 			if (disk == NULL) {
2111 				gctl_error(req, "Disk '%s' not found.",
2112 				    diskname);
2113 				error = -3;
2114 				break;
2115 			}
2116 
2117 			if (strcmp(verb, "fail") == 0) {
2118 				g_raid_md_fail_disk_intel(md, NULL, disk);
2119 				continue;
2120 			}
2121 
2122 			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2123 
2124 			/* Erase metadata on deleting disk. */
2125 			intel_meta_erase(disk->d_consumer);
2126 
2127 			/* If disk was assigned, just update statuses. */
2128 			if (pd->pd_disk_pos >= 0) {
2129 				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
2130 				g_raid_kill_consumer(sc, disk->d_consumer);
2131 				disk->d_consumer = NULL;
2132 				TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2133 					g_raid_change_subdisk_state(sd,
2134 					    G_RAID_SUBDISK_S_NONE);
2135 					g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2136 					    G_RAID_EVENT_SUBDISK);
2137 				}
2138 			} else {
2139 				/* Otherwise -- delete. */
2140 				g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
2141 				g_raid_destroy_disk(disk);
2142 			}
2143 		}
2144 
2145 		/* Write updated metadata to remaining disks. */
2146 		g_raid_md_write_intel(md, NULL, NULL, NULL);
2147 
2148 		/* Check if anything left except placeholders. */
2149 		if (g_raid_ndisks(sc, -1) ==
2150 		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2151 			g_raid_destroy_node(sc, 0);
2152 		else
2153 			g_raid_md_intel_refill(sc);
2154 		return (error);
2155 	}
2156 	if (strcmp(verb, "insert") == 0) {
2157 		if (*nargs < 2) {
2158 			gctl_error(req, "Invalid number of arguments.");
2159 			return (-1);
2160 		}
2161 		update = 0;
2162 		for (i = 1; i < *nargs; i++) {
2163 			/* Get disk name. */
2164 			snprintf(arg, sizeof(arg), "arg%d", i);
2165 			diskname = gctl_get_asciiparam(req, arg);
2166 			if (diskname == NULL) {
2167 				gctl_error(req, "No disk name (%s).", arg);
2168 				error = -3;
2169 				break;
2170 			}
2171 
2172 			/* Try to find provider with specified name. */
2173 			g_topology_lock();
2174 			cp = g_raid_open_consumer(sc, diskname);
2175 			if (cp == NULL) {
2176 				gctl_error(req, "Can't open disk '%s'.",
2177 				    diskname);
2178 				g_topology_unlock();
2179 				error = -4;
2180 				break;
2181 			}
2182 			pp = cp->provider;
2183 			g_topology_unlock();
2184 
2185 			/* Read disk serial. */
2186 			error = g_raid_md_get_label(cp,
2187 			    &serial[0], INTEL_SERIAL_LEN);
2188 			if (error != 0) {
2189 				gctl_error(req,
2190 				    "Can't get serial for provider '%s'.",
2191 				    diskname);
2192 				g_raid_kill_consumer(sc, cp);
2193 				error = -7;
2194 				break;
2195 			}
2196 
2197 			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
2198 			pd->pd_disk_pos = -1;
2199 
2200 			disk = g_raid_create_disk(sc);
2201 			disk->d_consumer = cp;
2202 			disk->d_md_data = (void *)pd;
2203 			cp->private = disk;
2204 
2205 			g_raid_get_disk_info(disk);
2206 
2207 			memcpy(&pd->pd_disk_meta.serial[0], &serial[0],
2208 			    INTEL_SERIAL_LEN);
2209 			intel_set_disk_sectors(&pd->pd_disk_meta,
2210 			    pp->mediasize / pp->sectorsize);
2211 			pd->pd_disk_meta.id = 0;
2212 			pd->pd_disk_meta.flags = INTEL_F_SPARE;
2213 
2214 			/* Welcome the "new" disk. */
2215 			update += g_raid_md_intel_start_disk(disk);
2216 			if (disk->d_state == G_RAID_DISK_S_SPARE) {
2217 				intel_meta_write_spare(cp, &pd->pd_disk_meta);
2218 				g_raid_destroy_disk(disk);
2219 			} else if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2220 				gctl_error(req, "Disk '%s' doesn't fit.",
2221 				    diskname);
2222 				g_raid_destroy_disk(disk);
2223 				error = -8;
2224 				break;
2225 			}
2226 		}
2227 
2228 		/* Write new metadata if we changed something. */
2229 		if (update)
2230 			g_raid_md_write_intel(md, NULL, NULL, NULL);
2231 		return (error);
2232 	}
2233 	return (-100);
2234 }
2235 
2236 static int
2237 g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol,
2238     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2239 {
2240 	struct g_raid_softc *sc;
2241 	struct g_raid_volume *vol;
2242 	struct g_raid_subdisk *sd;
2243 	struct g_raid_disk *disk;
2244 	struct g_raid_md_intel_object *mdi;
2245 	struct g_raid_md_intel_pervolume *pv;
2246 	struct g_raid_md_intel_perdisk *pd;
2247 	struct intel_raid_conf *meta;
2248 	struct intel_raid_vol *mvol;
2249 	struct intel_raid_map *mmap0, *mmap1;
2250 	off_t sectorsize = 512, pos;
2251 	const char *version, *cv;
2252 	int vi, sdi, numdisks, len, state, stale;
2253 
2254 	sc = md->mdo_softc;
2255 	mdi = (struct g_raid_md_intel_object *)md;
2256 
2257 	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2258 		return (0);
2259 
2260 	/* Bump generation. Newly written metadata may differ from previous. */
2261 	mdi->mdio_generation++;
2262 
2263 	/* Count number of disks. */
2264 	numdisks = 0;
2265 	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2266 		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2267 		if (pd->pd_disk_pos < 0)
2268 			continue;
2269 		numdisks++;
2270 		if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
2271 			pd->pd_disk_meta.flags =
2272 			    INTEL_F_ONLINE | INTEL_F_ASSIGNED;
2273 		} else if (disk->d_state == G_RAID_DISK_S_FAILED) {
2274 			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2275 			    INTEL_F_ASSIGNED;
2276 		} else if (disk->d_state == G_RAID_DISK_S_DISABLED) {
2277 			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2278 			    INTEL_F_ASSIGNED | INTEL_F_DISABLED;
2279 		} else {
2280 			if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED))
2281 				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
2282 			if (pd->pd_disk_meta.id != 0xffffffff) {
2283 				pd->pd_disk_meta.id = 0xffffffff;
2284 				len = strlen(pd->pd_disk_meta.serial);
2285 				len = min(len, INTEL_SERIAL_LEN - 3);
2286 				strcpy(pd->pd_disk_meta.serial + len, ":0");
2287 			}
2288 		}
2289 	}
2290 
2291 	/* Fill anchor and disks. */
2292 	meta = malloc(INTEL_MAX_MD_SIZE(numdisks),
2293 	    M_MD_INTEL, M_WAITOK | M_ZERO);
2294 	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
2295 	meta->config_size = INTEL_MAX_MD_SIZE(numdisks);
2296 	meta->config_id = mdi->mdio_config_id;
2297 	meta->orig_config_id = mdi->mdio_orig_config_id;
2298 	meta->generation = mdi->mdio_generation;
2299 	meta->attributes = INTEL_ATTR_CHECKSUM;
2300 	meta->total_disks = numdisks;
2301 	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2302 		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2303 		if (pd->pd_disk_pos < 0)
2304 			continue;
2305 		meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta;
2306 		if (pd->pd_disk_meta.sectors_hi != 0)
2307 			meta->attributes |= INTEL_ATTR_2TB_DISK;
2308 	}
2309 
2310 	/* Fill volumes and maps. */
2311 	vi = 0;
2312 	version = INTEL_VERSION_1000;
2313 	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2314 		pv = vol->v_md_data;
2315 		if (vol->v_stopping)
2316 			continue;
2317 		mvol = intel_get_volume(meta, vi);
2318 
2319 		/* New metadata may have different volumes order. */
2320 		pv->pv_volume_pos = vi;
2321 
2322 		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2323 			sd = &vol->v_subdisks[sdi];
2324 			if (sd->sd_disk != NULL)
2325 				break;
2326 		}
2327 		if (sdi >= vol->v_disks_count)
2328 			panic("No any filled subdisk in volume");
2329 		if (vol->v_mediasize >= 0x20000000000llu)
2330 			meta->attributes |= INTEL_ATTR_2TB;
2331 		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2332 			meta->attributes |= INTEL_ATTR_RAID0;
2333 		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2334 			meta->attributes |= INTEL_ATTR_RAID1;
2335 		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2336 			meta->attributes |= INTEL_ATTR_RAID5;
2337 		else if ((vol->v_disks_count & 1) == 0)
2338 			meta->attributes |= INTEL_ATTR_RAID10;
2339 		else
2340 			meta->attributes |= INTEL_ATTR_RAID1E;
2341 		if (pv->pv_cng)
2342 			meta->attributes |= INTEL_ATTR_RAIDCNG;
2343 		if (vol->v_strip_size > 131072)
2344 			meta->attributes |= INTEL_ATTR_EXT_STRIP;
2345 
2346 		if (pv->pv_cng)
2347 			cv = INTEL_VERSION_1206;
2348 		else if (vol->v_disks_count > 4)
2349 			cv = INTEL_VERSION_1204;
2350 		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2351 			cv = INTEL_VERSION_1202;
2352 		else if (vol->v_disks_count > 2)
2353 			cv = INTEL_VERSION_1201;
2354 		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2355 			cv = INTEL_VERSION_1100;
2356 		else
2357 			cv = INTEL_VERSION_1000;
2358 		if (strcmp(cv, version) > 0)
2359 			version = cv;
2360 
2361 		strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name));
2362 		mvol->total_sectors = vol->v_mediasize / sectorsize;
2363 		mvol->state = (INTEL_ST_READ_COALESCING |
2364 		    INTEL_ST_WRITE_COALESCING);
2365 		mvol->tid = vol->v_global_id + 1;
2366 		if (pv->pv_cng) {
2367 			mvol->state |= INTEL_ST_CLONE_N_GO;
2368 			if (pv->pv_cng_man_sync)
2369 				mvol->state |= INTEL_ST_CLONE_MAN_SYNC;
2370 			mvol->cng_master_disk = pv->pv_cng_master_disk;
2371 			if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state ==
2372 			    G_RAID_SUBDISK_S_NONE)
2373 				mvol->cng_state = INTEL_CNGST_MASTER_MISSING;
2374 			else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
2375 				mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE;
2376 			else
2377 				mvol->cng_state = INTEL_CNGST_UPDATED;
2378 		}
2379 
2380 		/* Check for any recovery in progress. */
2381 		state = G_RAID_SUBDISK_S_ACTIVE;
2382 		pos = 0x7fffffffffffffffllu;
2383 		stale = 0;
2384 		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2385 			sd = &vol->v_subdisks[sdi];
2386 			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD)
2387 				state = G_RAID_SUBDISK_S_REBUILD;
2388 			else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC &&
2389 			    state != G_RAID_SUBDISK_S_REBUILD)
2390 				state = G_RAID_SUBDISK_S_RESYNC;
2391 			else if (sd->sd_state == G_RAID_SUBDISK_S_STALE)
2392 				stale = 1;
2393 			if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2394 			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
2395 			     sd->sd_rebuild_pos < pos)
2396 			        pos = sd->sd_rebuild_pos;
2397 		}
2398 		if (state == G_RAID_SUBDISK_S_REBUILD) {
2399 			mvol->migr_state = 1;
2400 			mvol->migr_type = INTEL_MT_REBUILD;
2401 		} else if (state == G_RAID_SUBDISK_S_RESYNC) {
2402 			mvol->migr_state = 1;
2403 			/* mvol->migr_type = INTEL_MT_REPAIR; */
2404 			mvol->migr_type = INTEL_MT_VERIFY;
2405 			mvol->state |= INTEL_ST_VERIFY_AND_FIX;
2406 		} else
2407 			mvol->migr_state = 0;
2408 		mvol->dirty = (vol->v_dirty || stale);
2409 
2410 		mmap0 = intel_get_map(mvol, 0);
2411 
2412 		/* Write map / common part of two maps. */
2413 		intel_set_map_offset(mmap0, sd->sd_offset / sectorsize);
2414 		intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize);
2415 		mmap0->strip_sectors = vol->v_strip_size / sectorsize;
2416 		if (vol->v_state == G_RAID_VOLUME_S_BROKEN)
2417 			mmap0->status = INTEL_S_FAILURE;
2418 		else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED)
2419 			mmap0->status = INTEL_S_DEGRADED;
2420 		else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED)
2421 		    == g_raid_nsubdisks(vol, -1))
2422 			mmap0->status = INTEL_S_UNINITIALIZED;
2423 		else
2424 			mmap0->status = INTEL_S_READY;
2425 		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2426 			mmap0->type = INTEL_T_RAID0;
2427 		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
2428 		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2429 			mmap0->type = INTEL_T_RAID1;
2430 		else
2431 			mmap0->type = INTEL_T_RAID5;
2432 		mmap0->total_disks = vol->v_disks_count;
2433 		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2434 			mmap0->total_domains = vol->v_disks_count;
2435 		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2436 			mmap0->total_domains = 2;
2437 		else
2438 			mmap0->total_domains = 1;
2439 		intel_set_map_stripe_count(mmap0,
2440 		    sd->sd_size / vol->v_strip_size / mmap0->total_domains);
2441 		mmap0->failed_disk_num = 0xff;
2442 		mmap0->ddf = 1;
2443 
2444 		/* If there are two maps - copy common and update. */
2445 		if (mvol->migr_state) {
2446 			intel_set_vol_curr_migr_unit(mvol,
2447 			    pos / vol->v_strip_size / mmap0->total_domains);
2448 			mmap1 = intel_get_map(mvol, 1);
2449 			memcpy(mmap1, mmap0, sizeof(struct intel_raid_map));
2450 			mmap0->status = INTEL_S_READY;
2451 		} else
2452 			mmap1 = NULL;
2453 
2454 		/* Write disk indexes and put rebuild flags. */
2455 		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2456 			sd = &vol->v_subdisks[sdi];
2457 			pd = (struct g_raid_md_intel_perdisk *)
2458 			    sd->sd_disk->d_md_data;
2459 			mmap0->disk_idx[sdi] = pd->pd_disk_pos;
2460 			if (mvol->migr_state)
2461 				mmap1->disk_idx[sdi] = pd->pd_disk_pos;
2462 			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2463 			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2464 				mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2465 			} else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
2466 			    sd->sd_state != G_RAID_SUBDISK_S_STALE &&
2467 			    sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) {
2468 				mmap0->disk_idx[sdi] |= INTEL_DI_RBLD;
2469 				if (mvol->migr_state)
2470 					mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2471 			}
2472 			if ((sd->sd_state == G_RAID_SUBDISK_S_NONE ||
2473 			     sd->sd_state == G_RAID_SUBDISK_S_FAILED ||
2474 			     sd->sd_state == G_RAID_SUBDISK_S_REBUILD) &&
2475 			    mmap0->failed_disk_num == 0xff) {
2476 				mmap0->failed_disk_num = sdi;
2477 				if (mvol->migr_state)
2478 					mmap1->failed_disk_num = sdi;
2479 			}
2480 		}
2481 		vi++;
2482 	}
2483 	meta->total_volumes = vi;
2484 	if (vi > 1 || meta->attributes &
2485 	     (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB))
2486 		version = INTEL_VERSION_1300;
2487 	if (strcmp(version, INTEL_VERSION_1300) < 0)
2488 		meta->attributes &= INTEL_ATTR_CHECKSUM;
2489 	memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1);
2490 
2491 	/* We are done. Print meta data and store them to disks. */
2492 	g_raid_md_intel_print(meta);
2493 	if (mdi->mdio_meta != NULL)
2494 		free(mdi->mdio_meta, M_MD_INTEL);
2495 	mdi->mdio_meta = meta;
2496 	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2497 		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2498 		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
2499 			continue;
2500 		if (pd->pd_meta != NULL) {
2501 			free(pd->pd_meta, M_MD_INTEL);
2502 			pd->pd_meta = NULL;
2503 		}
2504 		pd->pd_meta = intel_meta_copy(meta);
2505 		intel_meta_write(disk->d_consumer, meta);
2506 	}
2507 	return (0);
2508 }
2509 
2510 static int
2511 g_raid_md_fail_disk_intel(struct g_raid_md_object *md,
2512     struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2513 {
2514 	struct g_raid_softc *sc;
2515 	struct g_raid_md_intel_object *mdi;
2516 	struct g_raid_md_intel_perdisk *pd;
2517 	struct g_raid_subdisk *sd;
2518 
2519 	sc = md->mdo_softc;
2520 	mdi = (struct g_raid_md_intel_object *)md;
2521 	pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data;
2522 
2523 	/* We can't fail disk that is not a part of array now. */
2524 	if (pd->pd_disk_pos < 0)
2525 		return (-1);
2526 
2527 	/*
2528 	 * Mark disk as failed in metadata and try to write that metadata
2529 	 * to the disk itself to prevent it's later resurrection as STALE.
2530 	 */
2531 	mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
2532 	pd->pd_disk_meta.flags = INTEL_F_FAILED;
2533 	g_raid_md_intel_print(mdi->mdio_meta);
2534 	if (tdisk->d_consumer)
2535 		intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);
2536 
2537 	/* Change states. */
2538 	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
2539 	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
2540 		g_raid_change_subdisk_state(sd,
2541 		    G_RAID_SUBDISK_S_FAILED);
2542 		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
2543 		    G_RAID_EVENT_SUBDISK);
2544 	}
2545 
2546 	/* Write updated metadata to remaining disks. */
2547 	g_raid_md_write_intel(md, NULL, NULL, tdisk);
2548 
2549 	/* Check if anything left except placeholders. */
2550 	if (g_raid_ndisks(sc, -1) ==
2551 	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2552 		g_raid_destroy_node(sc, 0);
2553 	else
2554 		g_raid_md_intel_refill(sc);
2555 	return (0);
2556 }
2557 
2558 static int
2559 g_raid_md_free_disk_intel(struct g_raid_md_object *md,
2560     struct g_raid_disk *disk)
2561 {
2562 	struct g_raid_md_intel_perdisk *pd;
2563 
2564 	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2565 	if (pd->pd_meta != NULL) {
2566 		free(pd->pd_meta, M_MD_INTEL);
2567 		pd->pd_meta = NULL;
2568 	}
2569 	free(pd, M_MD_INTEL);
2570 	disk->d_md_data = NULL;
2571 	return (0);
2572 }
2573 
2574 static int
2575 g_raid_md_free_volume_intel(struct g_raid_md_object *md,
2576     struct g_raid_volume *vol)
2577 {
2578 	struct g_raid_md_intel_pervolume *pv;
2579 
2580 	pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data;
2581 	free(pv, M_MD_INTEL);
2582 	vol->v_md_data = NULL;
2583 	return (0);
2584 }
2585 
2586 static int
2587 g_raid_md_free_intel(struct g_raid_md_object *md)
2588 {
2589 	struct g_raid_md_intel_object *mdi;
2590 
2591 	mdi = (struct g_raid_md_intel_object *)md;
2592 	if (!mdi->mdio_started) {
2593 		mdi->mdio_started = 0;
2594 		callout_stop(&mdi->mdio_start_co);
2595 		G_RAID_DEBUG1(1, md->mdo_softc,
2596 		    "root_mount_rel %p", mdi->mdio_rootmount);
2597 		root_mount_rel(mdi->mdio_rootmount);
2598 		mdi->mdio_rootmount = NULL;
2599 	}
2600 	if (mdi->mdio_meta != NULL) {
2601 		free(mdi->mdio_meta, M_MD_INTEL);
2602 		mdi->mdio_meta = NULL;
2603 	}
2604 	return (0);
2605 }
2606 
/*
 * Declare the "intel" metadata class with the display name "Intel".
 * NOTE(review): G_RAID_MD_DECLARE is defined outside this file (presumably
 * in geom/raid/g_raid.h); confirm exactly what it registers.
 */
G_RAID_MD_DECLARE(intel, "Intel");
2608