1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright 2023 Red Hat
4 */
5
6 #include <linux/atomic.h>
7 #include <linux/bitops.h>
8 #include <linux/completion.h>
9 #include <linux/delay.h>
10 #include <linux/device-mapper.h>
11 #include <linux/err.h>
12 #include <linux/module.h>
13 #include <linux/mutex.h>
14 #include <linux/spinlock.h>
15
16 #include "admin-state.h"
17 #include "block-map.h"
18 #include "completion.h"
19 #include "constants.h"
20 #include "data-vio.h"
21 #include "dedupe.h"
22 #include "dump.h"
23 #include "encodings.h"
24 #include "errors.h"
25 #include "flush.h"
26 #include "io-submitter.h"
27 #include "logger.h"
28 #include "memory-alloc.h"
29 #include "message-stats.h"
30 #include "recovery-journal.h"
31 #include "repair.h"
32 #include "slab-depot.h"
33 #include "status-codes.h"
34 #include "string-utils.h"
35 #include "thread-device.h"
36 #include "thread-registry.h"
37 #include "thread-utils.h"
38 #include "types.h"
39 #include "vdo.h"
40 #include "vio.h"
41
42 enum admin_phases {
43 GROW_LOGICAL_PHASE_START,
44 GROW_LOGICAL_PHASE_GROW_BLOCK_MAP,
45 GROW_LOGICAL_PHASE_END,
46 GROW_LOGICAL_PHASE_ERROR,
47 GROW_PHYSICAL_PHASE_START,
48 GROW_PHYSICAL_PHASE_COPY_SUMMARY,
49 GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS,
50 GROW_PHYSICAL_PHASE_USE_NEW_SLABS,
51 GROW_PHYSICAL_PHASE_END,
52 GROW_PHYSICAL_PHASE_ERROR,
53 LOAD_PHASE_START,
54 LOAD_PHASE_LOAD_DEPOT,
55 LOAD_PHASE_MAKE_DIRTY,
56 LOAD_PHASE_PREPARE_TO_ALLOCATE,
57 LOAD_PHASE_SCRUB_SLABS,
58 LOAD_PHASE_DATA_REDUCTION,
59 LOAD_PHASE_FINISHED,
60 LOAD_PHASE_DRAIN_JOURNAL,
61 LOAD_PHASE_WAIT_FOR_READ_ONLY,
62 PRE_LOAD_PHASE_START,
63 PRE_LOAD_PHASE_LOAD_COMPONENTS,
64 PRE_LOAD_PHASE_END,
65 PREPARE_GROW_PHYSICAL_PHASE_START,
66 RESUME_PHASE_START,
67 RESUME_PHASE_ALLOW_READ_ONLY_MODE,
68 RESUME_PHASE_DEDUPE,
69 RESUME_PHASE_DEPOT,
70 RESUME_PHASE_JOURNAL,
71 RESUME_PHASE_BLOCK_MAP,
72 RESUME_PHASE_LOGICAL_ZONES,
73 RESUME_PHASE_PACKER,
74 RESUME_PHASE_FLUSHER,
75 RESUME_PHASE_DATA_VIOS,
76 RESUME_PHASE_END,
77 SUSPEND_PHASE_START,
78 SUSPEND_PHASE_PACKER,
79 SUSPEND_PHASE_DATA_VIOS,
80 SUSPEND_PHASE_DEDUPE,
81 SUSPEND_PHASE_FLUSHES,
82 SUSPEND_PHASE_LOGICAL_ZONES,
83 SUSPEND_PHASE_BLOCK_MAP,
84 SUSPEND_PHASE_JOURNAL,
85 SUSPEND_PHASE_DEPOT,
86 SUSPEND_PHASE_READ_ONLY_WAIT,
87 SUSPEND_PHASE_WRITE_SUPER_BLOCK,
88 SUSPEND_PHASE_END,
89 };
90
91 static const char * const ADMIN_PHASE_NAMES[] = {
92 "GROW_LOGICAL_PHASE_START",
93 "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP",
94 "GROW_LOGICAL_PHASE_END",
95 "GROW_LOGICAL_PHASE_ERROR",
96 "GROW_PHYSICAL_PHASE_START",
97 "GROW_PHYSICAL_PHASE_COPY_SUMMARY",
98 "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS",
99 "GROW_PHYSICAL_PHASE_USE_NEW_SLABS",
100 "GROW_PHYSICAL_PHASE_END",
101 "GROW_PHYSICAL_PHASE_ERROR",
102 "LOAD_PHASE_START",
103 "LOAD_PHASE_LOAD_DEPOT",
104 "LOAD_PHASE_MAKE_DIRTY",
105 "LOAD_PHASE_PREPARE_TO_ALLOCATE",
106 "LOAD_PHASE_SCRUB_SLABS",
107 "LOAD_PHASE_DATA_REDUCTION",
108 "LOAD_PHASE_FINISHED",
109 "LOAD_PHASE_DRAIN_JOURNAL",
110 "LOAD_PHASE_WAIT_FOR_READ_ONLY",
111 "PRE_LOAD_PHASE_START",
112 "PRE_LOAD_PHASE_LOAD_COMPONENTS",
113 "PRE_LOAD_PHASE_END",
114 "PREPARE_GROW_PHYSICAL_PHASE_START",
115 "RESUME_PHASE_START",
116 "RESUME_PHASE_ALLOW_READ_ONLY_MODE",
117 "RESUME_PHASE_DEDUPE",
118 "RESUME_PHASE_DEPOT",
119 "RESUME_PHASE_JOURNAL",
120 "RESUME_PHASE_BLOCK_MAP",
121 "RESUME_PHASE_LOGICAL_ZONES",
122 "RESUME_PHASE_PACKER",
123 "RESUME_PHASE_FLUSHER",
124 "RESUME_PHASE_DATA_VIOS",
125 "RESUME_PHASE_END",
126 "SUSPEND_PHASE_START",
127 "SUSPEND_PHASE_PACKER",
128 "SUSPEND_PHASE_DATA_VIOS",
129 "SUSPEND_PHASE_DEDUPE",
130 "SUSPEND_PHASE_FLUSHES",
131 "SUSPEND_PHASE_LOGICAL_ZONES",
132 "SUSPEND_PHASE_BLOCK_MAP",
133 "SUSPEND_PHASE_JOURNAL",
134 "SUSPEND_PHASE_DEPOT",
135 "SUSPEND_PHASE_READ_ONLY_WAIT",
136 "SUSPEND_PHASE_WRITE_SUPER_BLOCK",
137 "SUSPEND_PHASE_END",
138 };
139
140 /* If we bump this, update the arrays below */
141 #define TABLE_VERSION 4
142
143 /* arrays for handling different table versions */
144 static const u8 REQUIRED_ARGC[] = { 10, 12, 9, 7, 6 };
145 /* pool name no longer used. only here for verification of older versions */
146 static const u8 POOL_NAME_ARG_INDEX[] = { 8, 10, 8 };
147
148 /*
149 * Track in-use instance numbers using a flat bit array.
150 *
151 * O(n) run time isn't ideal, but if we have 1000 VDO devices in use simultaneously we still only
152 * need to scan 16 words, so it's not likely to be a big deal compared to other resource usage.
153 */
154
155 /*
156 * This minimum size for the bit array creates a numbering space of 0-999, which allows
157 * successive starts of the same volume to have different instance numbers in any
158 * reasonably-sized test. Changing instances on restart allows vdoMonReport to detect that
159 * the ephemeral stats have reset to zero.
160 */
161 #define BIT_COUNT_MINIMUM 1000
162 /* Grow the bit array by this many bits when needed */
163 #define BIT_COUNT_INCREMENT 100
164
165 struct instance_tracker {
166 unsigned int bit_count;
167 unsigned long *words;
168 unsigned int count;
169 unsigned int next;
170 };
171
172 static DEFINE_MUTEX(instances_lock);
173 static struct instance_tracker instances;
174
175 /**
176 * free_device_config() - Free a device config created by parse_device_config().
177 * @config: The config to free.
178 */
free_device_config(struct device_config * config)179 static void free_device_config(struct device_config *config)
180 {
181 if (config == NULL)
182 return;
183
184 if (config->owned_device != NULL)
185 dm_put_device(config->owning_target, config->owned_device);
186
187 vdo_free(config->parent_device_name);
188 vdo_free(config->original_string);
189
190 /* Reduce the chance a use-after-free (as in BZ 1669960) happens to work. */
191 memset(config, 0, sizeof(*config));
192 vdo_free(config);
193 }
194
195 /**
196 * get_version_number() - Decide the version number from argv.
197 *
198 * @argc: The number of table values.
199 * @argv: The array of table values.
200 * @error_ptr: A pointer to return a error string in.
201 * @version_ptr: A pointer to return the version.
202 *
203 * Return: VDO_SUCCESS or an error code.
204 */
get_version_number(int argc,char ** argv,char ** error_ptr,unsigned int * version_ptr)205 static int get_version_number(int argc, char **argv, char **error_ptr,
206 unsigned int *version_ptr)
207 {
208 /* version, if it exists, is in a form of V<n> */
209 if (sscanf(argv[0], "V%u", version_ptr) == 1) {
210 if (*version_ptr < 1 || *version_ptr > TABLE_VERSION) {
211 *error_ptr = "Unknown version number detected";
212 return VDO_BAD_CONFIGURATION;
213 }
214 } else {
215 /* V0 actually has no version number in the table string */
216 *version_ptr = 0;
217 }
218
219 /*
220 * V0 and V1 have no optional parameters. There will always be a parameter for thread
221 * config, even if it's a "." to show it's an empty list.
222 */
223 if (*version_ptr <= 1) {
224 if (argc != REQUIRED_ARGC[*version_ptr]) {
225 *error_ptr = "Incorrect number of arguments for version";
226 return VDO_BAD_CONFIGURATION;
227 }
228 } else if (argc < REQUIRED_ARGC[*version_ptr]) {
229 *error_ptr = "Incorrect number of arguments for version";
230 return VDO_BAD_CONFIGURATION;
231 }
232
233 if (*version_ptr != TABLE_VERSION) {
234 vdo_log_warning("Detected version mismatch between kernel module and tools kernel: %d, tool: %d",
235 TABLE_VERSION, *version_ptr);
236 vdo_log_warning("Please consider upgrading management tools to match kernel.");
237 }
238 return VDO_SUCCESS;
239 }
240
241 /* Free a list of non-NULL string pointers, and then the list itself. */
free_string_array(char ** string_array)242 static void free_string_array(char **string_array)
243 {
244 unsigned int offset;
245
246 for (offset = 0; string_array[offset] != NULL; offset++)
247 vdo_free(string_array[offset]);
248 vdo_free(string_array);
249 }
250
251 /*
252 * Split the input string into substrings, separated at occurrences of the indicated character,
253 * returning a null-terminated list of string pointers.
254 *
255 * The string pointers and the pointer array itself should both be freed with vdo_free() when no
256 * longer needed. This can be done with vdo_free_string_array (below) if the pointers in the array
257 * are not changed. Since the array and copied strings are allocated by this function, it may only
258 * be used in contexts where allocation is permitted.
259 *
260 * Empty substrings are not ignored; that is, returned substrings may be empty strings if the
261 * separator occurs twice in a row.
262 */
split_string(const char * string,char separator,char *** substring_array_ptr)263 static int split_string(const char *string, char separator, char ***substring_array_ptr)
264 {
265 unsigned int current_substring = 0, substring_count = 1;
266 const char *s;
267 char **substrings;
268 int result;
269 ptrdiff_t length;
270
271 for (s = string; *s != 0; s++) {
272 if (*s == separator)
273 substring_count++;
274 }
275
276 result = vdo_allocate(substring_count + 1, char *, "string-splitting array",
277 &substrings);
278 if (result != VDO_SUCCESS)
279 return result;
280
281 for (s = string; *s != 0; s++) {
282 if (*s == separator) {
283 ptrdiff_t length = s - string;
284
285 result = vdo_allocate(length + 1, char, "split string",
286 &substrings[current_substring]);
287 if (result != VDO_SUCCESS) {
288 free_string_array(substrings);
289 return result;
290 }
291 /*
292 * Trailing NUL is already in place after allocation; deal with the zero or
293 * more non-NUL bytes in the string.
294 */
295 if (length > 0)
296 memcpy(substrings[current_substring], string, length);
297 string = s + 1;
298 current_substring++;
299 BUG_ON(current_substring >= substring_count);
300 }
301 }
302 /* Process final string, with no trailing separator. */
303 BUG_ON(current_substring != (substring_count - 1));
304 length = strlen(string);
305
306 result = vdo_allocate(length + 1, char, "split string",
307 &substrings[current_substring]);
308 if (result != VDO_SUCCESS) {
309 free_string_array(substrings);
310 return result;
311 }
312 memcpy(substrings[current_substring], string, length);
313 current_substring++;
314 /* substrings[current_substring] is NULL already */
315 *substring_array_ptr = substrings;
316 return VDO_SUCCESS;
317 }
318
319 /*
320 * Join the input substrings into one string, joined with the indicated character, returning a
321 * string. array_length is a bound on the number of valid elements in substring_array, in case it
322 * is not NULL-terminated.
323 */
join_strings(char ** substring_array,size_t array_length,char separator,char ** string_ptr)324 static int join_strings(char **substring_array, size_t array_length, char separator,
325 char **string_ptr)
326 {
327 size_t string_length = 0;
328 size_t i;
329 int result;
330 char *output, *current_position;
331
332 for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++)
333 string_length += strlen(substring_array[i]) + 1;
334
335 result = vdo_allocate(string_length, char, __func__, &output);
336 if (result != VDO_SUCCESS)
337 return result;
338
339 current_position = &output[0];
340
341 for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) {
342 current_position = vdo_append_to_buffer(current_position,
343 output + string_length, "%s",
344 substring_array[i]);
345 *current_position = separator;
346 current_position++;
347 }
348
349 /* We output one too many separators; replace the last with a zero byte. */
350 if (current_position != output)
351 *(current_position - 1) = '\0';
352
353 *string_ptr = output;
354 return VDO_SUCCESS;
355 }
356
357 /**
358 * parse_bool() - Parse a two-valued option into a bool.
359 * @bool_str: The string value to convert to a bool.
360 * @true_str: The string value which should be converted to true.
361 * @false_str: The string value which should be converted to false.
362 * @bool_ptr: A pointer to return the bool value in.
363 *
364 * Return: VDO_SUCCESS or an error if bool_str is neither true_str nor false_str.
365 */
parse_bool(const char * bool_str,const char * true_str,const char * false_str,bool * bool_ptr)366 static inline int __must_check parse_bool(const char *bool_str, const char *true_str,
367 const char *false_str, bool *bool_ptr)
368 {
369 bool value = false;
370
371 if (strcmp(bool_str, true_str) == 0)
372 value = true;
373 else if (strcmp(bool_str, false_str) == 0)
374 value = false;
375 else
376 return VDO_BAD_CONFIGURATION;
377
378 *bool_ptr = value;
379 return VDO_SUCCESS;
380 }
381
382 /**
383 * process_one_thread_config_spec() - Process one component of a thread parameter configuration
384 * string and update the configuration data structure.
385 * @thread_param_type: The type of thread specified.
386 * @count: The thread count requested.
387 * @config: The configuration data structure to update.
388 *
389 * If the thread count requested is invalid, a message is logged and -EINVAL returned. If the
390 * thread name is unknown, a message is logged but no error is returned.
391 *
392 * Return: VDO_SUCCESS or -EINVAL
393 */
process_one_thread_config_spec(const char * thread_param_type,unsigned int count,struct thread_count_config * config)394 static int process_one_thread_config_spec(const char *thread_param_type,
395 unsigned int count,
396 struct thread_count_config *config)
397 {
398 /* Handle limited thread parameters */
399 if (strcmp(thread_param_type, "bioRotationInterval") == 0) {
400 if (count == 0) {
401 vdo_log_error("thread config string error: 'bioRotationInterval' of at least 1 is required");
402 return -EINVAL;
403 } else if (count > VDO_BIO_ROTATION_INTERVAL_LIMIT) {
404 vdo_log_error("thread config string error: 'bioRotationInterval' cannot be higher than %d",
405 VDO_BIO_ROTATION_INTERVAL_LIMIT);
406 return -EINVAL;
407 }
408 config->bio_rotation_interval = count;
409 return VDO_SUCCESS;
410 }
411 if (strcmp(thread_param_type, "logical") == 0) {
412 if (count > MAX_VDO_LOGICAL_ZONES) {
413 vdo_log_error("thread config string error: at most %d 'logical' threads are allowed",
414 MAX_VDO_LOGICAL_ZONES);
415 return -EINVAL;
416 }
417 config->logical_zones = count;
418 return VDO_SUCCESS;
419 }
420 if (strcmp(thread_param_type, "physical") == 0) {
421 if (count > MAX_VDO_PHYSICAL_ZONES) {
422 vdo_log_error("thread config string error: at most %d 'physical' threads are allowed",
423 MAX_VDO_PHYSICAL_ZONES);
424 return -EINVAL;
425 }
426 config->physical_zones = count;
427 return VDO_SUCCESS;
428 }
429 /* Handle other thread count parameters */
430 if (count > MAXIMUM_VDO_THREADS) {
431 vdo_log_error("thread config string error: at most %d '%s' threads are allowed",
432 MAXIMUM_VDO_THREADS, thread_param_type);
433 return -EINVAL;
434 }
435 if (strcmp(thread_param_type, "hash") == 0) {
436 config->hash_zones = count;
437 return VDO_SUCCESS;
438 }
439 if (strcmp(thread_param_type, "cpu") == 0) {
440 if (count == 0) {
441 vdo_log_error("thread config string error: at least one 'cpu' thread required");
442 return -EINVAL;
443 }
444 config->cpu_threads = count;
445 return VDO_SUCCESS;
446 }
447 if (strcmp(thread_param_type, "ack") == 0) {
448 config->bio_ack_threads = count;
449 return VDO_SUCCESS;
450 }
451 if (strcmp(thread_param_type, "bio") == 0) {
452 if (count == 0) {
453 vdo_log_error("thread config string error: at least one 'bio' thread required");
454 return -EINVAL;
455 }
456 config->bio_threads = count;
457 return VDO_SUCCESS;
458 }
459
460 /*
461 * Don't fail, just log. This will handle version mismatches between user mode tools and
462 * kernel.
463 */
464 vdo_log_info("unknown thread parameter type \"%s\"", thread_param_type);
465 return VDO_SUCCESS;
466 }
467
468 /**
469 * parse_one_thread_config_spec() - Parse one component of a thread parameter configuration string
470 * and update the configuration data structure.
471 * @spec: The thread parameter specification string.
472 * @config: The configuration data to be updated.
473 */
parse_one_thread_config_spec(const char * spec,struct thread_count_config * config)474 static int parse_one_thread_config_spec(const char *spec,
475 struct thread_count_config *config)
476 {
477 unsigned int count;
478 char **fields;
479 int result;
480
481 result = split_string(spec, '=', &fields);
482 if (result != VDO_SUCCESS)
483 return result;
484
485 if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) {
486 vdo_log_error("thread config string error: expected thread parameter assignment, saw \"%s\"",
487 spec);
488 free_string_array(fields);
489 return -EINVAL;
490 }
491
492 result = kstrtouint(fields[1], 10, &count);
493 if (result) {
494 vdo_log_error("thread config string error: integer value needed, found \"%s\"",
495 fields[1]);
496 free_string_array(fields);
497 return result;
498 }
499
500 result = process_one_thread_config_spec(fields[0], count, config);
501 free_string_array(fields);
502 return result;
503 }
504
505 /**
506 * parse_thread_config_string() - Parse the configuration string passed and update the specified
507 * counts and other parameters of various types of threads to be
508 * created.
509 * @string: Thread parameter configuration string.
510 * @config: The thread configuration data to update.
511 *
512 * The configuration string should contain one or more comma-separated specs of the form
513 * "typename=number"; the supported type names are "cpu", "ack", "bio", "bioRotationInterval",
514 * "logical", "physical", and "hash".
515 *
516 * If an error occurs during parsing of a single key/value pair, we deem it serious enough to stop
517 * further parsing.
518 *
519 * This function can't set the "reason" value the caller wants to pass back, because we'd want to
520 * format it to say which field was invalid, and we can't allocate the "reason" strings
521 * dynamically. So if an error occurs, we'll log the details and pass back an error.
522 *
523 * Return: VDO_SUCCESS or -EINVAL or -ENOMEM
524 */
parse_thread_config_string(const char * string,struct thread_count_config * config)525 static int parse_thread_config_string(const char *string,
526 struct thread_count_config *config)
527 {
528 int result = VDO_SUCCESS;
529 char **specs;
530
531 if (strcmp(".", string) != 0) {
532 unsigned int i;
533
534 result = split_string(string, ',', &specs);
535 if (result != VDO_SUCCESS)
536 return result;
537
538 for (i = 0; specs[i] != NULL; i++) {
539 result = parse_one_thread_config_spec(specs[i], config);
540 if (result != VDO_SUCCESS)
541 break;
542 }
543 free_string_array(specs);
544 }
545 return result;
546 }
547
548 /**
549 * process_one_key_value_pair() - Process one component of an optional parameter string and update
550 * the configuration data structure.
551 * @key: The optional parameter key name.
552 * @value: The optional parameter value.
553 * @config: The configuration data structure to update.
554 *
555 * If the value requested is invalid, a message is logged and -EINVAL returned. If the key is
556 * unknown, a message is logged but no error is returned.
557 *
558 * Return: VDO_SUCCESS or -EINVAL
559 */
process_one_key_value_pair(const char * key,unsigned int value,struct device_config * config)560 static int process_one_key_value_pair(const char *key, unsigned int value,
561 struct device_config *config)
562 {
563 /* Non thread optional parameters */
564 if (strcmp(key, "maxDiscard") == 0) {
565 if (value == 0) {
566 vdo_log_error("optional parameter error: at least one max discard block required");
567 return -EINVAL;
568 }
569 /* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */
570 if (value > (UINT_MAX / VDO_BLOCK_SIZE)) {
571 vdo_log_error("optional parameter error: at most %d max discard blocks are allowed",
572 UINT_MAX / VDO_BLOCK_SIZE);
573 return -EINVAL;
574 }
575 config->max_discard_blocks = value;
576 return VDO_SUCCESS;
577 }
578 /* Handles unknown key names */
579 return process_one_thread_config_spec(key, value, &config->thread_counts);
580 }
581
582 /**
583 * parse_one_key_value_pair() - Parse one key/value pair and update the configuration data
584 * structure.
585 * @key: The optional key name.
586 * @value: The optional value.
587 * @config: The configuration data to be updated.
588 *
589 * Return: VDO_SUCCESS or error.
590 */
parse_one_key_value_pair(const char * key,const char * value,struct device_config * config)591 static int parse_one_key_value_pair(const char *key, const char *value,
592 struct device_config *config)
593 {
594 unsigned int count;
595 int result;
596
597 if (strcmp(key, "deduplication") == 0)
598 return parse_bool(value, "on", "off", &config->deduplication);
599
600 if (strcmp(key, "compression") == 0)
601 return parse_bool(value, "on", "off", &config->compression);
602
603 /* The remaining arguments must have integral values. */
604 result = kstrtouint(value, 10, &count);
605 if (result) {
606 vdo_log_error("optional config string error: integer value needed, found \"%s\"",
607 value);
608 return result;
609 }
610 return process_one_key_value_pair(key, count, config);
611 }
612
613 /**
614 * parse_key_value_pairs() - Parse all key/value pairs from a list of arguments.
615 * @argc: The total number of arguments in list.
616 * @argv: The list of key/value pairs.
617 * @config: The device configuration data to update.
618 *
619 * If an error occurs during parsing of a single key/value pair, we deem it serious enough to stop
620 * further parsing.
621 *
622 * This function can't set the "reason" value the caller wants to pass back, because we'd want to
623 * format it to say which field was invalid, and we can't allocate the "reason" strings
624 * dynamically. So if an error occurs, we'll log the details and return the error.
625 *
626 * Return: VDO_SUCCESS or error
627 */
parse_key_value_pairs(int argc,char ** argv,struct device_config * config)628 static int parse_key_value_pairs(int argc, char **argv, struct device_config *config)
629 {
630 int result = VDO_SUCCESS;
631
632 while (argc) {
633 result = parse_one_key_value_pair(argv[0], argv[1], config);
634 if (result != VDO_SUCCESS)
635 break;
636
637 argc -= 2;
638 argv += 2;
639 }
640
641 return result;
642 }
643
644 /**
645 * parse_optional_arguments() - Parse the configuration string passed in for optional arguments.
646 * @arg_set: The structure holding the arguments to parse.
647 * @error_ptr: Pointer to a buffer to hold the error string.
648 * @config: Pointer to device configuration data to update.
649 *
650 * For V0/V1 configurations, there will only be one optional parameter; the thread configuration.
651 * The configuration string should contain one or more comma-separated specs of the form
652 * "typename=number"; the supported type names are "cpu", "ack", "bio", "bioRotationInterval",
653 * "logical", "physical", and "hash".
654 *
655 * For V2 configurations and beyond, there could be any number of arguments. They should contain
656 * one or more key/value pairs separated by a space.
657 *
658 * Return: VDO_SUCCESS or error
659 */
parse_optional_arguments(struct dm_arg_set * arg_set,char ** error_ptr,struct device_config * config)660 static int parse_optional_arguments(struct dm_arg_set *arg_set, char **error_ptr,
661 struct device_config *config)
662 {
663 int result = VDO_SUCCESS;
664
665 if (config->version == 0 || config->version == 1) {
666 result = parse_thread_config_string(arg_set->argv[0],
667 &config->thread_counts);
668 if (result != VDO_SUCCESS) {
669 *error_ptr = "Invalid thread-count configuration";
670 return VDO_BAD_CONFIGURATION;
671 }
672 } else {
673 if ((arg_set->argc % 2) != 0) {
674 *error_ptr = "Odd number of optional arguments given but they should be <key> <value> pairs";
675 return VDO_BAD_CONFIGURATION;
676 }
677 result = parse_key_value_pairs(arg_set->argc, arg_set->argv, config);
678 if (result != VDO_SUCCESS) {
679 *error_ptr = "Invalid optional argument configuration";
680 return VDO_BAD_CONFIGURATION;
681 }
682 }
683 return result;
684 }
685
686 /**
687 * handle_parse_error() - Handle a parsing error.
688 * @config: The config to free.
689 * @error_ptr: A place to store a constant string about the error.
690 * @error_str: A constant string to store in error_ptr.
691 */
handle_parse_error(struct device_config * config,char ** error_ptr,char * error_str)692 static void handle_parse_error(struct device_config *config, char **error_ptr,
693 char *error_str)
694 {
695 free_device_config(config);
696 *error_ptr = error_str;
697 }
698
699 /**
700 * parse_device_config() - Convert the dmsetup table into a struct device_config.
701 * @argc: The number of table values.
702 * @argv: The array of table values.
703 * @ti: The target structure for this table.
704 * @config_ptr: A pointer to return the allocated config.
705 *
706 * Return: VDO_SUCCESS or an error code.
707 */
parse_device_config(int argc,char ** argv,struct dm_target * ti,struct device_config ** config_ptr)708 static int parse_device_config(int argc, char **argv, struct dm_target *ti,
709 struct device_config **config_ptr)
710 {
711 bool enable_512e;
712 size_t logical_bytes = to_bytes(ti->len);
713 struct dm_arg_set arg_set;
714 char **error_ptr = &ti->error;
715 struct device_config *config = NULL;
716 int result;
717
718 if ((logical_bytes % VDO_BLOCK_SIZE) != 0) {
719 handle_parse_error(config, error_ptr,
720 "Logical size must be a multiple of 4096");
721 return VDO_BAD_CONFIGURATION;
722 }
723
724 if (argc == 0) {
725 handle_parse_error(config, error_ptr, "Incorrect number of arguments");
726 return VDO_BAD_CONFIGURATION;
727 }
728
729 result = vdo_allocate(1, struct device_config, "device_config", &config);
730 if (result != VDO_SUCCESS) {
731 handle_parse_error(config, error_ptr,
732 "Could not allocate config structure");
733 return VDO_BAD_CONFIGURATION;
734 }
735
736 config->owning_target = ti;
737 config->logical_blocks = logical_bytes / VDO_BLOCK_SIZE;
738 INIT_LIST_HEAD(&config->config_list);
739
740 /* Save the original string. */
741 result = join_strings(argv, argc, ' ', &config->original_string);
742 if (result != VDO_SUCCESS) {
743 handle_parse_error(config, error_ptr, "Could not populate string");
744 return VDO_BAD_CONFIGURATION;
745 }
746
747 vdo_log_info("table line: %s", config->original_string);
748
749 config->thread_counts = (struct thread_count_config) {
750 .bio_ack_threads = 1,
751 .bio_threads = DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT,
752 .bio_rotation_interval = DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL,
753 .cpu_threads = 1,
754 .logical_zones = 0,
755 .physical_zones = 0,
756 .hash_zones = 0,
757 };
758 config->max_discard_blocks = 1;
759 config->deduplication = true;
760 config->compression = false;
761
762 arg_set.argc = argc;
763 arg_set.argv = argv;
764
765 result = get_version_number(argc, argv, error_ptr, &config->version);
766 if (result != VDO_SUCCESS) {
767 /* get_version_number sets error_ptr itself. */
768 handle_parse_error(config, error_ptr, *error_ptr);
769 return result;
770 }
771 /* Move the arg pointer forward only if the argument was there. */
772 if (config->version >= 1)
773 dm_shift_arg(&arg_set);
774
775 result = vdo_duplicate_string(dm_shift_arg(&arg_set), "parent device name",
776 &config->parent_device_name);
777 if (result != VDO_SUCCESS) {
778 handle_parse_error(config, error_ptr,
779 "Could not copy parent device name");
780 return VDO_BAD_CONFIGURATION;
781 }
782
783 /* Get the physical blocks, if known. */
784 if (config->version >= 1) {
785 result = kstrtoull(dm_shift_arg(&arg_set), 10, &config->physical_blocks);
786 if (result != VDO_SUCCESS) {
787 handle_parse_error(config, error_ptr,
788 "Invalid physical block count");
789 return VDO_BAD_CONFIGURATION;
790 }
791 }
792
793 /* Get the logical block size and validate */
794 result = parse_bool(dm_shift_arg(&arg_set), "512", "4096", &enable_512e);
795 if (result != VDO_SUCCESS) {
796 handle_parse_error(config, error_ptr, "Invalid logical block size");
797 return VDO_BAD_CONFIGURATION;
798 }
799 config->logical_block_size = (enable_512e ? 512 : 4096);
800
801 /* Skip past the two no longer used read cache options. */
802 if (config->version <= 1)
803 dm_consume_args(&arg_set, 2);
804
805 /* Get the page cache size. */
806 result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->cache_size);
807 if (result != VDO_SUCCESS) {
808 handle_parse_error(config, error_ptr,
809 "Invalid block map page cache size");
810 return VDO_BAD_CONFIGURATION;
811 }
812
813 /* Get the block map era length. */
814 result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->block_map_maximum_age);
815 if (result != VDO_SUCCESS) {
816 handle_parse_error(config, error_ptr, "Invalid block map maximum age");
817 return VDO_BAD_CONFIGURATION;
818 }
819
820 /* Skip past the no longer used MD RAID5 optimization mode */
821 if (config->version <= 2)
822 dm_consume_args(&arg_set, 1);
823
824 /* Skip past the no longer used write policy setting */
825 if (config->version <= 3)
826 dm_consume_args(&arg_set, 1);
827
828 /* Skip past the no longer used pool name for older table lines */
829 if (config->version <= 2) {
830 /*
831 * Make sure the enum to get the pool name from argv directly is still in sync with
832 * the parsing of the table line.
833 */
834 if (&arg_set.argv[0] != &argv[POOL_NAME_ARG_INDEX[config->version]]) {
835 handle_parse_error(config, error_ptr,
836 "Pool name not in expected location");
837 return VDO_BAD_CONFIGURATION;
838 }
839 dm_shift_arg(&arg_set);
840 }
841
842 /* Get the optional arguments and validate. */
843 result = parse_optional_arguments(&arg_set, error_ptr, config);
844 if (result != VDO_SUCCESS) {
845 /* parse_optional_arguments sets error_ptr itself. */
846 handle_parse_error(config, error_ptr, *error_ptr);
847 return result;
848 }
849
850 /*
851 * Logical, physical, and hash zone counts can all be zero; then we get one thread doing
852 * everything, our older configuration. If any zone count is non-zero, the others must be
853 * as well.
854 */
855 if (((config->thread_counts.logical_zones == 0) !=
856 (config->thread_counts.physical_zones == 0)) ||
857 ((config->thread_counts.physical_zones == 0) !=
858 (config->thread_counts.hash_zones == 0))) {
859 handle_parse_error(config, error_ptr,
860 "Logical, physical, and hash zones counts must all be zero or all non-zero");
861 return VDO_BAD_CONFIGURATION;
862 }
863
864 if (config->cache_size <
865 (2 * MAXIMUM_VDO_USER_VIOS * config->thread_counts.logical_zones)) {
866 handle_parse_error(config, error_ptr,
867 "Insufficient block map cache for logical zones");
868 return VDO_BAD_CONFIGURATION;
869 }
870
871 result = dm_get_device(ti, config->parent_device_name,
872 dm_table_get_mode(ti->table), &config->owned_device);
873 if (result != 0) {
874 vdo_log_error("couldn't open device \"%s\": error %d",
875 config->parent_device_name, result);
876 handle_parse_error(config, error_ptr, "Unable to open storage device");
877 return VDO_BAD_CONFIGURATION;
878 }
879
880 if (config->version == 0) {
881 u64 device_size = bdev_nr_bytes(config->owned_device->bdev);
882
883 config->physical_blocks = device_size / VDO_BLOCK_SIZE;
884 }
885
886 *config_ptr = config;
887 return result;
888 }
889
get_vdo_for_target(struct dm_target * ti)890 static struct vdo *get_vdo_for_target(struct dm_target *ti)
891 {
892 return ((struct device_config *) ti->private)->vdo;
893 }
894
895
vdo_map_bio(struct dm_target * ti,struct bio * bio)896 static int vdo_map_bio(struct dm_target *ti, struct bio *bio)
897 {
898 struct vdo *vdo = get_vdo_for_target(ti);
899 struct vdo_work_queue *current_work_queue;
900 const struct admin_state_code *code = vdo_get_admin_state_code(&vdo->admin.state);
901
902 VDO_ASSERT_LOG_ONLY(code->normal, "vdo should not receive bios while in state %s",
903 code->name);
904
905 /* Count all incoming bios. */
906 vdo_count_bios(&vdo->stats.bios_in, bio);
907
908
909 /* Handle empty bios. Empty flush bios are not associated with a vio. */
910 if ((bio_op(bio) == REQ_OP_FLUSH) || ((bio->bi_opf & REQ_PREFLUSH) != 0)) {
911 vdo_launch_flush(vdo, bio);
912 return DM_MAPIO_SUBMITTED;
913 }
914
915 /* This could deadlock, */
916 current_work_queue = vdo_get_current_work_queue();
917 BUG_ON((current_work_queue != NULL) &&
918 (vdo == vdo_get_work_queue_owner(current_work_queue)->vdo));
919 vdo_launch_bio(vdo->data_vio_pool, bio);
920 return DM_MAPIO_SUBMITTED;
921 }
922
vdo_io_hints(struct dm_target * ti,struct queue_limits * limits)923 static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits)
924 {
925 struct vdo *vdo = get_vdo_for_target(ti);
926
927 limits->logical_block_size = vdo->device_config->logical_block_size;
928 limits->physical_block_size = VDO_BLOCK_SIZE;
929
930 /* The minimum io size for random io */
931 limits->io_min = VDO_BLOCK_SIZE;
932 /* The optimal io size for streamed/sequential io */
933 limits->io_opt = VDO_BLOCK_SIZE;
934
935 /*
936 * Sets the maximum discard size that will be passed into VDO. This value comes from a
937 * table line value passed in during dmsetup create.
938 *
939 * The value 1024 is the largest usable value on HD systems. A 2048 sector discard on a
940 * busy HD system takes 31 seconds. We should use a value no higher than 1024, which takes
941 * 15 to 16 seconds on a busy HD system. However, using large values results in 120 second
942 * blocked task warnings in kernel logs. In order to avoid these warnings, we choose to
943 * use the smallest reasonable value.
944 *
945 * The value is used by dm-thin to determine whether to pass down discards. The block layer
946 * splits large discards on this boundary when this is set.
947 */
948 limits->max_hw_discard_sectors =
949 (vdo->device_config->max_discard_blocks * VDO_SECTORS_PER_BLOCK);
950
951 /*
952 * Force discards to not begin or end with a partial block by stating the granularity is
953 * 4k.
954 */
955 limits->discard_granularity = VDO_BLOCK_SIZE;
956 }
957
vdo_iterate_devices(struct dm_target * ti,iterate_devices_callout_fn fn,void * data)958 static int vdo_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn,
959 void *data)
960 {
961 struct device_config *config = get_vdo_for_target(ti)->device_config;
962
963 return fn(ti, config->owned_device, 0,
964 config->physical_blocks * VDO_SECTORS_PER_BLOCK, data);
965 }
966
967 /*
968 * Status line is:
969 * <device> <operating mode> <in recovery> <index state> <compression state>
970 * <used physical blocks> <total physical blocks>
971 */
972
vdo_status(struct dm_target * ti,status_type_t status_type,unsigned int status_flags,char * result,unsigned int maxlen)973 static void vdo_status(struct dm_target *ti, status_type_t status_type,
974 unsigned int status_flags, char *result, unsigned int maxlen)
975 {
976 struct vdo *vdo = get_vdo_for_target(ti);
977 struct vdo_statistics *stats;
978 struct device_config *device_config;
979 /* N.B.: The DMEMIT macro uses the variables named "sz", "result", "maxlen". */
980 int sz = 0;
981
982 switch (status_type) {
983 case STATUSTYPE_INFO:
984 /* Report info for dmsetup status */
985 mutex_lock(&vdo->stats_mutex);
986 vdo_fetch_statistics(vdo, &vdo->stats_buffer);
987 stats = &vdo->stats_buffer;
988
989 DMEMIT("/dev/%pg %s %s %s %s %llu %llu",
990 vdo_get_backing_device(vdo), stats->mode,
991 stats->in_recovery_mode ? "recovering" : "-",
992 vdo_get_dedupe_index_state_name(vdo->hash_zones),
993 vdo_get_compressing(vdo) ? "online" : "offline",
994 stats->data_blocks_used + stats->overhead_blocks_used,
995 stats->physical_blocks);
996 mutex_unlock(&vdo->stats_mutex);
997 break;
998
999 case STATUSTYPE_TABLE:
1000 /* Report the string actually specified in the beginning. */
1001 device_config = (struct device_config *) ti->private;
1002 DMEMIT("%s", device_config->original_string);
1003 break;
1004
1005 case STATUSTYPE_IMA:
1006 /* FIXME: We ought to be more detailed here, but this is what thin does. */
1007 *result = '\0';
1008 break;
1009 }
1010 }
1011
get_underlying_device_block_count(const struct vdo * vdo)1012 static block_count_t __must_check get_underlying_device_block_count(const struct vdo *vdo)
1013 {
1014 return bdev_nr_bytes(vdo_get_backing_device(vdo)) / VDO_BLOCK_SIZE;
1015 }
1016
process_vdo_message_locked(struct vdo * vdo,unsigned int argc,char ** argv)1017 static int __must_check process_vdo_message_locked(struct vdo *vdo, unsigned int argc,
1018 char **argv)
1019 {
1020 if ((argc == 2) && (strcasecmp(argv[0], "compression") == 0)) {
1021 if (strcasecmp(argv[1], "on") == 0) {
1022 vdo_set_compressing(vdo, true);
1023 return 0;
1024 }
1025
1026 if (strcasecmp(argv[1], "off") == 0) {
1027 vdo_set_compressing(vdo, false);
1028 return 0;
1029 }
1030
1031 vdo_log_warning("invalid argument '%s' to dmsetup compression message",
1032 argv[1]);
1033 return -EINVAL;
1034 }
1035
1036 vdo_log_warning("unrecognized dmsetup message '%s' received", argv[0]);
1037 return -EINVAL;
1038 }
1039
1040 /*
1041 * If the message is a dump, just do it. Otherwise, check that no other message is being processed,
1042 * and only proceed if so.
1043 * Returns -EBUSY if another message is being processed
1044 */
process_vdo_message(struct vdo * vdo,unsigned int argc,char ** argv)1045 static int __must_check process_vdo_message(struct vdo *vdo, unsigned int argc,
1046 char **argv)
1047 {
1048 int result;
1049
1050 /*
1051 * All messages which may be processed in parallel with other messages should be handled
1052 * here before the atomic check below. Messages which should be exclusive should be
1053 * processed in process_vdo_message_locked().
1054 */
1055
1056 /* Dump messages should always be processed */
1057 if (strcasecmp(argv[0], "dump") == 0)
1058 return vdo_dump(vdo, argc, argv, "dmsetup message");
1059
1060 if (argc == 1) {
1061 if (strcasecmp(argv[0], "dump-on-shutdown") == 0) {
1062 vdo->dump_on_shutdown = true;
1063 return 0;
1064 }
1065
1066 /* Index messages should always be processed */
1067 if ((strcasecmp(argv[0], "index-close") == 0) ||
1068 (strcasecmp(argv[0], "index-create") == 0) ||
1069 (strcasecmp(argv[0], "index-disable") == 0) ||
1070 (strcasecmp(argv[0], "index-enable") == 0))
1071 return vdo_message_dedupe_index(vdo->hash_zones, argv[0]);
1072 }
1073
1074 if (atomic_cmpxchg(&vdo->processing_message, 0, 1) != 0)
1075 return -EBUSY;
1076
1077 result = process_vdo_message_locked(vdo, argc, argv);
1078
1079 /* Pairs with the implicit barrier in cmpxchg just above */
1080 smp_wmb();
1081 atomic_set(&vdo->processing_message, 0);
1082 return result;
1083 }
1084
vdo_message(struct dm_target * ti,unsigned int argc,char ** argv,char * result_buffer,unsigned int maxlen)1085 static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv,
1086 char *result_buffer, unsigned int maxlen)
1087 {
1088 struct registered_thread allocating_thread, instance_thread;
1089 struct vdo *vdo;
1090 int result;
1091
1092 if (argc == 0) {
1093 vdo_log_warning("unspecified dmsetup message");
1094 return -EINVAL;
1095 }
1096
1097 vdo = get_vdo_for_target(ti);
1098 vdo_register_allocating_thread(&allocating_thread, NULL);
1099 vdo_register_thread_device_id(&instance_thread, &vdo->instance);
1100
1101 /*
1102 * Must be done here so we don't map return codes. The code in dm-ioctl expects a 1 for a
1103 * return code to look at the buffer and see if it is full or not.
1104 */
1105 if ((argc == 1) && (strcasecmp(argv[0], "stats") == 0)) {
1106 vdo_write_stats(vdo, result_buffer, maxlen);
1107 result = 1;
1108 } else if ((argc == 1) && (strcasecmp(argv[0], "config") == 0)) {
1109 vdo_write_config(vdo, &result_buffer, &maxlen);
1110 result = 1;
1111 } else {
1112 result = vdo_status_to_errno(process_vdo_message(vdo, argc, argv));
1113 }
1114
1115 vdo_unregister_thread_device_id();
1116 vdo_unregister_allocating_thread();
1117 return result;
1118 }
1119
configure_target_capabilities(struct dm_target * ti)1120 static void configure_target_capabilities(struct dm_target *ti)
1121 {
1122 ti->discards_supported = 1;
1123 ti->flush_supported = true;
1124 ti->num_discard_bios = 1;
1125 ti->num_flush_bios = 1;
1126
1127 /*
1128 * If this value changes, please make sure to update the value for max_discard_sectors
1129 * accordingly.
1130 */
1131 BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0);
1132 }
1133
1134 /*
1135 * Implements vdo_filter_fn.
1136 */
vdo_uses_device(struct vdo * vdo,const void * context)1137 static bool vdo_uses_device(struct vdo *vdo, const void *context)
1138 {
1139 const struct device_config *config = context;
1140
1141 return vdo_get_backing_device(vdo)->bd_dev == config->owned_device->bdev->bd_dev;
1142 }
1143
1144 /**
1145 * get_thread_id_for_phase() - Get the thread id for the current phase of the admin operation in
1146 * progress.
1147 * @vdo: The vdo.
1148 */
get_thread_id_for_phase(struct vdo * vdo)1149 static thread_id_t __must_check get_thread_id_for_phase(struct vdo *vdo)
1150 {
1151 switch (vdo->admin.phase) {
1152 case RESUME_PHASE_PACKER:
1153 case RESUME_PHASE_FLUSHER:
1154 case SUSPEND_PHASE_PACKER:
1155 case SUSPEND_PHASE_FLUSHES:
1156 return vdo->thread_config.packer_thread;
1157
1158 case RESUME_PHASE_DATA_VIOS:
1159 case SUSPEND_PHASE_DATA_VIOS:
1160 return vdo->thread_config.cpu_thread;
1161
1162 case LOAD_PHASE_DRAIN_JOURNAL:
1163 case RESUME_PHASE_JOURNAL:
1164 case SUSPEND_PHASE_JOURNAL:
1165 return vdo->thread_config.journal_thread;
1166
1167 default:
1168 return vdo->thread_config.admin_thread;
1169 }
1170 }
1171
prepare_admin_completion(struct vdo * vdo,vdo_action_fn callback,vdo_action_fn error_handler)1172 static struct vdo_completion *prepare_admin_completion(struct vdo *vdo,
1173 vdo_action_fn callback,
1174 vdo_action_fn error_handler)
1175 {
1176 struct vdo_completion *completion = &vdo->admin.completion;
1177
1178 /*
1179 * We can't use vdo_prepare_completion_for_requeue() here because we don't want to reset
1180 * any error in the completion.
1181 */
1182 completion->callback = callback;
1183 completion->error_handler = error_handler;
1184 completion->callback_thread_id = get_thread_id_for_phase(vdo);
1185 completion->requeue = true;
1186 return completion;
1187 }
1188
1189 /**
1190 * advance_phase() - Increment the phase of the current admin operation and prepare the admin
1191 * completion to run on the thread for the next phase.
1192 * @vdo: The vdo on which an admin operation is being performed.
1193 *
1194 * Return: The current phase.
1195 */
advance_phase(struct vdo * vdo)1196 static u32 advance_phase(struct vdo *vdo)
1197 {
1198 u32 phase = vdo->admin.phase++;
1199
1200 vdo->admin.completion.callback_thread_id = get_thread_id_for_phase(vdo);
1201 vdo->admin.completion.requeue = true;
1202 return phase;
1203 }
1204
1205 /*
1206 * Perform an administrative operation (load, suspend, grow logical, or grow physical). This method
1207 * should not be called from vdo threads.
1208 */
perform_admin_operation(struct vdo * vdo,u32 starting_phase,vdo_action_fn callback,vdo_action_fn error_handler,const char * type)1209 static int perform_admin_operation(struct vdo *vdo, u32 starting_phase,
1210 vdo_action_fn callback, vdo_action_fn error_handler,
1211 const char *type)
1212 {
1213 int result;
1214 struct vdo_administrator *admin = &vdo->admin;
1215
1216 if (atomic_cmpxchg(&admin->busy, 0, 1) != 0) {
1217 return vdo_log_error_strerror(VDO_COMPONENT_BUSY,
1218 "Can't start %s operation, another operation is already in progress",
1219 type);
1220 }
1221
1222 admin->phase = starting_phase;
1223 reinit_completion(&admin->callback_sync);
1224 vdo_reset_completion(&admin->completion);
1225 vdo_launch_completion(prepare_admin_completion(vdo, callback, error_handler));
1226
1227 /*
1228 * Using the "interruptible" interface means that Linux will not log a message when we wait
1229 * for more than 120 seconds.
1230 */
1231 while (wait_for_completion_interruptible(&admin->callback_sync)) {
1232 /* However, if we get a signal in a user-mode process, we could spin... */
1233 fsleep(1000);
1234 }
1235
1236 result = admin->completion.result;
1237 /* pairs with implicit barrier in cmpxchg above */
1238 smp_wmb();
1239 atomic_set(&admin->busy, 0);
1240 return result;
1241 }
1242
1243 /* Assert that we are operating on the correct thread for the current phase. */
assert_admin_phase_thread(struct vdo * vdo,const char * what)1244 static void assert_admin_phase_thread(struct vdo *vdo, const char *what)
1245 {
1246 VDO_ASSERT_LOG_ONLY(vdo_get_callback_thread_id() == get_thread_id_for_phase(vdo),
1247 "%s on correct thread for %s", what,
1248 ADMIN_PHASE_NAMES[vdo->admin.phase]);
1249 }
1250
1251 /**
1252 * finish_operation_callback() - Callback to finish an admin operation.
1253 * @completion: The admin_completion.
1254 */
finish_operation_callback(struct vdo_completion * completion)1255 static void finish_operation_callback(struct vdo_completion *completion)
1256 {
1257 struct vdo_administrator *admin = &completion->vdo->admin;
1258
1259 vdo_finish_operation(&admin->state, completion->result);
1260 complete(&admin->callback_sync);
1261 }
1262
1263 /**
1264 * decode_from_super_block() - Decode the VDO state from the super block and validate that it is
1265 * correct.
1266 * @vdo: The vdo being loaded.
1267 *
1268 * On error from this method, the component states must be destroyed explicitly. If this method
1269 * returns successfully, the component states must not be destroyed.
1270 *
1271 * Return: VDO_SUCCESS or an error.
1272 */
decode_from_super_block(struct vdo * vdo)1273 static int __must_check decode_from_super_block(struct vdo *vdo)
1274 {
1275 const struct device_config *config = vdo->device_config;
1276 int result;
1277
1278 result = vdo_decode_component_states(vdo->super_block.buffer, &vdo->geometry,
1279 &vdo->states);
1280 if (result != VDO_SUCCESS)
1281 return result;
1282
1283 vdo_set_state(vdo, vdo->states.vdo.state);
1284 vdo->load_state = vdo->states.vdo.state;
1285
1286 /*
1287 * If the device config specifies a larger logical size than was recorded in the super
1288 * block, just accept it.
1289 */
1290 if (vdo->states.vdo.config.logical_blocks < config->logical_blocks) {
1291 vdo_log_warning("Growing logical size: a logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block",
1292 (unsigned long long) config->logical_blocks,
1293 (unsigned long long) vdo->states.vdo.config.logical_blocks);
1294 vdo->states.vdo.config.logical_blocks = config->logical_blocks;
1295 }
1296
1297 result = vdo_validate_component_states(&vdo->states, vdo->geometry.nonce,
1298 config->physical_blocks,
1299 config->logical_blocks);
1300 if (result != VDO_SUCCESS)
1301 return result;
1302
1303 vdo->layout = vdo->states.layout;
1304 return VDO_SUCCESS;
1305 }
1306
1307 /**
1308 * decode_vdo() - Decode the component data portion of a super block and fill in the corresponding
1309 * portions of the vdo being loaded.
1310 * @vdo: The vdo being loaded.
1311 *
1312 * This will also allocate the recovery journal and slab depot. If this method is called with an
1313 * asynchronous layer (i.e. a thread config which specifies at least one base thread), the block
1314 * map and packer will be constructed as well.
1315 *
1316 * Return: VDO_SUCCESS or an error.
1317 */
decode_vdo(struct vdo * vdo)1318 static int __must_check decode_vdo(struct vdo *vdo)
1319 {
1320 block_count_t maximum_age, journal_length;
1321 struct partition *partition;
1322 int result;
1323
1324 result = decode_from_super_block(vdo);
1325 if (result != VDO_SUCCESS) {
1326 vdo_destroy_component_states(&vdo->states);
1327 return result;
1328 }
1329
1330 maximum_age = vdo_convert_maximum_age(vdo->device_config->block_map_maximum_age);
1331 journal_length =
1332 vdo_get_recovery_journal_length(vdo->states.vdo.config.recovery_journal_size);
1333 if (maximum_age > (journal_length / 2)) {
1334 return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
1335 "maximum age: %llu exceeds limit %llu",
1336 (unsigned long long) maximum_age,
1337 (unsigned long long) (journal_length / 2));
1338 }
1339
1340 if (maximum_age == 0) {
1341 return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
1342 "maximum age must be greater than 0");
1343 }
1344
1345 result = vdo_enable_read_only_entry(vdo);
1346 if (result != VDO_SUCCESS)
1347 return result;
1348
1349 partition = vdo_get_known_partition(&vdo->layout,
1350 VDO_RECOVERY_JOURNAL_PARTITION);
1351 result = vdo_decode_recovery_journal(vdo->states.recovery_journal,
1352 vdo->states.vdo.nonce, vdo, partition,
1353 vdo->states.vdo.complete_recoveries,
1354 vdo->states.vdo.config.recovery_journal_size,
1355 &vdo->recovery_journal);
1356 if (result != VDO_SUCCESS)
1357 return result;
1358
1359 partition = vdo_get_known_partition(&vdo->layout, VDO_SLAB_SUMMARY_PARTITION);
1360 result = vdo_decode_slab_depot(vdo->states.slab_depot, vdo, partition,
1361 &vdo->depot);
1362 if (result != VDO_SUCCESS)
1363 return result;
1364
1365 result = vdo_decode_block_map(vdo->states.block_map,
1366 vdo->states.vdo.config.logical_blocks, vdo,
1367 vdo->recovery_journal, vdo->states.vdo.nonce,
1368 vdo->device_config->cache_size, maximum_age,
1369 &vdo->block_map);
1370 if (result != VDO_SUCCESS)
1371 return result;
1372
1373 result = vdo_make_physical_zones(vdo, &vdo->physical_zones);
1374 if (result != VDO_SUCCESS)
1375 return result;
1376
1377 /* The logical zones depend on the physical zones already existing. */
1378 result = vdo_make_logical_zones(vdo, &vdo->logical_zones);
1379 if (result != VDO_SUCCESS)
1380 return result;
1381
1382 return vdo_make_hash_zones(vdo, &vdo->hash_zones);
1383 }
1384
1385 /**
1386 * pre_load_callback() - Callback to initiate a pre-load, registered in vdo_initialize().
1387 * @completion: The admin completion.
1388 */
pre_load_callback(struct vdo_completion * completion)1389 static void pre_load_callback(struct vdo_completion *completion)
1390 {
1391 struct vdo *vdo = completion->vdo;
1392 int result;
1393
1394 assert_admin_phase_thread(vdo, __func__);
1395
1396 switch (advance_phase(vdo)) {
1397 case PRE_LOAD_PHASE_START:
1398 result = vdo_start_operation(&vdo->admin.state,
1399 VDO_ADMIN_STATE_PRE_LOADING);
1400 if (result != VDO_SUCCESS) {
1401 vdo_continue_completion(completion, result);
1402 return;
1403 }
1404
1405 vdo_load_super_block(vdo, completion);
1406 return;
1407
1408 case PRE_LOAD_PHASE_LOAD_COMPONENTS:
1409 vdo_continue_completion(completion, decode_vdo(vdo));
1410 return;
1411
1412 case PRE_LOAD_PHASE_END:
1413 break;
1414
1415 default:
1416 vdo_set_completion_result(completion, UDS_BAD_STATE);
1417 }
1418
1419 finish_operation_callback(completion);
1420 }
1421
release_instance(unsigned int instance)1422 static void release_instance(unsigned int instance)
1423 {
1424 mutex_lock(&instances_lock);
1425 if (instance >= instances.bit_count) {
1426 VDO_ASSERT_LOG_ONLY(false,
1427 "instance number %u must be less than bit count %u",
1428 instance, instances.bit_count);
1429 } else if (test_bit(instance, instances.words) == 0) {
1430 VDO_ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance);
1431 } else {
1432 __clear_bit(instance, instances.words);
1433 instances.count -= 1;
1434 }
1435 mutex_unlock(&instances_lock);
1436 }
1437
set_device_config(struct dm_target * ti,struct vdo * vdo,struct device_config * config)1438 static void set_device_config(struct dm_target *ti, struct vdo *vdo,
1439 struct device_config *config)
1440 {
1441 list_del_init(&config->config_list);
1442 list_add_tail(&config->config_list, &vdo->device_config_list);
1443 config->vdo = vdo;
1444 ti->private = config;
1445 configure_target_capabilities(ti);
1446 }
1447
vdo_initialize(struct dm_target * ti,unsigned int instance,struct device_config * config)1448 static int vdo_initialize(struct dm_target *ti, unsigned int instance,
1449 struct device_config *config)
1450 {
1451 struct vdo *vdo;
1452 int result;
1453 u64 block_size = VDO_BLOCK_SIZE;
1454 u64 logical_size = to_bytes(ti->len);
1455 block_count_t logical_blocks = logical_size / block_size;
1456
1457 vdo_log_info("loading device '%s'", vdo_get_device_name(ti));
1458 vdo_log_debug("Logical block size = %llu", (u64) config->logical_block_size);
1459 vdo_log_debug("Logical blocks = %llu", logical_blocks);
1460 vdo_log_debug("Physical block size = %llu", (u64) block_size);
1461 vdo_log_debug("Physical blocks = %llu", config->physical_blocks);
1462 vdo_log_debug("Block map cache blocks = %u", config->cache_size);
1463 vdo_log_debug("Block map maximum age = %u", config->block_map_maximum_age);
1464 vdo_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off"));
1465 vdo_log_debug("Compression = %s", (config->compression ? "on" : "off"));
1466
1467 vdo = vdo_find_matching(vdo_uses_device, config);
1468 if (vdo != NULL) {
1469 vdo_log_error("Existing vdo already uses device %s",
1470 vdo->device_config->parent_device_name);
1471 ti->error = "Cannot share storage device with already-running VDO";
1472 return VDO_BAD_CONFIGURATION;
1473 }
1474
1475 result = vdo_make(instance, config, &ti->error, &vdo);
1476 if (result != VDO_SUCCESS) {
1477 vdo_log_error("Could not create VDO device. (VDO error %d, message %s)",
1478 result, ti->error);
1479 vdo_destroy(vdo);
1480 return result;
1481 }
1482
1483 result = perform_admin_operation(vdo, PRE_LOAD_PHASE_START, pre_load_callback,
1484 finish_operation_callback, "pre-load");
1485 if (result != VDO_SUCCESS) {
1486 ti->error = ((result == VDO_INVALID_ADMIN_STATE) ?
1487 "Pre-load is only valid immediately after initialization" :
1488 "Cannot load metadata from device");
1489 vdo_log_error("Could not start VDO device. (VDO error %d, message %s)",
1490 result, ti->error);
1491 vdo_destroy(vdo);
1492 return result;
1493 }
1494
1495 set_device_config(ti, vdo, config);
1496 vdo->device_config = config;
1497 return VDO_SUCCESS;
1498 }
1499
1500 /* Implements vdo_filter_fn. */
vdo_is_named(struct vdo * vdo,const void * context)1501 static bool __must_check vdo_is_named(struct vdo *vdo, const void *context)
1502 {
1503 struct dm_target *ti = vdo->device_config->owning_target;
1504 const char *device_name = vdo_get_device_name(ti);
1505
1506 return strcmp(device_name, context) == 0;
1507 }
1508
1509 /**
1510 * get_bit_array_size() - Return the number of bytes needed to store a bit array of the specified
1511 * capacity in an array of unsigned longs.
1512 * @bit_count: The number of bits the array must hold.
1513 *
1514 * Return: the number of bytes needed for the array representation.
1515 */
get_bit_array_size(unsigned int bit_count)1516 static size_t get_bit_array_size(unsigned int bit_count)
1517 {
1518 /* Round up to a multiple of the word size and convert to a byte count. */
1519 return (BITS_TO_LONGS(bit_count) * sizeof(unsigned long));
1520 }
1521
1522 /**
1523 * grow_bit_array() - Re-allocate the bitmap word array so there will more instance numbers that
1524 * can be allocated.
1525 *
1526 * Since the array is initially NULL, this also initializes the array the first time we allocate an
1527 * instance number.
1528 *
1529 * Return: VDO_SUCCESS or an error code from the allocation
1530 */
grow_bit_array(void)1531 static int grow_bit_array(void)
1532 {
1533 unsigned int new_count = max(instances.bit_count + BIT_COUNT_INCREMENT,
1534 (unsigned int) BIT_COUNT_MINIMUM);
1535 unsigned long *new_words;
1536 int result;
1537
1538 result = vdo_reallocate_memory(instances.words,
1539 get_bit_array_size(instances.bit_count),
1540 get_bit_array_size(new_count),
1541 "instance number bit array", &new_words);
1542 if (result != VDO_SUCCESS)
1543 return result;
1544
1545 instances.bit_count = new_count;
1546 instances.words = new_words;
1547 return VDO_SUCCESS;
1548 }
1549
1550 /**
1551 * allocate_instance() - Allocate an instance number.
1552 * @instance_ptr: A point to hold the instance number
1553 *
1554 * Return: VDO_SUCCESS or an error code
1555 *
1556 * This function must be called while holding the instances lock.
1557 */
allocate_instance(unsigned int * instance_ptr)1558 static int allocate_instance(unsigned int *instance_ptr)
1559 {
1560 unsigned int instance;
1561 int result;
1562
1563 /* If there are no unallocated instances, grow the bit array. */
1564 if (instances.count >= instances.bit_count) {
1565 result = grow_bit_array();
1566 if (result != VDO_SUCCESS)
1567 return result;
1568 }
1569
1570 /*
1571 * There must be a zero bit somewhere now. Find it, starting just after the last instance
1572 * allocated.
1573 */
1574 instance = find_next_zero_bit(instances.words, instances.bit_count,
1575 instances.next);
1576 if (instance >= instances.bit_count) {
1577 /* Nothing free after next, so wrap around to instance zero. */
1578 instance = find_first_zero_bit(instances.words, instances.bit_count);
1579 result = VDO_ASSERT(instance < instances.bit_count,
1580 "impossibly, no zero bit found");
1581 if (result != VDO_SUCCESS)
1582 return result;
1583 }
1584
1585 __set_bit(instance, instances.words);
1586 instances.count++;
1587 instances.next = instance + 1;
1588 *instance_ptr = instance;
1589 return VDO_SUCCESS;
1590 }
1591
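/**
 * construct_new_vdo_registered() - Parse the table line and initialize a new vdo using an
 *                                  already-allocated instance number.
 * @ti: The device-mapper target being constructed.
 * @argc: The number of table line arguments.
 * @argv: The table line arguments.
 * @instance: The instance number allocated for this vdo; released on failure.
 *
 * Return: VDO_SUCCESS or an errno.
 */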
1592 static int construct_new_vdo_registered(struct dm_target *ti, unsigned int argc,
1593 char **argv, unsigned int instance)
1594 {
1595 int result;
1596 struct device_config *config;
1597
1598 result = parse_device_config(argc, argv, ti, &config);
1599 if (result != VDO_SUCCESS) {
1600 vdo_log_error_strerror(result, "parsing failed: %s", ti->error);
1601 release_instance(instance);
1602 return -EINVAL;
1603 }
1604
1605 /* Beyond this point, the instance number will be cleaned up for us if needed */
1606 result = vdo_initialize(ti, instance, config);
1607 if (result != VDO_SUCCESS) {
1608 release_instance(instance);
1609 free_device_config(config);
1610 return vdo_status_to_errno(result);
1611 }
1612
1613 return VDO_SUCCESS;
1614 }
1615
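/**
 * construct_new_vdo() - Allocate an instance number and construct a new vdo from a table line.
 * @ti: The device-mapper target being constructed.
 * @argc: The number of table line arguments.
 * @argv: The table line arguments.
 *
 * Return: VDO_SUCCESS or an errno.
 */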
1616 static int construct_new_vdo(struct dm_target *ti, unsigned int argc, char **argv)
1617 {
1618 int result;
1619 unsigned int instance;
1620 struct registered_thread instance_thread;
1621
1622 mutex_lock(&instances_lock);
1623 result = allocate_instance(&instance);
1624 mutex_unlock(&instances_lock);
1625 if (result != VDO_SUCCESS)
1626 return -ENOMEM;
1627
1628 vdo_register_thread_device_id(&instance_thread, &instance);
1629 result = construct_new_vdo_registered(ti, argc, argv, instance);
1630 vdo_unregister_thread_device_id();
1631 return result;
1632 }
1633
1634 /**
1635  * check_may_grow_physical() - Callback to check that the vdo is not read-only or in recovery
1636  *                             mode; used in prepare_to_grow_physical().
1637 * @completion: The admin completion.
1638 */
1639 static void check_may_grow_physical(struct vdo_completion *completion)
1640 {
1641 struct vdo *vdo = completion->vdo;
1642
1643 assert_admin_phase_thread(vdo, __func__);
1644
1645 /* These checks can only be done from a vdo thread. */
1646 if (vdo_is_read_only(vdo))
1647 vdo_set_completion_result(completion, VDO_READ_ONLY);
1648
1649 if (vdo_in_recovery_mode(vdo))
1650 vdo_set_completion_result(completion, VDO_RETRY_AFTER_REBUILD);
1651
1652 finish_operation_callback(completion);
1653 }
1654
1655 static block_count_t get_partition_size(struct layout *layout, enum partition_id id)
1656 {
1657 return vdo_get_known_partition(layout, id)->count;
1658 }
1659
1660 /**
1661 * grow_layout() - Make the layout for growing a vdo.
1662 * @vdo: The vdo preparing to grow.
1663 * @old_size: The current size of the vdo.
1664 * @new_size: The size to which the vdo will be grown.
1665 *
1666 * Return: VDO_SUCCESS or an error code.
1667 */
1668 static int grow_layout(struct vdo *vdo, block_count_t old_size, block_count_t new_size)
1669 {
1670 int result;
1671 block_count_t min_new_size;
1672
1673 if (vdo->next_layout.size == new_size) {
1674 /* We are already prepared to grow to the new size, so we're done. */
1675 return VDO_SUCCESS;
1676 }
1677
1678 	/* Create the partition copier if there isn't one already. */
1679 if (vdo->partition_copier == NULL) {
1680 vdo->partition_copier = dm_kcopyd_client_create(NULL);
1681 if (IS_ERR(vdo->partition_copier)) {
1682 result = PTR_ERR(vdo->partition_copier);
1683 vdo->partition_copier = NULL;
1684 return result;
1685 }
1686 }
1687
1688 /* Free any unused preparation. */
1689 vdo_uninitialize_layout(&vdo->next_layout);
1690
1691 /*
1692 * Make a new layout with the existing partition sizes for everything but the slab depot
1693 * partition.
1694 */
1695 result = vdo_initialize_layout(new_size, vdo->layout.start,
1696 get_partition_size(&vdo->layout,
1697 VDO_BLOCK_MAP_PARTITION),
1698 get_partition_size(&vdo->layout,
1699 VDO_RECOVERY_JOURNAL_PARTITION),
1700 get_partition_size(&vdo->layout,
1701 VDO_SLAB_SUMMARY_PARTITION),
1702 &vdo->next_layout);
1703 if (result != VDO_SUCCESS) {
1704 dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier));
1705 return result;
1706 }
1707
1708 /* Ensure the new journal and summary are entirely within the added blocks. */
1709 min_new_size = (old_size +
1710 get_partition_size(&vdo->next_layout,
1711 VDO_SLAB_SUMMARY_PARTITION) +
1712 get_partition_size(&vdo->next_layout,
1713 VDO_RECOVERY_JOURNAL_PARTITION));
1714 if (min_new_size > new_size) {
1715 /* Copying the journal and summary would destroy some old metadata. */
1716 vdo_uninitialize_layout(&vdo->next_layout);
1717 dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier));
1718 return VDO_INCREMENT_TOO_SMALL;
1719 }
1720
1721 return VDO_SUCCESS;
1722 }
1723
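/**
 * prepare_to_grow_physical() - Prepare the layout and the slab depot to grow to a new physical
 *                              size.
 * @vdo: The vdo preparing to grow.
 * @new_physical_blocks: The physical size (in blocks) to which the vdo will be grown.
 *
 * Return: VDO_SUCCESS or an error.
 */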
1724 static int prepare_to_grow_physical(struct vdo *vdo, block_count_t new_physical_blocks)
1725 {
1726 int result;
1727 block_count_t current_physical_blocks = vdo->states.vdo.config.physical_blocks;
1728
1729 vdo_log_info("Preparing to resize physical to %llu",
1730 (unsigned long long) new_physical_blocks);
1731 VDO_ASSERT_LOG_ONLY((new_physical_blocks > current_physical_blocks),
1732 "New physical size is larger than current physical size");
1733 result = perform_admin_operation(vdo, PREPARE_GROW_PHYSICAL_PHASE_START,
1734 check_may_grow_physical,
1735 finish_operation_callback,
1736 "prepare grow-physical");
1737 if (result != VDO_SUCCESS)
1738 return result;
1739
1740 result = grow_layout(vdo, current_physical_blocks, new_physical_blocks);
1741 if (result != VDO_SUCCESS)
1742 return result;
1743
1744 result = vdo_prepare_to_grow_slab_depot(vdo->depot,
1745 vdo_get_known_partition(&vdo->next_layout,
1746 VDO_SLAB_DEPOT_PARTITION));
1747 if (result != VDO_SUCCESS) {
1748 vdo_uninitialize_layout(&vdo->next_layout);
1749 return result;
1750 }
1751
1752 vdo_log_info("Done preparing to resize physical");
1753 return VDO_SUCCESS;
1754 }
1755
1756 /**
1757 * validate_new_device_config() - Check whether a new device config represents a valid modification
1758 * to an existing config.
1759 * @to_validate: The new config to validate.
1760 * @config: The existing config.
1761 * @may_grow: Set to true if growing the logical and physical size of the vdo is currently
1762 * permitted.
1763 * @error_ptr: A pointer to hold the reason for any error.
1764 *
1765 * Return: VDO_SUCCESS or an error.
1766 */
1767 static int validate_new_device_config(struct device_config *to_validate,
1768 struct device_config *config, bool may_grow,
1769 char **error_ptr)
1770 {
1771 if (to_validate->owning_target->begin != config->owning_target->begin) {
1772 *error_ptr = "Starting sector cannot change";
1773 return VDO_PARAMETER_MISMATCH;
1774 }
1775
1776 if (to_validate->logical_block_size != config->logical_block_size) {
1777 *error_ptr = "Logical block size cannot change";
1778 return VDO_PARAMETER_MISMATCH;
1779 }
1780
1781 if (to_validate->logical_blocks < config->logical_blocks) {
1782 *error_ptr = "Can't shrink VDO logical size";
1783 return VDO_PARAMETER_MISMATCH;
1784 }
1785
1786 if (to_validate->cache_size != config->cache_size) {
1787 *error_ptr = "Block map cache size cannot change";
1788 return VDO_PARAMETER_MISMATCH;
1789 }
1790
1791 if (to_validate->block_map_maximum_age != config->block_map_maximum_age) {
1792 *error_ptr = "Block map maximum age cannot change";
1793 return VDO_PARAMETER_MISMATCH;
1794 }
1795
1796 if (memcmp(&to_validate->thread_counts, &config->thread_counts,
1797 sizeof(struct thread_count_config)) != 0) {
1798 *error_ptr = "Thread configuration cannot change";
1799 return VDO_PARAMETER_MISMATCH;
1800 }
1801
1802 if (to_validate->physical_blocks < config->physical_blocks) {
1803 *error_ptr = "Removing physical storage from a VDO is not supported";
1804 return VDO_NOT_IMPLEMENTED;
1805 }
1806
1807 if (!may_grow && (to_validate->physical_blocks > config->physical_blocks)) {
1808 *error_ptr = "VDO physical size may not grow in current state";
1809 return VDO_NOT_IMPLEMENTED;
1810 }
1811
1812 return VDO_SUCCESS;
1813 }
1814
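/**
 * prepare_to_modify() - Validate a new device config and prepare for any size changes it
 *                       requires.
 * @ti: The device-mapper target being reloaded.
 * @config: The new device config parsed from the table line.
 * @vdo: The existing vdo to be modified.
 *
 * Return: VDO_SUCCESS or an error.
 */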
1815 static int prepare_to_modify(struct dm_target *ti, struct device_config *config,
1816 struct vdo *vdo)
1817 {
1818 int result;
1819 bool may_grow = (vdo_get_admin_state(vdo) != VDO_ADMIN_STATE_PRE_LOADED);
1820
1821 result = validate_new_device_config(config, vdo->device_config, may_grow,
1822 &ti->error);
1823 if (result != VDO_SUCCESS)
1824 return -EINVAL;
1825
1826 if (config->logical_blocks > vdo->device_config->logical_blocks) {
1827 block_count_t logical_blocks = vdo->states.vdo.config.logical_blocks;
1828
1829 vdo_log_info("Preparing to resize logical to %llu",
1830 (unsigned long long) config->logical_blocks);
1831 VDO_ASSERT_LOG_ONLY((config->logical_blocks > logical_blocks),
1832 "New logical size is larger than current size");
1833
1834 result = vdo_prepare_to_grow_block_map(vdo->block_map,
1835 config->logical_blocks);
1836 if (result != VDO_SUCCESS) {
1837 ti->error = "Device vdo_prepare_to_grow_logical failed";
1838 return result;
1839 }
1840
1841 vdo_log_info("Done preparing to resize logical");
1842 }
1843
1844 if (config->physical_blocks > vdo->device_config->physical_blocks) {
1845 result = prepare_to_grow_physical(vdo, config->physical_blocks);
1846 if (result != VDO_SUCCESS) {
1847 if (result == VDO_PARAMETER_MISMATCH) {
1848 /*
1849 * If we don't trap this case, vdo_status_to_errno() will remap
1850 * it to -EIO, which is misleading and ahistorical.
1851 */
1852 result = -EINVAL;
1853 }
1854
1855 if (result == VDO_TOO_MANY_SLABS)
1856 ti->error = "Device vdo_prepare_to_grow_physical failed (specified physical size too big based on formatted slab size)";
1857 else
1858 ti->error = "Device vdo_prepare_to_grow_physical failed";
1859
1860 return result;
1861 }
1862 }
1863
1864 if (strcmp(config->parent_device_name, vdo->device_config->parent_device_name) != 0) {
1865 const char *device_name = vdo_get_device_name(config->owning_target);
1866
1867 vdo_log_info("Updating backing device of %s from %s to %s", device_name,
1868 vdo->device_config->parent_device_name,
1869 config->parent_device_name);
1870 }
1871
1872 return VDO_SUCCESS;
1873 }
1874
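/**
 * update_existing_vdo() - Apply a new table line to an existing vdo.
 * @device_name: The name of the device being updated.
 * @ti: The device-mapper target being reloaded.
 * @argc: The number of table line arguments.
 * @argv: The table line arguments.
 * @vdo: The existing vdo.
 *
 * Return: VDO_SUCCESS or an errno.
 */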
1875 static int update_existing_vdo(const char *device_name, struct dm_target *ti,
1876 unsigned int argc, char **argv, struct vdo *vdo)
1877 {
1878 int result;
1879 struct device_config *config;
1880
1881 result = parse_device_config(argc, argv, ti, &config);
1882 if (result != VDO_SUCCESS)
1883 return -EINVAL;
1884
1885 vdo_log_info("preparing to modify device '%s'", device_name);
1886 result = prepare_to_modify(ti, config, vdo);
1887 if (result != VDO_SUCCESS) {
1888 free_device_config(config);
1889 return vdo_status_to_errno(result);
1890 }
1891
1892 set_device_config(ti, vdo, config);
1893 return VDO_SUCCESS;
1894 }
1895
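/**
 * vdo_ctr() - The device-mapper constructor.
 * @ti: The target being constructed.
 * @argc: The number of table line arguments.
 * @argv: The table line arguments.
 *
 * Constructs a new vdo, or applies a new table to an existing vdo with the same device name.
 *
 * Return: VDO_SUCCESS or an errno.
 */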
1896 static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1897 {
1898 int result;
1899 struct registered_thread allocating_thread, instance_thread;
1900 const char *device_name;
1901 struct vdo *vdo;
1902
1903 vdo_register_allocating_thread(&allocating_thread, NULL);
1904 device_name = vdo_get_device_name(ti);
1905 vdo = vdo_find_matching(vdo_is_named, device_name);
1906 if (vdo == NULL) {
1907 result = construct_new_vdo(ti, argc, argv);
1908 } else {
1909 vdo_register_thread_device_id(&instance_thread, &vdo->instance);
1910 result = update_existing_vdo(device_name, ti, argc, argv, vdo);
1911 vdo_unregister_thread_device_id();
1912 }
1913
1914 vdo_unregister_allocating_thread();
1915 return result;
1916 }
1917
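/**
 * vdo_dtr() - The device-mapper destructor.
 * @ti: The target being destroyed.
 *
 * Frees the device config for this target, and destroys the vdo itself when the last config
 * referencing it is removed.
 */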
1918 static void vdo_dtr(struct dm_target *ti)
1919 {
1920 struct device_config *config = ti->private;
1921 struct vdo *vdo = vdo_forget(config->vdo);
1922
1923 list_del_init(&config->config_list);
1924 if (list_empty(&vdo->device_config_list)) {
1925 const char *device_name;
1926
1927 /* This was the last config referencing the VDO. Free it. */
1928 unsigned int instance = vdo->instance;
1929 struct registered_thread allocating_thread, instance_thread;
1930
1931 vdo_register_thread_device_id(&instance_thread, &instance);
1932 vdo_register_allocating_thread(&allocating_thread, NULL);
1933
1934 device_name = vdo_get_device_name(ti);
1935 vdo_log_info("stopping device '%s'", device_name);
1936 if (vdo->dump_on_shutdown)
1937 vdo_dump_all(vdo, "device shutdown");
1938
1939 vdo_destroy(vdo_forget(vdo));
1940 vdo_log_info("device '%s' stopped", device_name);
1941 vdo_unregister_thread_device_id();
1942 vdo_unregister_allocating_thread();
1943 release_instance(instance);
1944 } else if (config == vdo->device_config) {
1945 /*
1946 * The VDO still references this config. Give it a reference to a config that isn't
1947 * being destroyed.
1948 */
1949 vdo->device_config = list_first_entry(&vdo->device_config_list,
1950 struct device_config, config_list);
1951 }
1952
1953 free_device_config(config);
1954 ti->private = NULL;
1955 }
1956
1957 static void vdo_presuspend(struct dm_target *ti)
1958 {
1959 get_vdo_for_target(ti)->suspend_type =
1960 (dm_noflush_suspending(ti) ? VDO_ADMIN_STATE_SUSPENDING : VDO_ADMIN_STATE_SAVING);
1961 }
1962
1963 /**
1964 * write_super_block_for_suspend() - Update the VDO state and save the super block.
1965  * @completion: The admin completion.
1966 */
1967 static void write_super_block_for_suspend(struct vdo_completion *completion)
1968 {
1969 struct vdo *vdo = completion->vdo;
1970
1971 switch (vdo_get_state(vdo)) {
1972 case VDO_DIRTY:
1973 case VDO_NEW:
1974 vdo_set_state(vdo, VDO_CLEAN);
1975 break;
1976
1977 case VDO_CLEAN:
1978 case VDO_READ_ONLY_MODE:
1979 case VDO_FORCE_REBUILD:
1980 case VDO_RECOVERING:
1981 case VDO_REBUILD_FOR_UPGRADE:
1982 break;
1983
1984 case VDO_REPLAYING:
1985 default:
1986 vdo_continue_completion(completion, UDS_BAD_STATE);
1987 return;
1988 }
1989
1990 vdo_save_components(vdo, completion);
1991 }
1992
1993 /**
1994 * suspend_callback() - Callback to initiate a suspend, registered in vdo_postsuspend().
1995 * @completion: The sub-task completion.
1996 */
1997 static void suspend_callback(struct vdo_completion *completion)
1998 {
1999 struct vdo *vdo = completion->vdo;
2000 struct admin_state *state = &vdo->admin.state;
2001 int result;
2002
2003 assert_admin_phase_thread(vdo, __func__);
2004
2005 switch (advance_phase(vdo)) {
2006 case SUSPEND_PHASE_START:
2007 if (vdo_get_admin_state_code(state)->quiescent) {
2008 /* Already suspended */
2009 break;
2010 }
2011
2012 vdo_continue_completion(completion,
2013 vdo_start_operation(state, vdo->suspend_type));
2014 return;
2015
2016 case SUSPEND_PHASE_PACKER:
2017 /*
2018 * If the VDO was already resumed from a prior suspend while read-only, some of the
2019 * components may not have been resumed. By setting a read-only error here, we
2020 * guarantee that the result of this suspend will be VDO_READ_ONLY and not
2021 * VDO_INVALID_ADMIN_STATE in that case.
2022 */
2023 if (vdo_in_read_only_mode(vdo))
2024 vdo_set_completion_result(completion, VDO_READ_ONLY);
2025
2026 vdo_drain_packer(vdo->packer, completion);
2027 return;
2028
2029 case SUSPEND_PHASE_DATA_VIOS:
2030 drain_data_vio_pool(vdo->data_vio_pool, completion);
2031 return;
2032
2033 case SUSPEND_PHASE_DEDUPE:
2034 vdo_drain_hash_zones(vdo->hash_zones, completion);
2035 return;
2036
2037 case SUSPEND_PHASE_FLUSHES:
2038 vdo_drain_flusher(vdo->flusher, completion);
2039 return;
2040
2041 case SUSPEND_PHASE_LOGICAL_ZONES:
2042 /*
2043 * Attempt to flush all I/O before completing post suspend work. We believe a
2044 * suspended device is expected to have persisted all data written before the
2045 * suspend, even if it hasn't been flushed yet.
2046 */
2047 result = vdo_synchronous_flush(vdo);
2048 if (result != VDO_SUCCESS)
2049 vdo_enter_read_only_mode(vdo, result);
2050
2051 vdo_drain_logical_zones(vdo->logical_zones,
2052 vdo_get_admin_state_code(state), completion);
2053 return;
2054
2055 case SUSPEND_PHASE_BLOCK_MAP:
2056 vdo_drain_block_map(vdo->block_map, vdo_get_admin_state_code(state),
2057 completion);
2058 return;
2059
2060 case SUSPEND_PHASE_JOURNAL:
2061 vdo_drain_recovery_journal(vdo->recovery_journal,
2062 vdo_get_admin_state_code(state), completion);
2063 return;
2064
2065 case SUSPEND_PHASE_DEPOT:
2066 vdo_drain_slab_depot(vdo->depot, vdo_get_admin_state_code(state),
2067 completion);
2068 return;
2069
2070 case SUSPEND_PHASE_READ_ONLY_WAIT:
2071 vdo_wait_until_not_entering_read_only_mode(completion);
2072 return;
2073
2074 case SUSPEND_PHASE_WRITE_SUPER_BLOCK:
2075 if (vdo_is_state_suspending(state) || (completion->result != VDO_SUCCESS)) {
2076 /* If we didn't save the VDO or there was an error, we're done. */
2077 break;
2078 }
2079
2080 write_super_block_for_suspend(completion);
2081 return;
2082
2083 case SUSPEND_PHASE_END:
2084 break;
2085
2086 default:
2087 vdo_set_completion_result(completion, UDS_BAD_STATE);
2088 }
2089
2090 finish_operation_callback(completion);
2091 }
2092
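/**
 * vdo_postsuspend() - The device-mapper postsuspend handler.
 * @ti: The target being suspended.
 *
 * Drains and suspends the vdo; a read-only suspension is still treated as a success since the
 * vdo is left suspended.
 */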
2093 static void vdo_postsuspend(struct dm_target *ti)
2094 {
2095 struct vdo *vdo = get_vdo_for_target(ti);
2096 struct registered_thread instance_thread;
2097 const char *device_name;
2098 int result;
2099
2100 vdo_register_thread_device_id(&instance_thread, &vdo->instance);
2101 device_name = vdo_get_device_name(vdo->device_config->owning_target);
2102 vdo_log_info("suspending device '%s'", device_name);
2103
2104 /*
2105 	 * It's important to note that any error here does not actually stop device-mapper from
2106 	 * suspending the device. All of this work is done post-suspend.
2107 */
2108 result = perform_admin_operation(vdo, SUSPEND_PHASE_START, suspend_callback,
2109 suspend_callback, "suspend");
2110
2111 if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) {
2112 /*
2113 * Treat VDO_READ_ONLY as a success since a read-only suspension still leaves the
2114 * VDO suspended.
2115 */
2116 vdo_log_info("device '%s' suspended", device_name);
2117 } else if (result == VDO_INVALID_ADMIN_STATE) {
2118 vdo_log_error("Suspend invoked while in unexpected state: %s",
2119 vdo_get_admin_state(vdo)->name);
2120 } else {
2121 vdo_log_error_strerror(result, "Suspend of device '%s' failed",
2122 device_name);
2123 }
2124
2125 vdo_unregister_thread_device_id();
2126 }
2127
2128 /**
2129 * was_new() - Check whether the vdo was new when it was loaded.
2130 * @vdo: The vdo to query.
2131 *
2132 * Return: true if the vdo was new.
2133 */
2134 static bool was_new(const struct vdo *vdo)
2135 {
2136 return (vdo->load_state == VDO_NEW);
2137 }
2138
2139 /**
2140 * requires_repair() - Check whether a vdo requires recovery or rebuild.
2141 * @vdo: The vdo to query.
2142 *
2143 * Return: true if the vdo must be repaired.
2144 */
2145 static bool __must_check requires_repair(const struct vdo *vdo)
2146 {
2147 switch (vdo_get_state(vdo)) {
2148 case VDO_DIRTY:
2149 case VDO_FORCE_REBUILD:
2150 case VDO_REPLAYING:
2151 case VDO_REBUILD_FOR_UPGRADE:
2152 return true;
2153
2154 default:
2155 return false;
2156 }
2157 }
2158
2159 /**
2160 * get_load_type() - Determine how the slab depot was loaded.
2161 * @vdo: The vdo.
2162 *
2163 * Return: How the depot was loaded.
2164 */
2165 static enum slab_depot_load_type get_load_type(struct vdo *vdo)
2166 {
2167 if (vdo_state_requires_read_only_rebuild(vdo->load_state))
2168 return VDO_SLAB_DEPOT_REBUILD_LOAD;
2169
2170 if (vdo_state_requires_recovery(vdo->load_state))
2171 return VDO_SLAB_DEPOT_RECOVERY_LOAD;
2172
2173 return VDO_SLAB_DEPOT_NORMAL_LOAD;
2174 }
2175
2176 /**
2177 * load_callback() - Callback to do the destructive parts of loading a VDO.
2178 * @completion: The sub-task completion.
2179 */
2180 static void load_callback(struct vdo_completion *completion)
2181 {
2182 struct vdo *vdo = completion->vdo;
2183 int result;
2184
2185 assert_admin_phase_thread(vdo, __func__);
2186
2187 switch (advance_phase(vdo)) {
2188 case LOAD_PHASE_START:
2189 result = vdo_start_operation(&vdo->admin.state, VDO_ADMIN_STATE_LOADING);
2190 if (result != VDO_SUCCESS) {
2191 vdo_continue_completion(completion, result);
2192 return;
2193 }
2194
2195 /* Prepare the recovery journal for new entries. */
2196 vdo_open_recovery_journal(vdo->recovery_journal, vdo->depot,
2197 vdo->block_map);
2198 vdo_allow_read_only_mode_entry(completion);
2199 return;
2200
2201 case LOAD_PHASE_LOAD_DEPOT:
2202 vdo_set_dedupe_state_normal(vdo->hash_zones);
2203 if (vdo_is_read_only(vdo)) {
2204 /*
2205 * In read-only mode we don't use the allocator and it may not even be
2206 * readable, so don't bother trying to load it.
2207 */
2208 vdo_set_completion_result(completion, VDO_READ_ONLY);
2209 break;
2210 }
2211
2212 if (requires_repair(vdo)) {
2213 vdo_repair(completion);
2214 return;
2215 }
2216
2217 vdo_load_slab_depot(vdo->depot,
2218 (was_new(vdo) ? VDO_ADMIN_STATE_FORMATTING :
2219 VDO_ADMIN_STATE_LOADING),
2220 completion, NULL);
2221 return;
2222
2223 case LOAD_PHASE_MAKE_DIRTY:
2224 vdo_set_state(vdo, VDO_DIRTY);
2225 vdo_save_components(vdo, completion);
2226 return;
2227
2228 case LOAD_PHASE_PREPARE_TO_ALLOCATE:
2229 vdo_initialize_block_map_from_journal(vdo->block_map,
2230 vdo->recovery_journal);
2231 vdo_prepare_slab_depot_to_allocate(vdo->depot, get_load_type(vdo),
2232 completion);
2233 return;
2234
2235 case LOAD_PHASE_SCRUB_SLABS:
2236 if (vdo_state_requires_recovery(vdo->load_state))
2237 vdo_enter_recovery_mode(vdo);
2238
2239 vdo_scrub_all_unrecovered_slabs(vdo->depot, completion);
2240 return;
2241
2242 case LOAD_PHASE_DATA_REDUCTION:
2243 WRITE_ONCE(vdo->compressing, vdo->device_config->compression);
2244 if (vdo->device_config->deduplication) {
2245 /*
2246 * Don't try to load or rebuild the index first (and log scary error
2247 * messages) if this is known to be a newly-formatted volume.
2248 */
2249 vdo_start_dedupe_index(vdo->hash_zones, was_new(vdo));
2250 }
2251
2252 vdo->allocations_allowed = false;
2253 fallthrough;
2254
2255 case LOAD_PHASE_FINISHED:
2256 break;
2257
2258 case LOAD_PHASE_DRAIN_JOURNAL:
2259 vdo_drain_recovery_journal(vdo->recovery_journal, VDO_ADMIN_STATE_SAVING,
2260 completion);
2261 return;
2262
2263 case LOAD_PHASE_WAIT_FOR_READ_ONLY:
2264 /* Avoid an infinite loop */
2265 completion->error_handler = NULL;
2266 vdo->admin.phase = LOAD_PHASE_FINISHED;
2267 vdo_wait_until_not_entering_read_only_mode(completion);
2268 return;
2269
2270 default:
2271 vdo_set_completion_result(completion, UDS_BAD_STATE);
2272 }
2273
2274 finish_operation_callback(completion);
2275 }
2276
2277 /**
2278 * handle_load_error() - Handle an error during the load operation.
2279 * @completion: The admin completion.
2280 *
2281 * If at all possible, brings the vdo online in read-only mode. This handler is registered in
2282 * vdo_preresume_registered().
2283 */
2284 static void handle_load_error(struct vdo_completion *completion)
2285 {
2286 struct vdo *vdo = completion->vdo;
2287
2288 if (vdo_requeue_completion_if_needed(completion,
2289 vdo->thread_config.admin_thread))
2290 return;
2291
2292 if (vdo_state_requires_read_only_rebuild(vdo->load_state) &&
2293 (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) {
2294 vdo_log_error_strerror(completion->result, "aborting load");
2295 vdo->admin.phase = LOAD_PHASE_DRAIN_JOURNAL;
2296 load_callback(vdo_forget(completion));
2297 return;
2298 }
2299
2300 if ((completion->result == VDO_UNSUPPORTED_VERSION) &&
2301 (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) {
2302 vdo_log_error("Aborting load due to unsupported version");
2303 vdo->admin.phase = LOAD_PHASE_FINISHED;
2304 load_callback(completion);
2305 return;
2306 }
2307
2308 vdo_log_error_strerror(completion->result,
2309 "Entering read-only mode due to load error");
2310 vdo->admin.phase = LOAD_PHASE_WAIT_FOR_READ_ONLY;
2311 vdo_enter_read_only_mode(vdo, completion->result);
2312 completion->result = VDO_READ_ONLY;
2313 load_callback(completion);
2314 }
2315
2316 /**
2317 * write_super_block_for_resume() - Update the VDO state and save the super block.
2318  * @completion: The admin completion.
2319 */
2320 static void write_super_block_for_resume(struct vdo_completion *completion)
2321 {
2322 struct vdo *vdo = completion->vdo;
2323
2324 switch (vdo_get_state(vdo)) {
2325 case VDO_CLEAN:
2326 case VDO_NEW:
2327 vdo_set_state(vdo, VDO_DIRTY);
2328 vdo_save_components(vdo, completion);
2329 return;
2330
2331 case VDO_DIRTY:
2332 case VDO_READ_ONLY_MODE:
2333 case VDO_FORCE_REBUILD:
2334 case VDO_RECOVERING:
2335 case VDO_REBUILD_FOR_UPGRADE:
2336 /* No need to write the super block in these cases */
2337 vdo_launch_completion(completion);
2338 return;
2339
2340 case VDO_REPLAYING:
2341 default:
2342 vdo_continue_completion(completion, UDS_BAD_STATE);
2343 }
2344 }
2345
2346 /**
2347 * resume_callback() - Callback to resume a VDO.
2348 * @completion: The admin completion.
2349 */
2350 static void resume_callback(struct vdo_completion *completion)
2351 {
2352 struct vdo *vdo = completion->vdo;
2353 int result;
2354
2355 assert_admin_phase_thread(vdo, __func__);
2356
2357 switch (advance_phase(vdo)) {
2358 case RESUME_PHASE_START:
2359 result = vdo_start_operation(&vdo->admin.state,
2360 VDO_ADMIN_STATE_RESUMING);
2361 if (result != VDO_SUCCESS) {
2362 vdo_continue_completion(completion, result);
2363 return;
2364 }
2365
2366 write_super_block_for_resume(completion);
2367 return;
2368
2369 case RESUME_PHASE_ALLOW_READ_ONLY_MODE:
2370 vdo_allow_read_only_mode_entry(completion);
2371 return;
2372
2373 case RESUME_PHASE_DEDUPE:
2374 vdo_resume_hash_zones(vdo->hash_zones, completion);
2375 return;
2376
2377 case RESUME_PHASE_DEPOT:
2378 vdo_resume_slab_depot(vdo->depot, completion);
2379 return;
2380
2381 case RESUME_PHASE_JOURNAL:
2382 vdo_resume_recovery_journal(vdo->recovery_journal, completion);
2383 return;
2384
2385 case RESUME_PHASE_BLOCK_MAP:
2386 vdo_resume_block_map(vdo->block_map, completion);
2387 return;
2388
2389 case RESUME_PHASE_LOGICAL_ZONES:
2390 vdo_resume_logical_zones(vdo->logical_zones, completion);
2391 return;
2392
2393 case RESUME_PHASE_PACKER:
2394 {
2395 bool was_enabled = vdo_get_compressing(vdo);
2396 bool enable = vdo->device_config->compression;
2397
2398 if (enable != was_enabled)
2399 WRITE_ONCE(vdo->compressing, enable);
2400 vdo_log_info("compression is %s", (enable ? "enabled" : "disabled"));
2401
2402 vdo_resume_packer(vdo->packer, completion);
2403 return;
2404 }
2405
2406 case RESUME_PHASE_FLUSHER:
2407 vdo_resume_flusher(vdo->flusher, completion);
2408 return;
2409
2410 case RESUME_PHASE_DATA_VIOS:
2411 resume_data_vio_pool(vdo->data_vio_pool, completion);
2412 return;
2413
2414 case RESUME_PHASE_END:
2415 break;
2416
2417 default:
2418 vdo_set_completion_result(completion, UDS_BAD_STATE);
2419 }
2420
2421 finish_operation_callback(completion);
2422 }
2423
2424 /**
2425 * grow_logical_callback() - Callback to initiate a grow logical.
2426 * @completion: The admin completion.
2427 *
2428 * Registered in perform_grow_logical().
2429 */
2430 static void grow_logical_callback(struct vdo_completion *completion)
2431 {
2432 struct vdo *vdo = completion->vdo;
2433 int result;
2434
2435 assert_admin_phase_thread(vdo, __func__);
2436
2437 switch (advance_phase(vdo)) {
2438 case GROW_LOGICAL_PHASE_START:
2439 if (vdo_is_read_only(vdo)) {
2440 vdo_log_error_strerror(VDO_READ_ONLY,
2441 "Can't grow logical size of a read-only VDO");
2442 vdo_set_completion_result(completion, VDO_READ_ONLY);
2443 break;
2444 }
2445
2446 result = vdo_start_operation(&vdo->admin.state,
2447 VDO_ADMIN_STATE_SUSPENDED_OPERATION);
2448 if (result != VDO_SUCCESS) {
2449 vdo_continue_completion(completion, result);
2450 return;
2451 }
2452
2453 vdo->states.vdo.config.logical_blocks = vdo->block_map->next_entry_count;
2454 vdo_save_components(vdo, completion);
2455 return;
2456
2457 case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP:
2458 vdo_grow_block_map(vdo->block_map, completion);
2459 return;
2460
2461 case GROW_LOGICAL_PHASE_END:
2462 break;
2463
2464 case GROW_LOGICAL_PHASE_ERROR:
2465 vdo_enter_read_only_mode(vdo, completion->result);
2466 break;
2467
2468 default:
2469 vdo_set_completion_result(completion, UDS_BAD_STATE);
2470 }
2471
2472 finish_operation_callback(completion);
2473 }
2474
2475 /**
2476  * handle_logical_growth_error() - Handle an error during the grow logical process.
2477 * @completion: The admin completion.
2478 */
2479 static void handle_logical_growth_error(struct vdo_completion *completion)
2480 {
2481 struct vdo *vdo = completion->vdo;
2482
2483 if (vdo->admin.phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) {
2484 /*
2485 		 * We failed to write the new size to the super block, so set our in-memory
2486 		 * config back to the old size.
2487 */
2488 vdo->states.vdo.config.logical_blocks = vdo->block_map->entry_count;
2489 vdo_abandon_block_map_growth(vdo->block_map);
2490 }
2491
2492 vdo->admin.phase = GROW_LOGICAL_PHASE_ERROR;
2493 grow_logical_callback(completion);
2494 }
2495
2496 /**
2497 * perform_grow_logical() - Grow the logical size of the vdo.
2498 * @vdo: The vdo to grow.
2499 * @new_logical_blocks: The size to which the vdo should be grown.
2500 *
2501 * Context: This method may only be called when the vdo has been suspended and must not be called
2502 * from a base thread.
2503 *
2504 * Return: VDO_SUCCESS or an error.
2505 */
2506 static int perform_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks)
2507 {
2508 int result;
2509
2510 if (vdo->device_config->logical_blocks == new_logical_blocks) {
2511 /*
2512 		 * A table was loaded for which we prepared to grow, but we are resuming with a
2513 		 * table that does not include that growth.
2514 */
2515 vdo_abandon_block_map_growth(vdo->block_map);
2516 return VDO_SUCCESS;
2517 }
2518
2519 vdo_log_info("Resizing logical to %llu",
2520 (unsigned long long) new_logical_blocks);
2521 if (vdo->block_map->next_entry_count != new_logical_blocks)
2522 return VDO_PARAMETER_MISMATCH;
2523
2524 result = perform_admin_operation(vdo, GROW_LOGICAL_PHASE_START,
2525 grow_logical_callback,
2526 handle_logical_growth_error, "grow logical");
2527 if (result != VDO_SUCCESS)
2528 return result;
2529
2530 vdo_log_info("Logical blocks now %llu", (unsigned long long) new_logical_blocks);
2531 return VDO_SUCCESS;
2532 }
2533
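/**
 * copy_callback() - Notification callback for dm_kcopyd_copy().
 * @read_err: The read error, if any.
 * @write_err: The write error bits, if any.
 * @context: The vdo_completion waiting for the copy.
 *
 * Continues the waiting completion with VDO_SUCCESS or -EIO.
 */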
2534 static void copy_callback(int read_err, unsigned long write_err, void *context)
2535 {
2536 struct vdo_completion *completion = context;
2537 int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO);
2538
2539 vdo_continue_completion(completion, result);
2540 }
2541
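/**
 * partition_to_region() - Describe a partition as a dm_io_region on the vdo's backing device.
 * @partition: The partition to describe.
 * @vdo: The vdo to which the partition belongs.
 * @region: A pointer to hold the resulting region.
 */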
2542 static void partition_to_region(struct partition *partition, struct vdo *vdo,
2543 struct dm_io_region *region)
2544 {
2545 physical_block_number_t pbn = partition->offset - vdo->geometry.bio_offset;
2546
2547 *region = (struct dm_io_region) {
2548 .bdev = vdo_get_backing_device(vdo),
2549 .sector = pbn * VDO_SECTORS_PER_BLOCK,
2550 .count = partition->count * VDO_SECTORS_PER_BLOCK,
2551 };
2552 }
2553
2554 /**
2555 * copy_partition() - Copy a partition from the location specified in the current layout to that in
2556 * the next layout.
2557 * @vdo: The vdo preparing to grow.
2558 * @id: The ID of the partition to copy.
2559 * @parent: The completion to notify when the copy is complete.
2560 */
2561 static void copy_partition(struct vdo *vdo, enum partition_id id,
2562 struct vdo_completion *parent)
2563 {
2564 struct dm_io_region read_region, write_regions[1];
2565 struct partition *from = vdo_get_known_partition(&vdo->layout, id);
2566 struct partition *to = vdo_get_known_partition(&vdo->next_layout, id);
2567
2568 partition_to_region(from, vdo, &read_region);
2569 partition_to_region(to, vdo, &write_regions[0]);
2570 dm_kcopyd_copy(vdo->partition_copier, &read_region, 1, write_regions, 0,
2571 copy_callback, parent);
2572 }
2573
2574 /**
2575 * grow_physical_callback() - Callback to initiate a grow physical.
2576 * @completion: The admin completion.
2577 *
2578 * Registered in perform_grow_physical().
2579 */
2580 static void grow_physical_callback(struct vdo_completion *completion)
2581 {
2582 struct vdo *vdo = completion->vdo;
2583 int result;
2584
2585 assert_admin_phase_thread(vdo, __func__);
2586
2587 switch (advance_phase(vdo)) {
2588 case GROW_PHYSICAL_PHASE_START:
2589 if (vdo_is_read_only(vdo)) {
2590 vdo_log_error_strerror(VDO_READ_ONLY,
2591 "Can't grow physical size of a read-only VDO");
2592 vdo_set_completion_result(completion, VDO_READ_ONLY);
2593 break;
2594 }
2595
2596 result = vdo_start_operation(&vdo->admin.state,
2597 VDO_ADMIN_STATE_SUSPENDED_OPERATION);
2598 if (result != VDO_SUCCESS) {
2599 vdo_continue_completion(completion, result);
2600 return;
2601 }
2602
2603 /* Copy the journal into the new layout. */
2604 copy_partition(vdo, VDO_RECOVERY_JOURNAL_PARTITION, completion);
2605 return;
2606
2607 case GROW_PHYSICAL_PHASE_COPY_SUMMARY:
2608 copy_partition(vdo, VDO_SLAB_SUMMARY_PARTITION, completion);
2609 return;
2610
2611 case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS:
2612 vdo_uninitialize_layout(&vdo->layout);
2613 vdo->layout = vdo->next_layout;
2614 vdo_forget(vdo->next_layout.head);
2615 vdo->states.vdo.config.physical_blocks = vdo->layout.size;
2616 vdo_update_slab_depot_size(vdo->depot);
2617 vdo_save_components(vdo, completion);
2618 return;
2619
2620 case GROW_PHYSICAL_PHASE_USE_NEW_SLABS:
2621 vdo_use_new_slabs(vdo->depot, completion);
2622 return;
2623
2624 case GROW_PHYSICAL_PHASE_END:
2625 vdo->depot->summary_origin =
2626 vdo_get_known_partition(&vdo->layout,
2627 VDO_SLAB_SUMMARY_PARTITION)->offset;
2628 vdo->recovery_journal->origin =
2629 vdo_get_known_partition(&vdo->layout,
2630 VDO_RECOVERY_JOURNAL_PARTITION)->offset;
2631 break;
2632
2633 case GROW_PHYSICAL_PHASE_ERROR:
2634 vdo_enter_read_only_mode(vdo, completion->result);
2635 break;
2636
2637 default:
2638 vdo_set_completion_result(completion, UDS_BAD_STATE);
2639 }
2640
2641 vdo_uninitialize_layout(&vdo->next_layout);
2642 finish_operation_callback(completion);
2643 }
2644
2645 /**
2646 * handle_physical_growth_error() - Handle an error during the grow physical process.
2647 * @completion: The sub-task completion.
2648 */
2649 static void handle_physical_growth_error(struct vdo_completion *completion)
2650 {
2651 completion->vdo->admin.phase = GROW_PHYSICAL_PHASE_ERROR;
2652 grow_physical_callback(completion);
2653 }
2654
2655 /**
2656 * perform_grow_physical() - Grow the physical size of the vdo.
2657 * @vdo: The vdo to resize.
2658 * @new_physical_blocks: The new physical size in blocks.
2659 *
2660 * Context: This method may only be called when the vdo has been suspended and must not be called
2661 * from a base thread.
2662 *
2663 * Return: VDO_SUCCESS or an error.
2664 */
2665 static int perform_grow_physical(struct vdo *vdo, block_count_t new_physical_blocks)
2666 {
2667 int result;
2668 block_count_t new_depot_size, prepared_depot_size;
2669 block_count_t old_physical_blocks = vdo->states.vdo.config.physical_blocks;
2670
2671 /* Skip any noop grows. */
2672 if (old_physical_blocks == new_physical_blocks)
2673 return VDO_SUCCESS;
2674
2675 if (new_physical_blocks != vdo->next_layout.size) {
2676 /*
2677 * Either the VDO isn't prepared to grow, or it was prepared to grow to a different
2678 * size. Doing this check here relies on the fact that the call to this method is
2679 * done under the dmsetup message lock.
2680 */
2681 vdo_uninitialize_layout(&vdo->next_layout);
2682 vdo_abandon_new_slabs(vdo->depot);
2683 return VDO_PARAMETER_MISMATCH;
2684 }
2685
2686 /* Validate that we are prepared to grow appropriately. */
2687 new_depot_size =
2688 vdo_get_known_partition(&vdo->next_layout, VDO_SLAB_DEPOT_PARTITION)->count;
2689 prepared_depot_size = (vdo->depot->new_slabs == NULL) ? 0 : vdo->depot->new_size;
2690 if (prepared_depot_size != new_depot_size)
2691 return VDO_PARAMETER_MISMATCH;
2692
2693 result = perform_admin_operation(vdo, GROW_PHYSICAL_PHASE_START,
2694 grow_physical_callback,
2695 handle_physical_growth_error, "grow physical");
2696 if (result != VDO_SUCCESS)
2697 return result;
2698
2699 vdo_log_info("Physical block count was %llu, now %llu",
2700 (unsigned long long) old_physical_blocks,
2701 (unsigned long long) new_physical_blocks);
2702 return VDO_SUCCESS;
2703 }
2704
2705 /**
2706 * apply_new_vdo_configuration() - Attempt to make any configuration changes from the table being
2707 * resumed.
2708 * @vdo: The vdo being resumed.
2709 * @config: The new device configuration derived from the table with which the vdo is being
2710 * resumed.
2711 *
2712 * Return: VDO_SUCCESS or an error.
2713 */
2714 static int __must_check apply_new_vdo_configuration(struct vdo *vdo,
2715 struct device_config *config)
2716 {
2717 int result;
2718
2719 result = perform_grow_logical(vdo, config->logical_blocks);
2720 if (result != VDO_SUCCESS) {
2721 vdo_log_error("grow logical operation failed, result = %d", result);
2722 return result;
2723 }
2724
2725 result = perform_grow_physical(vdo, config->physical_blocks);
2726 if (result != VDO_SUCCESS)
2727 vdo_log_error("resize operation failed, result = %d", result);
2728
2729 return result;
2730 }
2731
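/**
 * vdo_preresume_registered() - Load and resume a vdo, with the instance thread registered.
 * @ti: The target being resumed.
 * @vdo: The vdo being resumed.
 *
 * Checks that the backing device is large enough, starts the vdo if it has only been pre-loaded,
 * applies any size changes from the new table, and then resumes the vdo.
 *
 * Return: VDO_SUCCESS or an error.
 */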
2732 static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo)
2733 {
2734 struct device_config *config = ti->private;
2735 const char *device_name = vdo_get_device_name(ti);
2736 block_count_t backing_blocks;
2737 int result;
2738
2739 backing_blocks = get_underlying_device_block_count(vdo);
2740 if (backing_blocks < config->physical_blocks) {
2741 /* FIXME: can this still happen? */
2742 vdo_log_error("resume of device '%s' failed: backing device has %llu blocks but VDO physical size is %llu blocks",
2743 device_name, (unsigned long long) backing_blocks,
2744 (unsigned long long) config->physical_blocks);
2745 return -EINVAL;
2746 }
2747
2748 if (vdo_get_admin_state(vdo) == VDO_ADMIN_STATE_PRE_LOADED) {
2749 vdo_log_info("starting device '%s'", device_name);
2750 result = perform_admin_operation(vdo, LOAD_PHASE_START, load_callback,
2751 handle_load_error, "load");
2752 if (result == VDO_UNSUPPORTED_VERSION) {
2753 /*
2754 * A component version is not supported. This can happen when the
2755 * recovery journal metadata is in an old version format. Abort the
2756 * load without saving the state.
2757 */
2758 vdo->suspend_type = VDO_ADMIN_STATE_SUSPENDING;
2759 perform_admin_operation(vdo, SUSPEND_PHASE_START,
2760 suspend_callback, suspend_callback,
2761 "suspend");
2762 return result;
2763 }
2764
2765 if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
2766 /*
2767 * Something has gone very wrong. Make sure everything has drained and
2768 * leave the device in an unresumable state.
2769 */
2770 vdo_log_error_strerror(result,
2771 "Start failed, could not load VDO metadata");
2772 vdo->suspend_type = VDO_ADMIN_STATE_STOPPING;
2773 perform_admin_operation(vdo, SUSPEND_PHASE_START,
2774 suspend_callback, suspend_callback,
2775 "suspend");
2776 return result;
2777 }
2778
2779 /* Even if the VDO is read-only, it is now able to handle read requests. */
2780 vdo_log_info("device '%s' started", device_name);
2781 }
2782
2783 vdo_log_info("resuming device '%s'", device_name);
2784
2785 /* If this fails, the VDO was not in a state to be resumed. This should never happen. */
2786 result = apply_new_vdo_configuration(vdo, config);
2787 BUG_ON(result == VDO_INVALID_ADMIN_STATE);
2788
2789 /*
2790 * Now that we've tried to modify the vdo, the new config *is* the config, whether the
2791 * modifications worked or not.
2792 */
2793 vdo->device_config = config;
2794
2795 /*
2796 * Any error here is highly unexpected and the state of the vdo is questionable, so we mark
2797 * it read-only in memory. Because we are suspended, the read-only state will not be
2798 * written to disk.
2799 */
2800 if (result != VDO_SUCCESS) {
2801 vdo_log_error_strerror(result,
2802 "Commit of modifications to device '%s' failed",
2803 device_name);
2804 vdo_enter_read_only_mode(vdo, result);
2805 return result;
2806 }
2807
2808 if (vdo_get_admin_state(vdo)->normal) {
2809 /* The VDO was just started, so we don't need to resume it. */
2810 return VDO_SUCCESS;
2811 }
2812
2813 result = perform_admin_operation(vdo, RESUME_PHASE_START, resume_callback,
2814 resume_callback, "resume");
2815 BUG_ON(result == VDO_INVALID_ADMIN_STATE);
2816 if (result == VDO_READ_ONLY) {
2817 /* Even if the vdo is read-only, it has still resumed. */
2818 result = VDO_SUCCESS;
2819 }
2820
2821 if (result != VDO_SUCCESS)
2822 vdo_log_error("resume of device '%s' failed with error: %d", device_name,
2823 result);
2824
2825 return result;
2826 }
2827
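/**
 * vdo_preresume() - The device-mapper preresume handler.
 * @ti: The target being resumed.
 *
 * Return: 0 or an errno.
 */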
2828 static int vdo_preresume(struct dm_target *ti)
2829 {
2830 struct registered_thread instance_thread;
2831 struct vdo *vdo = get_vdo_for_target(ti);
2832 int result;
2833
2834 vdo_register_thread_device_id(&instance_thread, &vdo->instance);
2835 result = vdo_preresume_registered(ti, vdo);
2836 if ((result == VDO_PARAMETER_MISMATCH) || (result == VDO_INVALID_ADMIN_STATE) ||
2837 (result == VDO_UNSUPPORTED_VERSION))
2838 result = -EINVAL;
2839 vdo_unregister_thread_device_id();
2840 return vdo_status_to_errno(result);
2841 }
2842
2843 static void vdo_resume(struct dm_target *ti)
2844 {
2845 struct registered_thread instance_thread;
2846
2847 vdo_register_thread_device_id(&instance_thread,
2848 &get_vdo_for_target(ti)->instance);
2849 vdo_log_info("device '%s' resumed", vdo_get_device_name(ti));
2850 vdo_unregister_thread_device_id();
2851 }
2852
2853 /*
2854 * If anything changes that affects how user tools will interact with vdo, update the version
2855 * number and make sure documentation about the change is complete so tools can properly update
2856 * their management code.
2857 */
2858 static struct target_type vdo_target_bio = {
2859 .features = DM_TARGET_SINGLETON,
2860 .name = "vdo",
2861 .version = { 9, 1, 0 },
2862 .module = THIS_MODULE,
2863 .ctr = vdo_ctr,
2864 .dtr = vdo_dtr,
2865 .io_hints = vdo_io_hints,
2866 .iterate_devices = vdo_iterate_devices,
2867 .map = vdo_map_bio,
2868 .message = vdo_message,
2869 .status = vdo_status,
2870 .presuspend = vdo_presuspend,
2871 .postsuspend = vdo_postsuspend,
2872 .preresume = vdo_preresume,
2873 .resume = vdo_resume,
2874 };
2875
2876 static bool dm_registered;
2877
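/**
 * vdo_module_destroy() - Unregister the target (if registered) and release the instance number
 *                        tracking state; used on module exit and on failed initialization.
 */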
2878 static void vdo_module_destroy(void)
2879 {
2880 vdo_log_debug("unloading");
2881
2882 if (dm_registered)
2883 dm_unregister_target(&vdo_target_bio);
2884
2885 VDO_ASSERT_LOG_ONLY(instances.count == 0,
2886 "should have no instance numbers still in use, but have %u",
2887 instances.count);
2888 vdo_free(instances.words);
2889 memset(&instances, 0, sizeof(struct instance_tracker));
2890 }
2891
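/**
 * vdo_init() - Initialize the module: set up tracking and registries, register the VDO status
 *              codes, and register the "vdo" device-mapper target.
 *
 * Return: 0 on success or an error code.
 */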
2892 static int __init vdo_init(void)
2893 {
2894 int result = 0;
2895
2896 /* Memory tracking must be initialized first for accurate accounting. */
2897 vdo_memory_init();
2898 vdo_initialize_threads_mutex();
2899 vdo_initialize_thread_device_registry();
2900 vdo_initialize_device_registry_once();
2901
2902 /* Add VDO errors to the set of errors registered by the indexer. */
2903 result = vdo_register_status_codes();
2904 if (result != VDO_SUCCESS) {
2905 vdo_log_error("vdo_register_status_codes failed %d", result);
2906 vdo_module_destroy();
2907 return result;
2908 }
2909
2910 result = dm_register_target(&vdo_target_bio);
2911 if (result < 0) {
2912 vdo_log_error("dm_register_target failed %d", result);
2913 vdo_module_destroy();
2914 return result;
2915 }
2916 dm_registered = true;
2917
2918 return result;
2919 }
2920
2921 static void __exit vdo_exit(void)
2922 {
2923 vdo_module_destroy();
2924 /* Memory tracking cleanup must be done last. */
2925 vdo_memory_exit();
2926 }
2927
2928 module_init(vdo_init);
2929 module_exit(vdo_exit);
2930
2931 module_param_named(log_level, vdo_log_level, uint, 0644);
2932 MODULE_PARM_DESC(log_level, "Log level for log messages");
2933
2934 MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication");
2935 MODULE_AUTHOR("Red Hat, Inc.");
2936 MODULE_LICENSE("GPL");
2937