1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2023 Red Hat
4  */
5 #include "delta-index.h"
6 
7 #include <linux/bitops.h>
8 #include <linux/bits.h>
9 #include <linux/compiler.h>
10 #include <linux/limits.h>
11 #include <linux/log2.h>
12 
13 #include "cpu.h"
14 #include "errors.h"
15 #include "logger.h"
16 #include "memory-alloc.h"
17 #include "numeric.h"
18 #include "permassert.h"
19 #include "string-utils.h"
20 #include "time-utils.h"
21 
22 #include "config.h"
23 #include "indexer.h"
24 
25 /*
26  * The entries in a delta index could be stored in a single delta list, but to reduce search times
27  * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
28  * memory managed by the delta_zone structure. The delta_zone can move the data around within its
29  * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
30  * the volume index can contain over a million delta lists, we want to be efficient with the size
31  * of the delta list header information. This information is encoded into 16 bytes per list. The
32  * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
33  * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
34  * sufficient to store the size of a delta list.
35  *
36  * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
37  * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
38  * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
39  * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
40  * number corresponds to its index in the array.
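 * For example, stream bit 12 is bit 4 (0x10) of byte 1, and a 3-bit field starting at stream
 * bit 6 spans the top two bits of byte 0 and the bottom bit of byte 1.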
41  *
42  * A standard delta list entry is stored as a fixed length payload (the value) followed by a
43  * variable length key (the delta). A collision entry is used when two block names have the same
44  * delta list address. A collision entry always follows a standard entry for the hash with which it
45  * collides, and is encoded with DELTA == 0 plus an additional 256-bit field at the end
46  * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
47  * indicates a normal entry.
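 * As an illustration (not an exhaustive description of the encoding), a list holding keys 3, 3
 * (a collision), and 10 would contain a value+delta(3) entry, then a value+delta(0) entry
 * followed by the 256-bit collision name, then a value+delta(7) entry.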
48  *
49  * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
50  * used by small deltas. The Huffman code is specified by three parameters, which can be computed
51  * from the desired mean delta when the index is full. (See compute_coding_constants() for
52  * details.)
53  *
54  * The bit field utilities used to read and write delta entries assume that it is possible to read
55  * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
56  * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
57  * numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit
58  * stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
59  * delta list could cause this step to run off the end of the delta_zone memory, so as extra
60  * protection against this happening, the tail guard list is set to all ones.
61  *
62  * The delta_index supports two different forms. The mutable form is created by
63  * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
64  * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
65  * cached) chapter index pages. The immutable form does not allocate delta list headers or
66  * temporary offsets, and thus is somewhat more memory efficient.
67  */
68 
69 /*
70  * This is the largest field size supported by get_field() and set_field(). Any field that is
71  * larger is not guaranteed to fit in a single byte-aligned u32.
72  */
73 #define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1)
74 
75 /*
76  * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
77  * is larger is not guaranteed to fit in a single byte-aligned u64.
78  */
79 #define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1)
80 
81 /*
82  * This is the number of guard bytes needed at the end of the memory byte array when using the bit
83  * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
84  * bytes beyond the end of the desired field. The definition is written to make it clear how this
85  * value is derived.
86  */
87 #define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1)
88 
89 /* The number of guard bits that are needed in the tail guard list */
90 #define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE)
91 
92 /*
93  * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
94  * buffer of this size can be used with move_bits().
95  */
96 #define DELTA_LIST_MAX_BYTE_COUNT					\
97 	((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
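/* With a 16-bit list size, this works out to (65535 + 8) / 8 + 7 = 8199 bytes. */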
98 
99 /* The number of extra bytes and bits needed to store a collision entry */
100 #define COLLISION_BYTES UDS_RECORD_NAME_SIZE
101 #define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE)
102 
103 /*
104  * Immutable delta lists are packed into pages containing a header that encodes the delta list
105  * information into 19 bits per list, enough to hold a bit offset anywhere in a 64 KB page.
106  */
107 #define IMMUTABLE_HEADER_SIZE 19
108 
109 /*
110  * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
111  * number to increment when the format of the data changes.
112  */
113 #define MAGIC_SIZE 8
114 
115 static const char DELTA_INDEX_MAGIC[] = "DI-00002";
116 
117 struct delta_index_header {
118 	char magic[MAGIC_SIZE];
119 	u32 zone_number;
120 	u32 zone_count;
121 	u32 first_list;
122 	u32 list_count;
123 	u64 record_count;
124 	u64 collision_count;
125 };
126 
127 /*
128  * Header data used for immutable delta index pages. This data is followed by the delta list offset
129  * table.
130  */
131 struct delta_page_header {
132 	/* Externally-defined nonce */
133 	u64 nonce;
134 	/* The virtual chapter number */
135 	u64 virtual_chapter_number;
136 	/* Index of the first delta list on the page */
137 	u16 first_list;
138 	/* Number of delta lists on the page */
139 	u16 list_count;
140 } __packed;
141 
142 static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
143 {
144 	return delta_list->start / BITS_PER_BYTE;
145 }
146 
147 static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
148 {
149 	unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;
150 
151 	return BITS_TO_BYTES(bit_offset + delta_list->size);
152 }
153 
154 static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
155 				 u32 last)
156 {
157 	struct delta_list *delta_list;
158 	u64 new_start;
159 
160 	if (first == last) {
161 		/* Only one list is moving, and we know there is space. */
162 		delta_list = &delta_zone->delta_lists[first];
163 		new_start = delta_zone->new_offsets[first];
164 		if (delta_list->start != new_start) {
165 			u64 source;
166 			u64 destination;
167 
168 			source = get_delta_list_byte_start(delta_list);
169 			delta_list->start = new_start;
170 			destination = get_delta_list_byte_start(delta_list);
171 			memmove(delta_zone->memory + destination,
172 				delta_zone->memory + source,
173 				get_delta_list_byte_size(delta_list));
174 		}
175 	} else {
176 		/*
177 		 * There is more than one list. Divide the problem in half, and use recursive calls
178 		 * to process each half. Note that after this computation, first <= middle, and
179 		 * middle < last.
180 		 */
181 		u32 middle = (first + last) / 2;
182 
183 		delta_list = &delta_zone->delta_lists[middle];
184 		new_start = delta_zone->new_offsets[middle];
185 
186 		/*
187 		 * The direction that our middle list is moving determines which half of the
188 		 * problem must be processed first.
189 		 */
190 		if (new_start > delta_list->start) {
191 			rebalance_delta_zone(delta_zone, middle + 1, last);
192 			rebalance_delta_zone(delta_zone, first, middle);
193 		} else {
194 			rebalance_delta_zone(delta_zone, first, middle);
195 			rebalance_delta_zone(delta_zone, middle + 1, last);
196 		}
197 	}
198 }
199 
200 static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
201 {
202 	/* Round up so that each zone is a multiple of 64K in size. */
203 	size_t ALLOC_BOUNDARY = 64 * 1024;
204 
205 	return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
206 }
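/*
 * For example (illustrative numbers only): splitting 1,000,000 bytes across 3 zones gives
 * 333,333 bytes per zone, which rounds up to 393,216 bytes (6 * 64 KB) for each zone.
 */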
207 
208 void uds_reset_delta_index(const struct delta_index *delta_index)
209 {
210 	unsigned int z;
211 
212 	/*
213 	 * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
214 	 * before the first real entry and one after so that we don't need to bounds check the
215 	 * array access when calculating preceding and following gap sizes.
216 	 */
217 	for (z = 0; z < delta_index->zone_count; z++) {
218 		u64 list_bits;
219 		u64 spacing;
220 		u64 offset;
221 		unsigned int i;
222 		struct delta_zone *zone = &delta_index->delta_zones[z];
223 		struct delta_list *delta_lists = zone->delta_lists;
224 
225 		/* Zeroing the delta list headers initializes the head guard list correctly. */
226 		memset(delta_lists, 0,
227 		       (zone->list_count + 2) * sizeof(struct delta_list));
228 
229 		/* Set all the bits in the end guard list. */
230 		list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
231 		delta_lists[zone->list_count + 1].start = list_bits;
232 		delta_lists[zone->list_count + 1].size = GUARD_BITS;
233 		memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
234 		       POST_FIELD_GUARD_BYTES);
235 
236 		/* Evenly space out the real delta lists by setting regular offsets. */
237 		spacing = list_bits / zone->list_count;
238 		offset = spacing / 2;
239 		for (i = 1; i <= zone->list_count; i++) {
240 			delta_lists[i].start = offset;
241 			offset += spacing;
242 		}
243 
244 		/* Update the statistics. */
245 		zone->discard_count += zone->record_count;
246 		zone->record_count = 0;
247 		zone->collision_count = 0;
248 	}
249 }
250 
251 /* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
252  * three parameters:
253  *
254  *  MINBITS   The number of bits in the smallest code
255  *  BASE      The number of values coded using a code of length MINBITS
256  *  INCR      The number of values coded by using one additional bit
257  *
258  * These parameters are related by this equation:
259  *
260  *	BASE + INCR == 1 << MINBITS
261  *
262  * The math for the Huffman code of an exponential distribution says that
263  *
264  *	INCR = log(2) * MEAN_DELTA
265  *
266  * Then use the smallest MINBITS value so that
267  *
268  *	(1 << MINBITS) > INCR
269  *
270  * And then
271  *
272  *	BASE = (1 << MINBITS) - INCR
273  *
274  * Now the index can generate a code such that
275  * - The first BASE values code using MINBITS bits.
276  * - The next INCR values code using MINBITS+1 bits.
277  * - The next INCR values code using MINBITS+2 bits.
278  * - (and so on).
279  */
280 static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
281 {
282 	/*
283 	 * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
284 	 * floating point, use a really good integer approximation.
285 	 */
286 	*incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
287 	*min_bits = bits_per(*incr_keys + 1);
288 	*min_keys = (1 << *min_bits) - *incr_keys;
289 }
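/*
 * A worked example (illustrative, not a value taken from this file): for a mean delta of 4096,
 * the integer approximation gives incr_keys = 2839 (log(2) * 4096 is about 2839.1), so
 * min_bits = 12 and min_keys = 4096 - 2839 = 1257. Deltas 0..1256 then encode in 12 bits, the
 * next 2839 deltas in 13 bits, the next 2839 in 14 bits, and so on.
 */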
290 
291 void uds_uninitialize_delta_index(struct delta_index *delta_index)
292 {
293 	unsigned int z;
294 
295 	if (delta_index->delta_zones == NULL)
296 		return;
297 
298 	for (z = 0; z < delta_index->zone_count; z++) {
299 		vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets));
300 		vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists));
301 		vdo_free(vdo_forget(delta_index->delta_zones[z].memory));
302 	}
303 
304 	vdo_free(delta_index->delta_zones);
305 	memset(delta_index, 0, sizeof(struct delta_index));
306 }
307 
308 static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
309 				 u32 first_list, u32 list_count, u32 mean_delta,
310 				 u32 payload_bits, u8 tag)
311 {
312 	int result;
313 
314 	result = vdo_allocate(size, u8, "delta list", &delta_zone->memory);
315 	if (result != VDO_SUCCESS)
316 		return result;
317 
318 	result = vdo_allocate(list_count + 2, u64, "delta list temp",
319 			      &delta_zone->new_offsets);
320 	if (result != VDO_SUCCESS)
321 		return result;
322 
323 	/* Allocate the delta lists. */
324 	result = vdo_allocate(list_count + 2, struct delta_list, "delta lists",
325 			      &delta_zone->delta_lists);
326 	if (result != VDO_SUCCESS)
327 		return result;
328 
329 	compute_coding_constants(mean_delta, &delta_zone->min_bits,
330 				 &delta_zone->min_keys, &delta_zone->incr_keys);
331 	delta_zone->value_bits = payload_bits;
332 	delta_zone->buffered_writer = NULL;
333 	delta_zone->size = size;
334 	delta_zone->rebalance_time = 0;
335 	delta_zone->rebalance_count = 0;
336 	delta_zone->record_count = 0;
337 	delta_zone->collision_count = 0;
338 	delta_zone->discard_count = 0;
339 	delta_zone->overflow_count = 0;
340 	delta_zone->first_list = first_list;
341 	delta_zone->list_count = list_count;
342 	delta_zone->tag = tag;
343 
344 	return UDS_SUCCESS;
345 }
346 
347 int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
348 			       u32 list_count, u32 mean_delta, u32 payload_bits,
349 			       size_t memory_size, u8 tag)
350 {
351 	int result;
352 	unsigned int z;
353 	size_t zone_memory;
354 
355 	result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones",
356 			      &delta_index->delta_zones);
357 	if (result != VDO_SUCCESS)
358 		return result;
359 
360 	delta_index->zone_count = zone_count;
361 	delta_index->list_count = list_count;
362 	delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
363 	delta_index->memory_size = 0;
364 	delta_index->mutable = true;
365 	delta_index->tag = tag;
366 
367 	for (z = 0; z < zone_count; z++) {
368 		u32 lists_in_zone = delta_index->lists_per_zone;
369 		u32 first_list_in_zone = z * lists_in_zone;
370 
371 		if (z == zone_count - 1) {
372 			/*
373 			 * The last zone gets fewer lists if zone_count doesn't evenly divide
374 			 * list_count. We'll have an underflow if the assertion below doesn't hold.
375 			 */
376 			if (delta_index->list_count <= first_list_in_zone) {
377 				uds_uninitialize_delta_index(delta_index);
378 				return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
379 							      "%u delta lists not enough for %u zones",
380 							      list_count, zone_count);
381 			}
382 			lists_in_zone = delta_index->list_count - first_list_in_zone;
383 		}
384 
385 		zone_memory = get_zone_memory_size(zone_count, memory_size);
386 		result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
387 					       first_list_in_zone, lists_in_zone,
388 					       mean_delta, payload_bits, tag);
389 		if (result != UDS_SUCCESS) {
390 			uds_uninitialize_delta_index(delta_index);
391 			return result;
392 		}
393 
394 		delta_index->memory_size +=
395 			(sizeof(struct delta_zone) + zone_memory +
396 			 (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
397 	}
398 
399 	uds_reset_delta_index(delta_index);
400 	return UDS_SUCCESS;
401 }
402 
403 /* Read a bit field from an arbitrary bit boundary. */
404 static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
405 {
406 	const void *addr = memory + offset / BITS_PER_BYTE;
407 
408 	return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
409 }
410 
411 /* Write a bit field to an arbitrary bit boundary. */
412 static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
413 {
414 	void *addr = memory + offset / BITS_PER_BYTE;
415 	int shift = offset % BITS_PER_BYTE;
416 	u32 data = get_unaligned_le32(addr);
417 
418 	data &= ~(((1 << size) - 1) << shift);
419 	data |= value << shift;
420 	put_unaligned_le32(data, addr);
421 }
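/*
 * Illustrative round trip (hypothetical values): set_field(0x15, memory, 11, 6) writes the
 * 6-bit value into bits 11..16, which straddle bytes 1 and 2 of the array, and a subsequent
 * get_field(memory, 11, 6) returns 0x15.
 */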
422 
423 /* Get the bit offset to the immutable delta list header. */
424 static inline u32 get_immutable_header_offset(u32 list_number)
425 {
426 	return sizeof(struct delta_page_header) * BITS_PER_BYTE +
427 		list_number * IMMUTABLE_HEADER_SIZE;
428 }
429 
430 /* Get the bit offset to the start of the immutable delta list bit stream. */
431 static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
432 {
433 	return get_field(memory, get_immutable_header_offset(list_number),
434 			 IMMUTABLE_HEADER_SIZE);
435 }
436 
437 /* Set the bit offset to the start of the immutable delta list bit stream. */
438 static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
439 {
440 	set_field(start, memory, get_immutable_header_offset(list_number),
441 		  IMMUTABLE_HEADER_SIZE);
442 }
443 
444 static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
445 				    u8 *memory, size_t memory_size)
446 {
447 	unsigned int i;
448 
449 	/*
450 	 * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
451 	 * entire volume at least once.
452 	 */
453 	if (nonce != expected_nonce)
454 		return false;
455 
456 	/* Verify that the number of delta lists can fit in the page. */
457 	if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
458 			  BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
459 		return false;
460 
461 	/*
462 	 * Verify that the first delta list is immediately after the last delta
463 	 * list header.
464 	 */
465 	if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
466 		return false;
467 
468 	/* Verify that the lists are in the correct order. */
469 	for (i = 0; i < list_count; i++) {
470 		if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
471 			return false;
472 	}
473 
474 	/*
475 	 * Verify that the last list ends on the page, and that there is room
476 	 * for the post-field guard bits.
477 	 */
478 	if (get_immutable_start(memory, list_count) >
479 	    (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
480 		return false;
481 
482 	/* Verify that the guard bytes are correctly set to all ones. */
483 	for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
484 		if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
485 			return false;
486 	}
487 
488 	/* All verifications passed. */
489 	return true;
490 }
491 
492 /* Initialize a delta index page to refer to a supplied page. */
493 int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
494 				    u64 expected_nonce, u32 mean_delta, u32 payload_bits,
495 				    u8 *memory, size_t memory_size)
496 {
497 	u64 nonce;
498 	u64 vcn;
499 	u64 first_list;
500 	u64 list_count;
501 	struct delta_page_header *header = (struct delta_page_header *) memory;
502 	struct delta_zone *delta_zone = &delta_index_page->delta_zone;
503 	const u8 *nonce_addr = (const u8 *) &header->nonce;
504 	const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
505 	const u8 *first_list_addr = (const u8 *) &header->first_list;
506 	const u8 *list_count_addr = (const u8 *) &header->list_count;
507 
508 	/* First assume that the header is little endian. */
509 	nonce = get_unaligned_le64(nonce_addr);
510 	vcn = get_unaligned_le64(vcn_addr);
511 	first_list = get_unaligned_le16(first_list_addr);
512 	list_count = get_unaligned_le16(list_count_addr);
513 	if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
514 				     memory_size)) {
515 		/* If that fails, try big endian. */
516 		nonce = get_unaligned_be64(nonce_addr);
517 		vcn = get_unaligned_be64(vcn_addr);
518 		first_list = get_unaligned_be16(first_list_addr);
519 		list_count = get_unaligned_be16(list_count_addr);
520 		if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
521 					     memory_size)) {
522 			/*
523 			 * Both attempts failed. Do not log this as an error, because it can happen
524 			 * during a rebuild if we haven't written the entire volume at least once.
525 			 */
526 			return UDS_CORRUPT_DATA;
527 		}
528 	}
529 
530 	delta_index_page->delta_index.delta_zones = delta_zone;
531 	delta_index_page->delta_index.zone_count = 1;
532 	delta_index_page->delta_index.list_count = list_count;
533 	delta_index_page->delta_index.lists_per_zone = list_count;
534 	delta_index_page->delta_index.mutable = false;
535 	delta_index_page->delta_index.tag = 'p';
536 	delta_index_page->virtual_chapter_number = vcn;
537 	delta_index_page->lowest_list_number = first_list;
538 	delta_index_page->highest_list_number = first_list + list_count - 1;
539 
540 	compute_coding_constants(mean_delta, &delta_zone->min_bits,
541 				 &delta_zone->min_keys, &delta_zone->incr_keys);
542 	delta_zone->value_bits = payload_bits;
543 	delta_zone->memory = memory;
544 	delta_zone->delta_lists = NULL;
545 	delta_zone->new_offsets = NULL;
546 	delta_zone->buffered_writer = NULL;
547 	delta_zone->size = memory_size;
548 	delta_zone->rebalance_time = 0;
549 	delta_zone->rebalance_count = 0;
550 	delta_zone->record_count = 0;
551 	delta_zone->collision_count = 0;
552 	delta_zone->discard_count = 0;
553 	delta_zone->overflow_count = 0;
554 	delta_zone->first_list = 0;
555 	delta_zone->list_count = list_count;
556 	delta_zone->tag = 'p';
557 
558 	return UDS_SUCCESS;
559 }
560 
561 /* Read a large bit field from an arbitrary bit boundary. */
562 static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
563 {
564 	const void *addr = memory + offset / BITS_PER_BYTE;
565 
566 	return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
567 }
568 
569 /* Write a large bit field to an arbitrary bit boundary. */
570 static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
571 {
572 	void *addr = memory + offset / BITS_PER_BYTE;
573 	u8 shift = offset % BITS_PER_BYTE;
574 	u64 data = get_unaligned_le64(addr);
575 
576 	data &= ~(((1UL << size) - 1) << shift);
577 	data |= value << shift;
578 	put_unaligned_le64(data, addr);
579 }
580 
581 /* Set a sequence of bits to all zeros. */
582 static inline void set_zero(u8 *memory, u64 offset, u32 size)
583 {
584 	if (size > 0) {
585 		u8 *addr = memory + offset / BITS_PER_BYTE;
586 		u8 shift = offset % BITS_PER_BYTE;
587 		u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;
588 
589 		*addr++ &= ~(((1 << count) - 1) << shift);
590 		for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
591 			*addr++ = 0;
592 
593 		if (size > 0)
594 			*addr &= 0xFF << size;
595 	}
596 }
597 
598 /*
599  * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
600  * size and memory offsets are measured in bits.
601  */
602 static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
603 {
604 	const u8 *source;
605 	u8 *destination;
606 	u8 offset;
607 	u8 count;
608 	u64 field;
609 
610 	/* Start by moving one field that ends on a destination int boundary. */
611 	count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
612 	field = get_big_field(from, from_offset, count);
613 	set_big_field(field, to, to_offset, count);
614 	from_offset += count;
615 	to_offset += count;
616 	size -= count;
617 
618 	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
619 	offset = from_offset % BITS_PER_TYPE(u32);
620 	source = from + (from_offset - offset) / BITS_PER_BYTE;
621 	destination = to + to_offset / BITS_PER_BYTE;
622 	while (size > MAX_BIG_FIELD_BITS) {
623 		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
624 		source += sizeof(u32);
625 		destination += sizeof(u32);
626 		from_offset += BITS_PER_TYPE(u32);
627 		to_offset += BITS_PER_TYPE(u32);
628 		size -= BITS_PER_TYPE(u32);
629 	}
630 
631 	/* Finish up by moving any remaining bits. */
632 	if (size > 0) {
633 		field = get_big_field(from, from_offset, size);
634 		set_big_field(field, to, to_offset, size);
635 	}
636 }
637 
638 /*
639  * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
640  * size and memory offsets are measured in bits.
641  */
642 static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
643 {
644 	const u8 *source;
645 	u8 *destination;
646 	u8 offset;
647 	u8 count;
648 	u64 field;
649 
650 	/* Start by moving one field that begins on a destination int boundary. */
651 	count = (to_offset + size) % BITS_PER_TYPE(u32);
652 	if (count > 0) {
653 		size -= count;
654 		field = get_big_field(from, from_offset + size, count);
655 		set_big_field(field, to, to_offset + size, count);
656 	}
657 
658 	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
659 	offset = (from_offset + size) % BITS_PER_TYPE(u32);
660 	source = from + (from_offset + size - offset) / BITS_PER_BYTE;
661 	destination = to + (to_offset + size) / BITS_PER_BYTE;
662 	while (size > MAX_BIG_FIELD_BITS) {
663 		source -= sizeof(u32);
664 		destination -= sizeof(u32);
665 		size -= BITS_PER_TYPE(u32);
666 		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
667 	}
668 
669 	/* Finish up by moving any remaining bits. */
670 	if (size > 0) {
671 		field = get_big_field(from, from_offset, size);
672 		set_big_field(field, to, to_offset, size);
673 	}
674 }
675 
676 /*
677  * Move bits from one field to another. When the fields overlap, behave as if we first move all the
678  * bits from the source to a temporary value, and then move all the bits from the temporary value
679  * to the destination. The size and memory offsets are measured in bits.
680  */
681 static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
682 {
683 	u64 field;
684 
685 	/* A small move doesn't require special handling. */
686 	if (size <= MAX_BIG_FIELD_BITS) {
687 		if (size > 0) {
688 			field = get_big_field(from, from_offset, size);
689 			set_big_field(field, to, to_offset, size);
690 		}
691 
692 		return;
693 	}
694 
695 	if (from_offset > to_offset)
696 		move_bits_down(from, from_offset, to, to_offset, size);
697 	else
698 		move_bits_up(from, from_offset, to, to_offset, size);
699 }
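/*
 * For example (hypothetical offsets): move_bits(buf, 40, buf, 13, 7000) shifts a 7000-bit
 * region 27 bits toward the start of the same buffer; because the source is at the higher bit
 * address, move_bits_down() copies from the low end first so no unread source bits are
 * overwritten.
 */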
700 
701 /*
702  * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
703  * lists (starting with a specified list index) is copied from the mutable delta index into a
704  * memory page used in the immutable index. The number of lists copied onto the page is returned in
705  * list_count.
706  */
707 int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
708 			      u8 *memory, size_t memory_size, u64 virtual_chapter_number,
709 			      u32 first_list, u32 *list_count)
710 {
711 	const struct delta_zone *delta_zone;
712 	struct delta_list *delta_lists;
713 	u32 max_lists;
714 	u32 n_lists = 0;
715 	u32 offset;
716 	u32 i;
717 	int free_bits;
718 	int bits;
719 	struct delta_page_header *header;
720 
721 	delta_zone = &delta_index->delta_zones[0];
722 	delta_lists = &delta_zone->delta_lists[first_list + 1];
723 	max_lists = delta_index->list_count - first_list;
724 
725 	/*
726 	 * Compute how many lists will fit on the page. Subtract the size of the fixed header, one
727 	 * delta list offset, and the guard bytes from the page size to determine how much space is
728 	 * available for delta lists.
729 	 */
730 	free_bits = memory_size * BITS_PER_BYTE;
731 	free_bits -= get_immutable_header_offset(1);
732 	free_bits -= GUARD_BITS;
733 	if (free_bits < IMMUTABLE_HEADER_SIZE) {
734 		/* This page is too small to store any delta lists. */
735 		return vdo_log_error_strerror(UDS_OVERFLOW,
736 					      "Chapter Index Page of %zu bytes is too small",
737 					      memory_size);
738 	}
739 
740 	while (n_lists < max_lists) {
741 		/* Each list requires a delta list offset and the list data. */
742 		bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
743 		if (bits > free_bits)
744 			break;
745 
746 		n_lists++;
747 		free_bits -= bits;
748 	}
749 
750 	*list_count = n_lists;
751 
752 	header = (struct delta_page_header *) memory;
753 	put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
754 	put_unaligned_le64(virtual_chapter_number,
755 			   (u8 *) &header->virtual_chapter_number);
756 	put_unaligned_le16(first_list, (u8 *) &header->first_list);
757 	put_unaligned_le16(n_lists, (u8 *) &header->list_count);
758 
759 	/* Construct the delta list offset table. */
760 	offset = get_immutable_header_offset(n_lists + 1);
761 	set_immutable_start(memory, 0, offset);
762 	for (i = 0; i < n_lists; i++) {
763 		offset += delta_lists[i].size;
764 		set_immutable_start(memory, i + 1, offset);
765 	}
766 
767 	/* Copy the delta list data onto the memory page. */
768 	for (i = 0; i < n_lists; i++) {
769 		move_bits(delta_zone->memory, delta_lists[i].start, memory,
770 			  get_immutable_start(memory, i), delta_lists[i].size);
771 	}
772 
773 	/* Set all the bits in the guard bytes. */
774 	memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
775 	       POST_FIELD_GUARD_BYTES);
776 	return UDS_SUCCESS;
777 }
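/*
 * The resulting page layout, in bit-stream order: the delta_page_header, then n_lists + 1
 * offsets of IMMUTABLE_HEADER_SIZE bits each, then the packed delta list bit streams, then any
 * unused space, and finally POST_FIELD_GUARD_BYTES bytes of all ones.
 */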
778 
779 /* Compute the new offsets of the delta lists. */
780 static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
781 				     size_t growing_size, size_t used_space)
782 {
783 	size_t spacing;
784 	u32 i;
785 	struct delta_list *delta_lists = delta_zone->delta_lists;
786 	u32 tail_guard_index = delta_zone->list_count + 1;
787 
788 	spacing = (delta_zone->size - used_space) / delta_zone->list_count;
789 	delta_zone->new_offsets[0] = 0;
790 	for (i = 0; i <= delta_zone->list_count; i++) {
791 		delta_zone->new_offsets[i + 1] =
792 			(delta_zone->new_offsets[i] +
793 			 get_delta_list_byte_size(&delta_lists[i]) + spacing);
794 		delta_zone->new_offsets[i] *= BITS_PER_BYTE;
795 		delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
796 		if (i == 0)
797 			delta_zone->new_offsets[i + 1] -= spacing / 2;
798 		if (i + 1 == growing_index)
799 			delta_zone->new_offsets[i + 1] += growing_size;
800 	}
801 
802 	delta_zone->new_offsets[tail_guard_index] =
803 		(delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
804 }
805 
806 static void rebalance_lists(struct delta_zone *delta_zone)
807 {
808 	struct delta_list *delta_lists;
809 	u32 i;
810 	size_t used_space = 0;
811 
812 	/* Extend and balance memory to receive the delta lists */
813 	delta_lists = delta_zone->delta_lists;
814 	for (i = 0; i <= delta_zone->list_count + 1; i++)
815 		used_space += get_delta_list_byte_size(&delta_lists[i]);
816 
817 	compute_new_list_offsets(delta_zone, 0, 0, used_space);
818 	for (i = 1; i <= delta_zone->list_count + 1; i++)
819 		delta_lists[i].start = delta_zone->new_offsets[i];
820 }
821 
822 /* Start restoring a delta index from multiple input streams. */
823 int uds_start_restoring_delta_index(struct delta_index *delta_index,
824 				    struct buffered_reader **buffered_readers,
825 				    unsigned int reader_count)
826 {
827 	int result;
828 	unsigned int zone_count = reader_count;
829 	u64 record_count = 0;
830 	u64 collision_count = 0;
831 	u32 first_list[MAX_ZONES];
832 	u32 list_count[MAX_ZONES];
833 	unsigned int z;
834 	u32 list_next = 0;
835 	const struct delta_zone *delta_zone;
836 
837 	/* Read and validate each header. */
838 	for (z = 0; z < zone_count; z++) {
839 		struct delta_index_header header;
840 		u8 buffer[sizeof(struct delta_index_header)];
841 		size_t offset = 0;
842 
843 		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
844 						       sizeof(buffer));
845 		if (result != UDS_SUCCESS) {
846 			return vdo_log_warning_strerror(result,
847 							"failed to read delta index header");
848 		}
849 
850 		memcpy(&header.magic, buffer, MAGIC_SIZE);
851 		offset += MAGIC_SIZE;
852 		decode_u32_le(buffer, &offset, &header.zone_number);
853 		decode_u32_le(buffer, &offset, &header.zone_count);
854 		decode_u32_le(buffer, &offset, &header.first_list);
855 		decode_u32_le(buffer, &offset, &header.list_count);
856 		decode_u64_le(buffer, &offset, &header.record_count);
857 		decode_u64_le(buffer, &offset, &header.collision_count);
858 
859 		result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
860 				    "%zu bytes decoded of %zu expected", offset,
861 				    sizeof(struct delta_index_header));
862 		if (result != VDO_SUCCESS) {
863 			return vdo_log_warning_strerror(result,
864 							"failed to read delta index header");
865 		}
866 
867 		if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
868 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
869 							"delta index file has bad magic number");
870 		}
871 
872 		if (zone_count != header.zone_count) {
873 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
874 							"delta index files contain mismatched zone counts (%u,%u)",
875 							zone_count, header.zone_count);
876 		}
877 
878 		if (header.zone_number != z) {
879 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
880 							"delta index zone %u found in slot %u",
881 							header.zone_number, z);
882 		}
883 
884 		first_list[z] = header.first_list;
885 		list_count[z] = header.list_count;
886 		record_count += header.record_count;
887 		collision_count += header.collision_count;
888 
889 		if (first_list[z] != list_next) {
890 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
891 							"delta index file for zone %u starts with list %u instead of list %u",
892 							z, first_list[z], list_next);
893 		}
894 
895 		list_next += list_count[z];
896 	}
897 
898 	if (list_next != delta_index->list_count) {
899 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
900 						"delta index files contain %u delta lists instead of %u delta lists",
901 						list_next, delta_index->list_count);
902 	}
903 
904 	if (collision_count > record_count) {
905 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
906 						"delta index files contain %llu collisions and %llu records",
907 						(unsigned long long) collision_count,
908 						(unsigned long long) record_count);
909 	}
910 
911 	uds_reset_delta_index(delta_index);
912 	delta_index->delta_zones[0].record_count = record_count;
913 	delta_index->delta_zones[0].collision_count = collision_count;
914 
915 	/* Read the delta lists and distribute them to the proper zones. */
916 	for (z = 0; z < zone_count; z++) {
917 		u32 i;
918 
919 		delta_index->load_lists[z] = 0;
920 		for (i = 0; i < list_count[z]; i++) {
921 			u16 delta_list_size;
922 			u32 list_number;
923 			unsigned int zone_number;
924 			u8 size_data[sizeof(u16)];
925 
926 			result = uds_read_from_buffered_reader(buffered_readers[z],
927 							       size_data,
928 							       sizeof(size_data));
929 			if (result != UDS_SUCCESS) {
930 				return vdo_log_warning_strerror(result,
931 								"failed to read delta index size");
932 			}
933 
934 			delta_list_size = get_unaligned_le16(size_data);
935 			if (delta_list_size > 0)
936 				delta_index->load_lists[z] += 1;
937 
938 			list_number = first_list[z] + i;
939 			zone_number = list_number / delta_index->lists_per_zone;
940 			delta_zone = &delta_index->delta_zones[zone_number];
941 			list_number -= delta_zone->first_list;
942 			delta_zone->delta_lists[list_number + 1].size = delta_list_size;
943 		}
944 	}
945 
946 	/* Prepare each zone to start receiving the delta list data. */
947 	for (z = 0; z < delta_index->zone_count; z++)
948 		rebalance_lists(&delta_index->delta_zones[z]);
949 
950 	return UDS_SUCCESS;
951 }
952 
953 static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
954 				      const struct delta_list_save_info *save_info,
955 				      const u8 *data)
956 {
957 	struct delta_list *delta_list;
958 	u16 bit_count;
959 	u16 byte_count;
960 	u32 list_number = save_info->index - delta_zone->first_list;
961 
962 	if (list_number >= delta_zone->list_count) {
963 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
964 						"invalid delta list number %u not in range [%u,%u)",
965 						save_info->index, delta_zone->first_list,
966 						delta_zone->first_list + delta_zone->list_count);
967 	}
968 
969 	delta_list = &delta_zone->delta_lists[list_number + 1];
970 	if (delta_list->size == 0) {
971 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
972 						"unexpected delta list number %u",
973 						save_info->index);
974 	}
975 
976 	bit_count = delta_list->size + save_info->bit_offset;
977 	byte_count = BITS_TO_BYTES(bit_count);
978 	if (save_info->byte_count != byte_count) {
979 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
980 						"unexpected delta list size %u != %u",
981 						save_info->byte_count, byte_count);
982 	}
983 
984 	move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
985 		  delta_list->size);
986 	return UDS_SUCCESS;
987 }
988 
989 static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
990 				   struct buffered_reader *buffered_reader, u8 *data)
991 {
992 	int result;
993 	struct delta_list_save_info save_info;
994 	u8 buffer[sizeof(struct delta_list_save_info)];
995 	unsigned int new_zone;
996 
997 	result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
998 	if (result != UDS_SUCCESS) {
999 		return vdo_log_warning_strerror(result,
1000 						"failed to read delta list data");
1001 	}
1002 
1003 	save_info = (struct delta_list_save_info) {
1004 		.tag = buffer[0],
1005 		.bit_offset = buffer[1],
1006 		.byte_count = get_unaligned_le16(&buffer[2]),
1007 		.index = get_unaligned_le32(&buffer[4]),
1008 	};
1009 
1010 	if ((save_info.bit_offset >= BITS_PER_BYTE) ||
1011 	    (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
1012 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
1013 						"corrupt delta list data");
1014 	}
1015 
1016 	/* Make sure the data is intended for this delta index. */
1017 	if (save_info.tag != delta_index->tag)
1018 		return UDS_CORRUPT_DATA;
1019 
1020 	if (save_info.index >= delta_index->list_count) {
1021 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
1022 						"invalid delta list number %u of %u",
1023 						save_info.index,
1024 						delta_index->list_count);
1025 	}
1026 
1027 	result = uds_read_from_buffered_reader(buffered_reader, data,
1028 					       save_info.byte_count);
1029 	if (result != UDS_SUCCESS) {
1030 		return vdo_log_warning_strerror(result,
1031 						"failed to read delta list data");
1032 	}
1033 
1034 	delta_index->load_lists[load_zone] -= 1;
1035 	new_zone = save_info.index / delta_index->lists_per_zone;
1036 	return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
1037 					  &save_info, data);
1038 }
1039 
1040 /* Restore delta lists from saved data. */
1041 int uds_finish_restoring_delta_index(struct delta_index *delta_index,
1042 				     struct buffered_reader **buffered_readers,
1043 				     unsigned int reader_count)
1044 {
1045 	int result;
1046 	int saved_result = UDS_SUCCESS;
1047 	unsigned int z;
1048 	u8 *data;
1049 
1050 	result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
1051 	if (result != VDO_SUCCESS)
1052 		return result;
1053 
1054 	for (z = 0; z < reader_count; z++) {
1055 		while (delta_index->load_lists[z] > 0) {
1056 			result = restore_delta_list_data(delta_index, z,
1057 							 buffered_readers[z], data);
1058 			if (result != UDS_SUCCESS) {
1059 				saved_result = result;
1060 				break;
1061 			}
1062 		}
1063 	}
1064 
1065 	vdo_free(data);
1066 	return saved_result;
1067 }
1068 
1069 int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
1070 				unsigned int reader_count)
1071 {
1072 	int result;
1073 	unsigned int z;
1074 	u8 buffer[sizeof(struct delta_list_save_info)];
1075 
1076 	for (z = 0; z < reader_count; z++) {
1077 		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
1078 						       sizeof(buffer));
1079 		if (result != UDS_SUCCESS)
1080 			return result;
1081 
1082 		if (buffer[0] != 'z')
1083 			return UDS_CORRUPT_DATA;
1084 	}
1085 
1086 	return UDS_SUCCESS;
1087 }
1088 
1089 static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
1090 {
1091 	struct delta_list *delta_list;
1092 	u8 buffer[sizeof(struct delta_list_save_info)];
1093 	int result;
1094 
1095 	delta_list = &zone->delta_lists[flush_index + 1];
1096 
1097 	buffer[0] = zone->tag;
1098 	buffer[1] = delta_list->start % BITS_PER_BYTE;
1099 	put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
1100 	put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);
1101 
1102 	result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
1103 					      sizeof(buffer));
1104 	if (result != UDS_SUCCESS) {
1105 		vdo_log_warning_strerror(result, "failed to write delta list memory");
1106 		return result;
1107 	}
1108 
1109 	result = uds_write_to_buffered_writer(zone->buffered_writer,
1110 					      zone->memory + get_delta_list_byte_start(delta_list),
1111 					      get_delta_list_byte_size(delta_list));
1112 	if (result != UDS_SUCCESS)
1113 		vdo_log_warning_strerror(result, "failed to write delta list memory");
1114 
1115 	return result;
1116 }
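/*
 * Each saved list is preceded by a small fixed-size record: byte 0 holds the zone tag, byte 1
 * the bit offset of the list within its first byte, bytes 2-3 the little-endian byte count,
 * and bytes 4-7 the little-endian list number. restore_delta_list_data() parses this same
 * layout when the index is reloaded.
 */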
1117 
1118 /* Start saving a delta index zone to a buffered output stream. */
1119 int uds_start_saving_delta_index(const struct delta_index *delta_index,
1120 				 unsigned int zone_number,
1121 				 struct buffered_writer *buffered_writer)
1122 {
1123 	int result;
1124 	u32 i;
1125 	struct delta_zone *delta_zone;
1126 	u8 buffer[sizeof(struct delta_index_header)];
1127 	size_t offset = 0;
1128 
1129 	delta_zone = &delta_index->delta_zones[zone_number];
1130 	memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
1131 	offset += MAGIC_SIZE;
1132 	encode_u32_le(buffer, &offset, zone_number);
1133 	encode_u32_le(buffer, &offset, delta_index->zone_count);
1134 	encode_u32_le(buffer, &offset, delta_zone->first_list);
1135 	encode_u32_le(buffer, &offset, delta_zone->list_count);
1136 	encode_u64_le(buffer, &offset, delta_zone->record_count);
1137 	encode_u64_le(buffer, &offset, delta_zone->collision_count);
1138 
1139 	result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
1140 			    "%zu bytes encoded of %zu expected", offset,
1141 			    sizeof(struct delta_index_header));
1142 	if (result != VDO_SUCCESS)
1143 		return result;
1144 
1145 	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
1146 	if (result != UDS_SUCCESS)
1147 		return vdo_log_warning_strerror(result,
1148 						"failed to write delta index header");
1149 
1150 	for (i = 0; i < delta_zone->list_count; i++) {
1151 		u8 data[sizeof(u16)];
1152 		struct delta_list *delta_list;
1153 
1154 		delta_list = &delta_zone->delta_lists[i + 1];
1155 		put_unaligned_le16(delta_list->size, data);
1156 		result = uds_write_to_buffered_writer(buffered_writer, data,
1157 						      sizeof(data));
1158 		if (result != UDS_SUCCESS)
1159 			return vdo_log_warning_strerror(result,
1160 							"failed to write delta list size");
1161 	}
1162 
1163 	delta_zone->buffered_writer = buffered_writer;
1164 	return UDS_SUCCESS;
1165 }
1166 
1167 int uds_finish_saving_delta_index(const struct delta_index *delta_index,
1168 				  unsigned int zone_number)
1169 {
1170 	int result;
1171 	int first_error = UDS_SUCCESS;
1172 	u32 i;
1173 	struct delta_zone *delta_zone;
1174 	struct delta_list *delta_list;
1175 
1176 	delta_zone = &delta_index->delta_zones[zone_number];
1177 	for (i = 0; i < delta_zone->list_count; i++) {
1178 		delta_list = &delta_zone->delta_lists[i + 1];
1179 		if (delta_list->size > 0) {
1180 			result = flush_delta_list(delta_zone, i);
1181 			if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
1182 				first_error = result;
1183 		}
1184 	}
1185 
1186 	delta_zone->buffered_writer = NULL;
1187 	return first_error;
1188 }
1189 
1190 int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
1191 {
1192 	int result;
1193 	u8 buffer[sizeof(struct delta_list_save_info)];
1194 
1195 	memset(buffer, 0, sizeof(struct delta_list_save_info));
1196 	buffer[0] = 'z';
1197 
1198 	result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
1199 	if (result != UDS_SUCCESS)
1200 		vdo_log_warning_strerror(result, "failed to write guard delta list");
1201 
1202 	return UDS_SUCCESS;
1203 }
1204 
1205 size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
1206 {
1207 	/* One zone will use at least as much memory as any larger number of zones. */
1208 	return (sizeof(struct delta_index_header) +
1209 		list_count * (sizeof(struct delta_list_save_info) + 1) +
1210 		get_zone_memory_size(1, memory_size));
1211 }
1212 
1213 static int assert_not_at_end(const struct delta_index_entry *delta_entry)
1214 {
1215 	int result = VDO_ASSERT(!delta_entry->at_end,
1216 				"operation is invalid because the list entry is at the end of the delta list");
1217 	if (result != VDO_SUCCESS)
1218 		result = UDS_BAD_STATE;
1219 
1220 	return result;
1221 }
1222 
1223 /*
1224  * Prepare to search for an entry in the specified delta list.
1225  *
1226  * This is always the first function to be called when dealing with delta index entries. It is
1227  * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
1228  * fields of the delta_index_entry argument will be set up for iteration, but will not contain an
1229  * entry from the list.
1230  */
1231 int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
1232 				 u32 key, struct delta_index_entry *delta_entry)
1233 {
1234 	int result;
1235 	unsigned int zone_number;
1236 	struct delta_zone *delta_zone;
1237 	struct delta_list *delta_list;
1238 
1239 	result = VDO_ASSERT((list_number < delta_index->list_count),
1240 			    "Delta list number (%u) is out of range (%u)", list_number,
1241 			    delta_index->list_count);
1242 	if (result != VDO_SUCCESS)
1243 		return UDS_CORRUPT_DATA;
1244 
1245 	zone_number = list_number / delta_index->lists_per_zone;
1246 	delta_zone = &delta_index->delta_zones[zone_number];
1247 	list_number -= delta_zone->first_list;
1248 	result = VDO_ASSERT((list_number < delta_zone->list_count),
1249 			    "Delta list number (%u) is out of range (%u) for zone (%u)",
1250 			    list_number, delta_zone->list_count, zone_number);
1251 	if (result != VDO_SUCCESS)
1252 		return UDS_CORRUPT_DATA;
1253 
1254 	if (delta_index->mutable) {
1255 		delta_list = &delta_zone->delta_lists[list_number + 1];
1256 	} else {
1257 		u32 end_offset;
1258 
1259 		/*
1260 		 * Translate the immutable delta list header into a temporary
1261 		 * full delta list header.
1262 		 */
1263 		delta_list = &delta_entry->temp_delta_list;
1264 		delta_list->start = get_immutable_start(delta_zone->memory, list_number);
1265 		end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
1266 		delta_list->size = end_offset - delta_list->start;
1267 		delta_list->save_key = 0;
1268 		delta_list->save_offset = 0;
1269 	}
1270 
1271 	if (key > delta_list->save_key) {
1272 		delta_entry->key = delta_list->save_key;
1273 		delta_entry->offset = delta_list->save_offset;
1274 	} else {
1275 		delta_entry->key = 0;
1276 		delta_entry->offset = 0;
1277 		if (key == 0) {
1278 			/*
1279 			 * This usually means we're about to walk the entire delta list, so get all
1280 			 * of it into the CPU cache.
1281 			 */
1282 			uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
1283 					   delta_list->size / BITS_PER_BYTE, false);
1284 		}
1285 	}
1286 
1287 	delta_entry->at_end = false;
1288 	delta_entry->delta_zone = delta_zone;
1289 	delta_entry->delta_list = delta_list;
1290 	delta_entry->entry_bits = 0;
1291 	delta_entry->is_collision = false;
1292 	delta_entry->list_number = list_number;
1293 	delta_entry->list_overflow = false;
1294 	delta_entry->value_bits = delta_zone->value_bits;
1295 	return UDS_SUCCESS;
1296 }
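/*
 * A minimal iteration sketch (illustrative only; "index", "list_number", and examine() are
 * hypothetical caller-side names): walk every entry in one delta list by starting a search
 * with key 0 and stepping until at_end is set.
 *
 *	struct delta_index_entry entry;
 *	int result = uds_start_delta_index_search(index, list_number, 0, &entry);
 *
 *	while (result == UDS_SUCCESS) {
 *		result = uds_next_delta_index_entry(&entry);
 *		if ((result != UDS_SUCCESS) || entry.at_end)
 *			break;
 *		examine(entry.key, uds_get_delta_entry_value(&entry));
 *	}
 */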
1297 
1298 static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
1299 {
1300 	return delta_entry->delta_list->start + delta_entry->offset;
1301 }
1302 
1303 /*
1304  * Decode a delta index entry delta value. The delta_index_entry basically describes the previous
1305  * list entry, and has had its offset field changed to point to the subsequent entry. We decode the
1306  * bit stream and update the delta_list_entry to describe the entry.
1307  */
1308 static inline void decode_delta(struct delta_index_entry *delta_entry)
1309 {
1310 	int key_bits;
1311 	u32 delta;
1312 	const struct delta_zone *delta_zone = delta_entry->delta_zone;
1313 	const u8 *memory = delta_zone->memory;
1314 	u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
1315 	const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
1316 	int offset = delta_offset % BITS_PER_BYTE;
1317 	u32 data = get_unaligned_le32(addr) >> offset;
1318 
1319 	addr += sizeof(u32);
1320 	key_bits = delta_zone->min_bits;
1321 	delta = data & ((1 << key_bits) - 1);
1322 	if (delta >= delta_zone->min_keys) {
1323 		data >>= key_bits;
1324 		if (data == 0) {
1325 			key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
1326 			while ((data = get_unaligned_le32(addr)) == 0) {
1327 				addr += sizeof(u32);
1328 				key_bits += sizeof(u32) * BITS_PER_BYTE;
1329 			}
1330 		}
1331 		key_bits += ffs(data);
1332 		delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
1333 	}
1334 	delta_entry->delta = delta;
1335 	delta_entry->key += delta;
1336 
1337 	/* Check for a collision, a delta of zero after the start. */
1338 	if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
1339 		delta_entry->is_collision = true;
1340 		delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
1341 	} else {
1342 		delta_entry->is_collision = false;
1343 		delta_entry->entry_bits = delta_entry->value_bits + key_bits;
1344 	}
1345 }
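/*
 * A worked decode example (using the illustrative constants min_bits = 12, min_keys = 1257,
 * incr_keys = 2839 described at compute_coding_constants()): if the 12 bits after the value
 * hold a number below 1257, that number is the delta. Otherwise the low 12 bits give the
 * remainder and the zero bits that follow, terminated by the first 1 bit, count how many
 * multiples of 2839 to add; e.g. the field 1257 followed by a 0 bit and then a 1 bit decodes
 * to 1257 + 1 * 2839 = 4096.
 */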
1346 
1347 noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
1348 {
1349 	int result;
1350 	const struct delta_list *delta_list;
1351 	u32 next_offset;
1352 	u16 size;
1353 
1354 	result = assert_not_at_end(delta_entry);
1355 	if (result != UDS_SUCCESS)
1356 		return result;
1357 
1358 	delta_list = delta_entry->delta_list;
1359 	delta_entry->offset += delta_entry->entry_bits;
1360 	size = delta_list->size;
1361 	if (unlikely(delta_entry->offset >= size)) {
1362 		delta_entry->at_end = true;
1363 		delta_entry->delta = 0;
1364 		delta_entry->is_collision = false;
1365 		result = VDO_ASSERT((delta_entry->offset == size),
1366 				    "next offset past end of delta list");
1367 		if (result != VDO_SUCCESS)
1368 			result = UDS_CORRUPT_DATA;
1369 
1370 		return result;
1371 	}
1372 
1373 	decode_delta(delta_entry);
1374 
1375 	next_offset = delta_entry->offset + delta_entry->entry_bits;
1376 	if (next_offset > size) {
1377 		/*
1378 		 * This is not an assertion because uds_validate_chapter_index_page() wants to
1379 		 * handle this error.
1380 		 */
1381 		vdo_log_warning("Decoded past the end of the delta list");
1382 		return UDS_CORRUPT_DATA;
1383 	}
1384 
1385 	return UDS_SUCCESS;
1386 }
1387 
1388 int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
1389 {
1390 	int result;
1391 	struct delta_list *delta_list = delta_entry->delta_list;
1392 
1393 	result = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision");
1394 	if (result != VDO_SUCCESS)
1395 		return result;
1396 
1397 	delta_list->save_key = delta_entry->key - delta_entry->delta;
1398 	delta_list->save_offset = delta_entry->offset;
1399 	return UDS_SUCCESS;
1400 }
1401 
1402 static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
1403 {
1404 	const struct delta_zone *delta_zone = delta_entry->delta_zone;
1405 	u32 key_bits = (delta_zone->min_bits +
1406 			((delta_zone->incr_keys - delta_zone->min_keys + delta) /
1407 			 delta_zone->incr_keys));
1408 
1409 	delta_entry->delta = delta;
1410 	delta_entry->entry_bits = delta_entry->value_bits + key_bits;
1411 }
1412 
1413 static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
1414 {
1415 	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
1416 	const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
1417 	int size = COLLISION_BYTES;
1418 	int shift = offset % BITS_PER_BYTE;
1419 
1420 	while (--size >= 0)
1421 		*name++ = get_unaligned_le16(addr++) >> shift;
1422 }
1423 
1424 static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
1425 {
1426 	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
1427 	u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
1428 	int size = COLLISION_BYTES;
1429 	int shift = offset % BITS_PER_BYTE;
1430 	u16 mask = ~((u16) 0xFF << shift);
1431 	u16 data;
1432 
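	/*
	 * Write the name one byte at a time: each 16-bit read-modify-write
	 * preserves the neighboring bits on either side of the 8-bit window at
	 * the current shift before storing the name byte into that window.
	 */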
1433 	while (--size >= 0) {
1434 		data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
1435 		put_unaligned_le16(data, addr++);
1436 	}
1437 }
1438 
1439 int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
1440 			      u32 key, const u8 *name,
1441 			      struct delta_index_entry *delta_entry)
1442 {
1443 	int result;
1444 
1445 	result = uds_start_delta_index_search(delta_index, list_number, key,
1446 					      delta_entry);
1447 	if (result != UDS_SUCCESS)
1448 		return result;
1449 
1450 	do {
1451 		result = uds_next_delta_index_entry(delta_entry);
1452 		if (result != UDS_SUCCESS)
1453 			return result;
1454 	} while (!delta_entry->at_end && (key > delta_entry->key));
1455 
1456 	result = uds_remember_delta_index_offset(delta_entry);
1457 	if (result != UDS_SUCCESS)
1458 		return result;
1459 
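	/*
	 * If the key matched, this entry may be the first of a run of collision
	 * entries sharing that delta list address. Scan forward for a collision
	 * entry whose stored full name matches; if none does, delta_entry stays
	 * at the first entry with the matching key.
	 */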
1460 	if (!delta_entry->at_end && (key == delta_entry->key)) {
1461 		struct delta_index_entry collision_entry = *delta_entry;
1462 
1463 		for (;;) {
1464 			u8 full_name[COLLISION_BYTES];
1465 
1466 			result = uds_next_delta_index_entry(&collision_entry);
1467 			if (result != UDS_SUCCESS)
1468 				return result;
1469 
1470 			if (collision_entry.at_end || !collision_entry.is_collision)
1471 				break;
1472 
1473 			get_collision_name(&collision_entry, full_name);
1474 			if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
1475 				*delta_entry = collision_entry;
1476 				break;
1477 			}
1478 		}
1479 	}
1480 
1481 	return UDS_SUCCESS;
1482 }
1483 
1484 int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
1485 {
1486 	int result;
1487 
1488 	result = assert_not_at_end(delta_entry);
1489 	if (result != UDS_SUCCESS)
1490 		return result;
1491 
1492 	result = VDO_ASSERT(delta_entry->is_collision,
1493 			    "Cannot get full block name from a non-collision delta index entry");
1494 	if (result != VDO_SUCCESS)
1495 		return UDS_BAD_STATE;
1496 
1497 	get_collision_name(delta_entry, name);
1498 	return UDS_SUCCESS;
1499 }
1500 
1501 u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
1502 {
1503 	return get_field(delta_entry->delta_zone->memory,
1504 			 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
1505 }
1506 
1507 static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
1508 {
1509 	int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
1510 			        "delta index is mutable");
1511 	if (result != VDO_SUCCESS)
1512 		result = UDS_BAD_STATE;
1513 
1514 	return result;
1515 }
1516 
1517 int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
1518 {
1519 	int result;
1520 	u32 value_mask = (1 << delta_entry->value_bits) - 1;
1521 
1522 	result = assert_mutable_entry(delta_entry);
1523 	if (result != UDS_SUCCESS)
1524 		return result;
1525 
1526 	result = assert_not_at_end(delta_entry);
1527 	if (result != UDS_SUCCESS)
1528 		return result;
1529 
1530 	result = VDO_ASSERT((value & value_mask) == value,
1531 			    "Value (%u) being set in a delta index is too large (must fit in %u bits)",
1532 			    value, delta_entry->value_bits);
1533 	if (result != VDO_SUCCESS)
1534 		return UDS_INVALID_ARGUMENT;
1535 
1536 	set_field(value, delta_entry->delta_zone->memory,
1537 		  get_delta_entry_offset(delta_entry), delta_entry->value_bits);
1538 	return UDS_SUCCESS;
1539 }
1540 
1541 /*
1542  * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
1543  * by growing_index, then rebalancing the lists in the new chunk.
1544  */
1545 static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
1546 			     size_t growing_size)
1547 {
1548 	ktime_t start_time;
1549 	ktime_t end_time;
1550 	struct delta_list *delta_lists;
1551 	u32 i;
1552 	size_t used_space;
1553 
1554 
1555 	/* Calculate the amount of space that is or will be in use. */
1556 	start_time = current_time_ns(CLOCK_MONOTONIC);
1557 	delta_lists = delta_zone->delta_lists;
1558 	used_space = growing_size;
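	/* Account for every list, including the guard lists 0 and list_count + 1. */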
1559 	for (i = 0; i <= delta_zone->list_count + 1; i++)
1560 		used_space += get_delta_list_byte_size(&delta_lists[i]);
1561 
1562 	if (delta_zone->size < used_space)
1563 		return UDS_OVERFLOW;
1564 
1565 	/* Compute the new offsets of the delta lists. */
1566 	compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);
1567 
1568 	/*
1569 	 * When we rebalance the delta list, we will include the end guard list in the rebalancing.
1570 	 * It contains the end guard data, which must be copied.
1571 	 */
1572 	rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
1573 	end_time = current_time_ns(CLOCK_MONOTONIC);
1574 	delta_zone->rebalance_count++;
1575 	delta_zone->rebalance_time += ktime_sub(end_time, start_time);
1576 	return UDS_SUCCESS;
1577 }
1578 
1579 static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
1580 {
1581 	u64 free_before;
1582 	u64 free_after;
1583 	u64 source;
1584 	u64 destination;
1585 	u32 count;
1586 	bool before_flag;
1587 	u8 *memory;
1588 	struct delta_zone *delta_zone = delta_entry->delta_zone;
1589 	struct delta_list *delta_list = delta_entry->delta_list;
1590 	/* Compute bits in use before and after the inserted bits. */
1591 	u32 total_size = delta_list->size;
1592 	u32 before_size = delta_entry->offset;
1593 	u32 after_size = total_size - delta_entry->offset;
1594 
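	/* A delta list size is stored in 16 bits, so a single list cannot grow past U16_MAX bits. */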
1595 	if (total_size + size > U16_MAX) {
1596 		delta_entry->list_overflow = true;
1597 		delta_zone->overflow_count++;
1598 		return UDS_OVERFLOW;
1599 	}
1600 
1601 	/* Compute bits available before and after the delta list. */
1602 	free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
1603 	free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
1604 
1605 	if ((size <= free_before) && (size <= free_after)) {
1606 		/*
1607 		 * There is enough free space on either side of the list. Move the smaller
1608 		 * amount of data; if both sides hold the same amount, take the space from
1609 		 * the side with more room.
1610 		 */
1611 		if (before_size < after_size)
1612 			before_flag = true;
1613 		else if (after_size < before_size)
1614 			before_flag = false;
1615 		else
1616 			before_flag = free_before > free_after;
1617 	} else if (size <= free_before) {
1618 		/* There is space before but not after. */
1619 		before_flag = true;
1620 	} else if (size <= free_after) {
1621 		/* There is space after but not before. */
1622 		before_flag = false;
1623 	} else {
1624 		/*
1625 		 * Neither of the surrounding spaces is large enough for this request. Extend
1626 		 * and/or rebalance the delta list memory choosing to move the least amount of
1627 		 * data.
1628 		 */
1629 		int result;
1630 		u32 growing_index = delta_entry->list_number + 1;
1631 
1632 		before_flag = before_size < after_size;
1633 		if (!before_flag)
1634 			growing_index++;
1635 		result = extend_delta_zone(delta_zone, growing_index,
1636 					   BITS_TO_BYTES(size));
1637 		if (result != UDS_SUCCESS)
1638 			return result;
1639 	}
1640 
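	/*
	 * Open a gap of the requested size at the insertion offset by sliding
	 * the portion of the list chosen above (the bits before or after the
	 * offset) into the adjacent free space.
	 */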
1641 	delta_list->size += size;
1642 	if (before_flag) {
1643 		source = delta_list->start;
1644 		destination = source - size;
1645 		delta_list->start -= size;
1646 		count = before_size;
1647 	} else {
1648 		source = delta_list->start + delta_entry->offset;
1649 		destination = source + size;
1650 		count = after_size;
1651 	}
1652 
1653 	memory = delta_zone->memory;
1654 	move_bits(memory, source, memory, destination, count);
1655 	return UDS_SUCCESS;
1656 }
1657 
1658 static void encode_delta(const struct delta_index_entry *delta_entry)
1659 {
1660 	u32 temp;
1661 	u32 t1;
1662 	u32 t2;
1663 	u64 offset;
1664 	const struct delta_zone *delta_zone = delta_entry->delta_zone;
1665 	u8 *memory = delta_zone->memory;
1666 
1667 	offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
1668 	if (delta_entry->delta < delta_zone->min_keys) {
1669 		set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
1670 		return;
1671 	}
1672 
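	/*
	 * Deltas of min_keys or more add a unary extension: a base field of
	 * min_bits bits holding t1, then t2 zero bits, then a terminating 1 bit.
	 * Illustrative example (constants chosen for clarity, not taken from
	 * compute_coding_constants()): with min_bits = 8, min_keys = 128, and
	 * incr_keys = 256, a delta of 700 gives temp = 572, t1 = 188, and
	 * t2 = 2, so the stream holds 188 in 8 bits, two 0 bits, and a 1 bit.
	 * Decoding reads 188, counts the two zeros, and recovers
	 * 188 + 2 * 256 = 700.
	 */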
1673 	temp = delta_entry->delta - delta_zone->min_keys;
1674 	t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
1675 	t2 = temp / delta_zone->incr_keys;
1676 	set_field(t1, memory, offset, delta_zone->min_bits);
1677 	set_zero(memory, offset + delta_zone->min_bits, t2);
1678 	set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
1679 }
1680 
1681 static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
1682 			 const u8 *name)
1683 {
1684 	u8 *memory = delta_entry->delta_zone->memory;
1685 	u64 offset = get_delta_entry_offset(delta_entry);
1686 
1687 	set_field(value, memory, offset, delta_entry->value_bits);
1688 	encode_delta(delta_entry);
1689 	if (name != NULL)
1690 		set_collision_name(delta_entry, name);
1691 }
1692 
1693 /*
1694  * Create a new entry in the delta index. If the entry is a collision, the full 256-bit name must
1695  * be provided.
1696  */
1697 int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
1698 			      const u8 *name)
1699 {
1700 	int result;
1701 	struct delta_zone *delta_zone;
1702 
1703 	result = assert_mutable_entry(delta_entry);
1704 	if (result != UDS_SUCCESS)
1705 		return result;
1706 
1707 	if (delta_entry->is_collision) {
1708 		/*
1709 		 * The caller wants us to insert a collision entry onto a collision entry. This
1710 		 * happens when we find a collision and attempt to add the name again to the index.
1711 		 * This is normally a fatal error unless we are replaying a closed chapter while we
1712 		 * are rebuilding a volume index.
1713 		 */
1714 		return UDS_DUPLICATE_NAME;
1715 	}
1716 
1717 	if (delta_entry->offset < delta_entry->delta_list->save_offset) {
1718 		/*
1719 		 * The saved entry offset is after the new entry and will no longer be valid, so
1720 		 * replace it with the insertion point.
1721 		 */
1722 		result = uds_remember_delta_index_offset(delta_entry);
1723 		if (result != UDS_SUCCESS)
1724 			return result;
1725 	}
1726 
1727 	if (name != NULL) {
1728 		/* Insert a collision entry which is placed after this entry. */
1729 		result = assert_not_at_end(delta_entry);
1730 		if (result != UDS_SUCCESS)
1731 			return result;
1732 
1733 		result = VDO_ASSERT((key == delta_entry->key),
1734 				    "incorrect key for collision entry");
1735 		if (result != VDO_SUCCESS)
1736 			return result;
1737 
1738 		delta_entry->offset += delta_entry->entry_bits;
1739 		set_delta(delta_entry, 0);
1740 		delta_entry->is_collision = true;
1741 		delta_entry->entry_bits += COLLISION_BITS;
1742 		result = insert_bits(delta_entry, delta_entry->entry_bits);
1743 	} else if (delta_entry->at_end) {
1744 		/* Insert a new entry at the end of the delta list. */
1745 		result = VDO_ASSERT((key >= delta_entry->key), "key past end of list");
1746 		if (result != VDO_SUCCESS)
1747 			return result;
1748 
1749 		set_delta(delta_entry, key - delta_entry->key);
1750 		delta_entry->key = key;
1751 		delta_entry->at_end = false;
1752 		result = insert_bits(delta_entry, delta_entry->entry_bits);
1753 	} else {
1754 		u16 old_entry_size;
1755 		u16 additional_size;
1756 		struct delta_index_entry next_entry;
1757 		u32 next_value;
1758 
1759 		/*
1760 		 * Insert a new entry which requires the delta in the following entry to be
1761 		 * updated.
1762 		 */
1763 		result = VDO_ASSERT((key < delta_entry->key),
1764 				    "key precedes following entry");
1765 		if (result != VDO_SUCCESS)
1766 			return result;
1767 
1768 		result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta),
1769 				    "key effects following entry's delta");
1770 		if (result != VDO_SUCCESS)
1771 			return result;
1772 
1773 		old_entry_size = delta_entry->entry_bits;
1774 		next_entry = *delta_entry;
1775 		next_value = uds_get_delta_entry_value(&next_entry);
1776 		set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
1777 		delta_entry->key = key;
1778 		set_delta(&next_entry, next_entry.key - key);
1779 		next_entry.offset += delta_entry->entry_bits;
1780 		/* The two new entries are always bigger than the single entry being replaced. */
1781 		additional_size = (delta_entry->entry_bits +
1782 				   next_entry.entry_bits - old_entry_size);
1783 		result = insert_bits(delta_entry, additional_size);
1784 		if (result != UDS_SUCCESS)
1785 			return result;
1786 
1787 		encode_entry(&next_entry, next_value, NULL);
1788 	}
1789 
1790 	if (result != UDS_SUCCESS)
1791 		return result;
1792 
1793 	encode_entry(delta_entry, value, name);
1794 	delta_zone = delta_entry->delta_zone;
1795 	delta_zone->record_count++;
1796 	delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
1797 	return UDS_SUCCESS;
1798 }
1799 
1800 static void delete_bits(const struct delta_index_entry *delta_entry, int size)
1801 {
1802 	u64 source;
1803 	u64 destination;
1804 	u32 count;
1805 	bool before_flag;
1806 	struct delta_list *delta_list = delta_entry->delta_list;
1807 	u8 *memory = delta_entry->delta_zone->memory;
1808 	/* Compute bits retained before and after the deleted bits. */
1809 	u32 total_size = delta_list->size;
1810 	u32 before_size = delta_entry->offset;
1811 	u32 after_size = total_size - delta_entry->offset - size;
1812 
1813 	/*
1814 	 * Determine whether to add to the available space either before or after the delta list.
1815 	 * We prefer to move the least amount of data. If it is exactly the same, try to add to the
1816 	 * smaller amount of free space.
1817 	 */
1818 	if (before_size < after_size) {
1819 		before_flag = true;
1820 	} else if (after_size < before_size) {
1821 		before_flag = false;
1822 	} else {
1823 		u64 free_before =
1824 			(delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
1825 		u64 free_after =
1826 			(delta_list[1].start - (delta_list[0].start + delta_list[0].size));
1827 
1828 		before_flag = (free_before < free_after);
1829 	}
1830 
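	/*
	 * Shrink the list and close the gap left by the deleted bits by sliding
	 * the smaller retained portion (chosen above) over it.
	 */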
1831 	delta_list->size -= size;
1832 	if (before_flag) {
1833 		source = delta_list->start;
1834 		destination = source + size;
1835 		delta_list->start += size;
1836 		count = before_size;
1837 	} else {
1838 		destination = delta_list->start + delta_entry->offset;
1839 		source = destination + size;
1840 		count = after_size;
1841 	}
1842 
1843 	move_bits(memory, source, memory, destination, count);
1844 }
1845 
1846 int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
1847 {
1848 	int result;
1849 	struct delta_index_entry next_entry;
1850 	struct delta_zone *delta_zone;
1851 	struct delta_list *delta_list;
1852 
1853 	result = assert_mutable_entry(delta_entry);
1854 	if (result != UDS_SUCCESS)
1855 		return result;
1856 
1857 	next_entry = *delta_entry;
1858 	result = uds_next_delta_index_entry(&next_entry);
1859 	if (result != UDS_SUCCESS)
1860 		return result;
1861 
1862 	delta_zone = delta_entry->delta_zone;
1863 
1864 	if (delta_entry->is_collision) {
1865 		/* This is a collision entry, so just remove it. */
1866 		delete_bits(delta_entry, delta_entry->entry_bits);
1867 		next_entry.offset = delta_entry->offset;
1868 		delta_zone->collision_count -= 1;
1869 	} else if (next_entry.at_end) {
1870 		/* This entry is at the end of the list, so just remove it. */
1871 		delete_bits(delta_entry, delta_entry->entry_bits);
1872 		next_entry.key -= delta_entry->delta;
1873 		next_entry.offset = delta_entry->offset;
1874 	} else {
1875 		/* The delta in the next entry needs to be updated. */
1876 		u32 next_value = uds_get_delta_entry_value(&next_entry);
1877 		u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;
1878 
1879 		if (next_entry.is_collision) {
1880 			next_entry.is_collision = false;
1881 			delta_zone->collision_count -= 1;
1882 		}
1883 
1884 		set_delta(&next_entry, delta_entry->delta + next_entry.delta);
1885 		next_entry.offset = delta_entry->offset;
1886 		/* The one new entry is always smaller than the two entries being replaced. */
1887 		delete_bits(delta_entry, old_size - next_entry.entry_bits);
1888 		encode_entry(&next_entry, next_value, NULL);
1889 	}
1890 
1891 	delta_zone->record_count--;
1892 	delta_zone->discard_count++;
1893 	*delta_entry = next_entry;
1894 
1895 	delta_list = delta_entry->delta_list;
1896 	if (delta_entry->offset < delta_list->save_offset) {
1897 		/* The saved entry offset is no longer valid. */
1898 		delta_list->save_key = 0;
1899 		delta_list->save_offset = 0;
1900 	}
1901 
1902 	return UDS_SUCCESS;
1903 }
1904 
1905 void uds_get_delta_index_stats(const struct delta_index *delta_index,
1906 			       struct delta_index_stats *stats)
1907 {
1908 	unsigned int z;
1909 	const struct delta_zone *delta_zone;
1910 
1911 	memset(stats, 0, sizeof(struct delta_index_stats));
1912 	for (z = 0; z < delta_index->zone_count; z++) {
1913 		delta_zone = &delta_index->delta_zones[z];
1914 		stats->rebalance_time += delta_zone->rebalance_time;
1915 		stats->rebalance_count += delta_zone->rebalance_count;
1916 		stats->record_count += delta_zone->record_count;
1917 		stats->collision_count += delta_zone->collision_count;
1918 		stats->discard_count += delta_zone->discard_count;
1919 		stats->overflow_count += delta_zone->overflow_count;
1920 		stats->list_count += delta_zone->list_count;
1921 	}
1922 }
1923 
1924 size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
1925 {
1926 	u16 min_bits;
1927 	u32 incr_keys;
1928 	u32 min_keys;
1929 
1930 	compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
1931 	/* On average, each delta is encoded into about min_bits + 1.5 bits. */
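	/* The entry_count / 2 term supplies the extra half bit per entry noted above. */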
1932 	return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
1933 }
1934 
1935 u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
1936 				   u32 payload_bits, size_t bytes_per_page)
1937 {
1938 	unsigned int bits_per_delta_list;
1939 	unsigned int bits_per_page;
1940 	size_t bits_per_index;
1941 
1942 	/* Compute the expected number of bits needed for all the entries. */
1943 	bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
1944 						      payload_bits);
1945 	bits_per_delta_list = bits_per_index / list_count;
1946 
1947 	/* Add in the immutable delta list headers. */
1948 	bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
1949 	/* Compute the number of usable bits on an immutable index page. */
1950 	bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
1951 	/*
1952 	 * Reduce the bits per page by one immutable delta list header and one delta list to
1953 	 * account for internal fragmentation.
1954 	 */
1955 	bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
1956 	/* Now compute the number of pages needed. */
1957 	return DIV_ROUND_UP(bits_per_index, bits_per_page);
1958 }
1959 
1960 void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
1961 {
1962 	vdo_log_ratelimit(vdo_log_info,
1963 			  "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
1964 			  delta_entry->list_number, delta_entry->key,
1965 			  delta_entry->offset, delta_entry->at_end ? " end" : "",
1966 			  delta_entry->is_collision ? " collision" : "",
1967 			  delta_entry->delta_list->size,
1968 			  delta_entry->list_overflow ? " overflow" : "");
1969 	delta_entry->list_overflow = false;
1970 }
1971