1 /* SPDX-License-Identifier: GPL-2.0-only */ 2 /* 3 * Copyright 2023 Red Hat 4 */ 5 6 #ifndef VDO_RECOVERY_JOURNAL_H 7 #define VDO_RECOVERY_JOURNAL_H 8 9 #include <linux/list.h> 10 11 #include "numeric.h" 12 13 #include "admin-state.h" 14 #include "constants.h" 15 #include "encodings.h" 16 #include "flush.h" 17 #include "statistics.h" 18 #include "types.h" 19 #include "wait-queue.h" 20 21 /** 22 * DOC: recovery journal. 23 * 24 * The recovery_journal provides a log of all block mapping and reference count changes which have 25 * not yet been stably written to the block map or slab journals. This log helps to reduce the 26 * write amplification of writes by providing amortization of slab journal and block map page 27 * updates. 28 * 29 * The recovery journal has a single dedicated queue and thread for performing all journal updates. 30 * The concurrency guarantees of this single-threaded model allow the code to omit more 31 * fine-grained locking for recovery journal structures. 32 * 33 * The journal consists of a set of on-disk blocks arranged as a circular log with monotonically 34 * increasing sequence numbers. Three sequence numbers serve to define the active extent of the 35 * journal. The 'head' is the oldest active block in the journal. The 'tail' is the end of the 36 * half-open interval containing the active blocks. 'active' is the number of the block actively 37 * receiving entries. In an empty journal, head == active == tail. Once any entries are added, tail 38 * = active + 1, and head may be any value in the interval [tail - size, active]. 39 * 40 * The journal also contains a set of in-memory blocks which are used to buffer up entries until 41 * they can be committed. In general the number of in-memory blocks ('tail_buffer_count') will be 42 * less than the on-disk size. Each in-memory block is also a vdo_completion. Each in-memory block 43 * has a vio which is used to commit that block to disk. The vio's data is the on-disk 44 * representation of the journal block. In addition each in-memory block has a buffer which is used 45 * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are 46 * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active 47 * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is 48 * moved back to the 'free_tail_blocks' ring. 49 * 50 * When entries are added to the journal, they are added to the active in-memory block, as 51 * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be 52 * committed, the requesting VIO will be attached to the in-memory block to which the caller's 53 * entry was added. If the caller does wish to wait, or if the entry filled the active block, an 54 * attempt will be made to commit that block to disk. If there is already another commit in 55 * progress, the attempt will be ignored and then automatically retried when the in-progress commit 56 * completes. If there is no commit in progress, any data_vios waiting on the block are transferred 57 * to the block's vio which is then written, automatically waking all of the waiters when it 58 * completes. When the write completes, any entries which accumulated in the block are copied to 59 * the vio's data buffer. 60 * 61 * Finally, the journal maintains a set of counters, one for each on disk journal block. These 62 * counters are used as locks to prevent premature reaping of journal blocks. Each time a new 63 * sequence number is used, the counter for the corresponding block is incremented. The counter is 64 * subsequently decremented when that block is filled and then committed for the last time. This 65 * prevents blocks from being reaped while they are still being updated. The counter is also 66 * incremented once for each entry added to a block, and decremented once each time the block map 67 * is updated in memory for that request. This prevents blocks from being reaped while their VIOs 68 * are still active. Finally, each in-memory block map page tracks the oldest journal block that 69 * contains entries corresponding to uncommitted updates to that block map page. Each time an 70 * in-memory block map page is updated, it checks if the journal block for the VIO is earlier than 71 * the one it references, in which case it increments the count on the earlier journal block and 72 * decrements the count on the later journal block, maintaining a lock on the oldest journal block 73 * containing entries for that page. When a block map page has been flushed from the cache, the 74 * counter for the journal block it references is decremented. Whenever the counter for the head 75 * block goes to 0, the head is advanced until it comes to a block whose counter is not 0 or until 76 * it reaches the active block. This is the mechanism for reclaiming journal space on disk. 77 * 78 * If there is no in-memory space when a VIO attempts to add an entry, the VIO will be attached to 79 * the 'commit_completion' and will be woken the next time a full block has committed. If there is 80 * no on-disk space when a VIO attempts to add an entry, the VIO will be attached to the 81 * 'reap_completion', and will be woken the next time a journal block is reaped. 82 */ 83 84 enum vdo_zone_type { 85 VDO_ZONE_TYPE_ADMIN, 86 VDO_ZONE_TYPE_JOURNAL, 87 VDO_ZONE_TYPE_LOGICAL, 88 VDO_ZONE_TYPE_PHYSICAL, 89 }; 90 91 struct lock_counter { 92 /* The completion for notifying the owner of a lock release */ 93 struct vdo_completion completion; 94 /* The number of logical zones which may hold locks */ 95 zone_count_t logical_zones; 96 /* The number of physical zones which may hold locks */ 97 zone_count_t physical_zones; 98 /* The number of locks */ 99 block_count_t locks; 100 /* Whether the lock release notification is in flight */ 101 atomic_t state; 102 /* The number of logical zones which hold each lock */ 103 atomic_t *logical_zone_counts; 104 /* The number of physical zones which hold each lock */ 105 atomic_t *physical_zone_counts; 106 /* The per-lock counts for the journal zone */ 107 u16 *journal_counters; 108 /* The per-lock decrement counts for the journal zone */ 109 atomic_t *journal_decrement_counts; 110 /* The per-zone, per-lock reference counts for logical zones */ 111 u16 *logical_counters; 112 /* The per-zone, per-lock reference counts for physical zones */ 113 u16 *physical_counters; 114 }; 115 116 struct recovery_journal_block { 117 /* The doubly linked pointers for the free or active lists */ 118 struct list_head list_node; 119 /* The waiter for the pending full block list */ 120 struct vdo_waiter write_waiter; 121 /* The journal to which this block belongs */ 122 struct recovery_journal *journal; 123 /* A pointer to the current sector in the packed block buffer */ 124 struct packed_journal_sector *sector; 125 /* The vio for writing this block */ 126 struct vio vio; 127 /* The sequence number for this block */ 128 sequence_number_t sequence_number; 129 /* The location of this block in the on-disk journal */ 130 physical_block_number_t block_number; 131 /* Whether this block is being committed */ 132 bool committing; 133 /* The total number of entries in this block */ 134 journal_entry_count_t entry_count; 135 /* The total number of uncommitted entries (queued or committing) */ 136 journal_entry_count_t uncommitted_entry_count; 137 /* The number of new entries in the current commit */ 138 journal_entry_count_t entries_in_commit; 139 /* The queue of vios which will make entries for the next commit */ 140 struct vdo_wait_queue entry_waiters; 141 /* The queue of vios waiting for the current commit */ 142 struct vdo_wait_queue commit_waiters; 143 }; 144 145 struct recovery_journal { 146 /* The thread ID of the journal zone */ 147 thread_id_t thread_id; 148 /* The slab depot which can hold locks on this journal */ 149 struct slab_depot *depot; 150 /* The block map which can hold locks on this journal */ 151 struct block_map *block_map; 152 /* The queue of vios waiting to make entries */ 153 struct vdo_wait_queue entry_waiters; 154 /* The number of free entries in the journal */ 155 u64 available_space; 156 /* The number of decrement entries which need to be made */ 157 data_vio_count_t pending_decrement_count; 158 /* Whether the journal is adding entries from the increment or decrement waiters queues */ 159 bool adding_entries; 160 /* The administrative state of the journal */ 161 struct admin_state state; 162 /* Whether a reap is in progress */ 163 bool reaping; 164 /* The location of the first journal block */ 165 physical_block_number_t origin; 166 /* The oldest active block in the journal on disk for block map rebuild */ 167 sequence_number_t block_map_head; 168 /* The oldest active block in the journal on disk for slab journal replay */ 169 sequence_number_t slab_journal_head; 170 /* The newest block in the journal on disk to which a write has finished */ 171 sequence_number_t last_write_acknowledged; 172 /* The end of the half-open interval of the active journal */ 173 sequence_number_t tail; 174 /* The point at which the last entry will have been added */ 175 struct journal_point append_point; 176 /* The journal point of the vio most recently released from the journal */ 177 struct journal_point commit_point; 178 /* The nonce of the VDO */ 179 nonce_t nonce; 180 /* The number of recoveries completed by the VDO */ 181 u8 recovery_count; 182 /* The number of entries which fit in a single block */ 183 journal_entry_count_t entries_per_block; 184 /* Unused in-memory journal blocks */ 185 struct list_head free_tail_blocks; 186 /* In-memory journal blocks with records */ 187 struct list_head active_tail_blocks; 188 /* A pointer to the active block (the one we are adding entries to now) */ 189 struct recovery_journal_block *active_block; 190 /* Journal blocks that need writing */ 191 struct vdo_wait_queue pending_writes; 192 /* The new block map reap head after reaping */ 193 sequence_number_t block_map_reap_head; 194 /* The head block number for the block map rebuild range */ 195 block_count_t block_map_head_block_number; 196 /* The new slab journal reap head after reaping */ 197 sequence_number_t slab_journal_reap_head; 198 /* The head block number for the slab journal replay range */ 199 block_count_t slab_journal_head_block_number; 200 /* The data-less vio, usable only for flushing */ 201 struct vio *flush_vio; 202 /* The number of blocks in the on-disk journal */ 203 block_count_t size; 204 /* The number of logical blocks that are in-use */ 205 block_count_t logical_blocks_used; 206 /* The number of block map pages that are allocated */ 207 block_count_t block_map_data_blocks; 208 /* The number of journal blocks written but not yet acknowledged */ 209 block_count_t pending_write_count; 210 /* The threshold at which slab journal tail blocks will be written out */ 211 block_count_t slab_journal_commit_threshold; 212 /* Counters for events in the journal that are reported as statistics */ 213 struct recovery_journal_statistics events; 214 /* The locks for each on-disk block */ 215 struct lock_counter lock_counter; 216 /* The tail blocks */ 217 struct recovery_journal_block blocks[]; 218 }; 219 220 /** 221 * vdo_get_recovery_journal_block_number() - Get the physical block number for a given sequence 222 * number. 223 * @journal: The journal. 224 * @sequence: The sequence number of the desired block. 225 * 226 * Return: The block number corresponding to the sequence number. 227 */ 228 static inline physical_block_number_t __must_check 229 vdo_get_recovery_journal_block_number(const struct recovery_journal *journal, 230 sequence_number_t sequence) 231 { 232 /* 233 * Since journal size is a power of two, the block number modulus can just be extracted 234 * from the low-order bits of the sequence. 235 */ 236 return vdo_compute_recovery_journal_block_number(journal->size, sequence); 237 } 238 239 /** 240 * vdo_compute_recovery_journal_check_byte() - Compute the check byte for a given sequence number. 241 * @journal: The journal. 242 * @sequence: The sequence number. 243 * 244 * Return: The check byte corresponding to the sequence number. 245 */ 246 static inline u8 __must_check 247 vdo_compute_recovery_journal_check_byte(const struct recovery_journal *journal, 248 sequence_number_t sequence) 249 { 250 /* The check byte must change with each trip around the journal. */ 251 return (((sequence / journal->size) & 0x7F) | 0x80); 252 } 253 254 int __must_check vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, 255 nonce_t nonce, struct vdo *vdo, 256 struct partition *partition, 257 u64 recovery_count, 258 block_count_t journal_size, 259 struct recovery_journal **journal_ptr); 260 261 void vdo_free_recovery_journal(struct recovery_journal *journal); 262 263 void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal, 264 u64 recovery_count, 265 sequence_number_t tail, 266 block_count_t logical_blocks_used, 267 block_count_t block_map_data_blocks); 268 269 block_count_t __must_check 270 vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal); 271 272 thread_id_t __must_check vdo_get_recovery_journal_thread_id(struct recovery_journal *journal); 273 274 void vdo_open_recovery_journal(struct recovery_journal *journal, 275 struct slab_depot *depot, struct block_map *block_map); 276 277 sequence_number_t 278 vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal); 279 280 block_count_t __must_check vdo_get_recovery_journal_length(block_count_t journal_size); 281 282 struct recovery_journal_state_7_0 __must_check 283 vdo_record_recovery_journal(const struct recovery_journal *journal); 284 285 void vdo_add_recovery_journal_entry(struct recovery_journal *journal, 286 struct data_vio *data_vio); 287 288 void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal, 289 sequence_number_t sequence_number, 290 enum vdo_zone_type zone_type, 291 zone_count_t zone_id); 292 293 void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal, 294 sequence_number_t sequence_number, 295 enum vdo_zone_type zone_type, 296 zone_count_t zone_id); 297 298 void vdo_release_journal_entry_lock(struct recovery_journal *journal, 299 sequence_number_t sequence_number); 300 301 void vdo_drain_recovery_journal(struct recovery_journal *journal, 302 const struct admin_state_code *operation, 303 struct vdo_completion *parent); 304 305 void vdo_resume_recovery_journal(struct recovery_journal *journal, 306 struct vdo_completion *parent); 307 308 block_count_t __must_check 309 vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal); 310 311 struct recovery_journal_statistics __must_check 312 vdo_get_recovery_journal_statistics(const struct recovery_journal *journal); 313 314 void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal); 315 316 #endif /* VDO_RECOVERY_JOURNAL_H */ 317