1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H 3 #define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H 4 5 #include "replicas_format.h" 6 7 /* 8 * Disk accounting - KEY_TYPE_accounting - on disk format: 9 * 10 * Here, the key has considerably more structure than a typical key (bpos); an 11 * accounting key is 'struct disk_accounting_pos', which is a union of bpos. 12 * 13 * More specifically: a key is just a muliword integer (where word endianness 14 * matches native byte order), so we're treating bpos as an opaque 20 byte 15 * integer and mapping bch_accounting_key to that. 16 * 17 * This is a type-tagged union of all our various subtypes; a disk accounting 18 * key can be device counters, replicas counters, et cetera - it's extensible. 19 * 20 * The value is a list of u64s or s64s; the number of counters is specific to a 21 * given accounting type. 22 * 23 * Unlike with other key types, updates are _deltas_, and the deltas are not 24 * resolved until the update to the underlying btree, done by btree write buffer 25 * flush or journal replay. 26 * 27 * Journal replay in particular requires special handling. The journal tracks a 28 * range of entries which may possibly have not yet been applied to the btree 29 * yet - it does not know definitively whether individual entries are dirty and 30 * still need to be applied. 31 * 32 * To handle this, we use the version field of struct bkey, and give every 33 * accounting update a unique version number - a total ordering in time; the 34 * version number is derived from the key's position in the journal. Then 35 * journal replay can compare the version number of the key from the journal 36 * with the version number of the key in the btree to determine if a key needs 37 * to be replayed. 38 * 39 * For this to work, we must maintain this strict time ordering of updates as 40 * they are flushed to the btree, both via write buffer flush and via journal 41 * replay. This has complications for the write buffer code while journal replay 42 * is still in progress; the write buffer cannot flush any accounting keys to 43 * the btree until journal replay has finished replaying its accounting keys, or 44 * the (newer) version number of the keys from the write buffer will cause 45 * updates from journal replay to be lost. 46 */ 47 48 struct bch_accounting { 49 struct bch_val v; 50 __u64 d[]; 51 }; 52 53 #define BCH_ACCOUNTING_MAX_COUNTERS 3 54 55 #define BCH_DATA_TYPES() \ 56 x(free, 0) \ 57 x(sb, 1) \ 58 x(journal, 2) \ 59 x(btree, 3) \ 60 x(user, 4) \ 61 x(cached, 5) \ 62 x(parity, 6) \ 63 x(stripe, 7) \ 64 x(need_gc_gens, 8) \ 65 x(need_discard, 9) \ 66 x(unstriped, 10) 67 68 enum bch_data_type { 69 #define x(t, n) BCH_DATA_##t, 70 BCH_DATA_TYPES() 71 #undef x 72 BCH_DATA_NR 73 }; 74 75 static inline bool data_type_is_empty(enum bch_data_type type) 76 { 77 switch (type) { 78 case BCH_DATA_free: 79 case BCH_DATA_need_gc_gens: 80 case BCH_DATA_need_discard: 81 return true; 82 default: 83 return false; 84 } 85 } 86 87 static inline bool data_type_is_hidden(enum bch_data_type type) 88 { 89 switch (type) { 90 case BCH_DATA_sb: 91 case BCH_DATA_journal: 92 return true; 93 default: 94 return false; 95 } 96 } 97 98 /* 99 * field 1: name 100 * field 2: id 101 * field 3: number of counters (max 3) 102 */ 103 104 #define BCH_DISK_ACCOUNTING_TYPES() \ 105 x(nr_inodes, 0, 1) \ 106 x(persistent_reserved, 1, 1) \ 107 x(replicas, 2, 1) \ 108 x(dev_data_type, 3, 3) \ 109 x(compression, 4, 3) \ 110 x(snapshot, 5, 1) \ 111 x(btree, 6, 1) \ 112 x(rebalance_work, 7, 1) \ 113 x(inum, 8, 3) 114 115 enum disk_accounting_type { 116 #define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, 117 BCH_DISK_ACCOUNTING_TYPES() 118 #undef x 119 BCH_DISK_ACCOUNTING_TYPE_NR, 120 }; 121 122 /* 123 * No subtypes - number of inodes in the entire filesystem 124 * 125 * XXX: perhaps we could add a per-subvolume counter? 126 */ 127 struct bch_acct_nr_inodes { 128 }; 129 130 /* 131 * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the 132 * reservation: 133 */ 134 struct bch_acct_persistent_reserved { 135 __u8 nr_replicas; 136 }; 137 138 /* 139 * device, data type counter fields: 140 * [ 141 * nr_buckets 142 * live sectors (in buckets of that data type) 143 * sectors of internal fragmentation 144 * ] 145 * 146 * XXX: live sectors should've been done differently, you can have multiple data 147 * types in the same bucket (user, stripe, cached) and this collapses them to 148 * the bucket data type, and makes the internal fragmentation counter redundant 149 */ 150 struct bch_acct_dev_data_type { 151 __u8 dev; 152 __u8 data_type; 153 }; 154 155 /* 156 * Compression type fields: 157 * [ 158 * number of extents 159 * uncompressed size 160 * compressed size 161 * ] 162 * 163 * Compression ratio, average extent size (fragmentation). 164 */ 165 struct bch_acct_compression { 166 __u8 type; 167 }; 168 169 /* 170 * On disk usage by snapshot id; counts same values as replicas counter, but 171 * aggregated differently 172 */ 173 struct bch_acct_snapshot { 174 __u32 id; 175 } __packed; 176 177 struct bch_acct_btree { 178 __u32 id; 179 } __packed; 180 181 /* 182 * inum counter fields: 183 * [ 184 * number of extents 185 * sum of extent sizes - bkey size 186 * this field is similar to inode.bi_sectors, except here extents in 187 * different snapshots but the same inode number are all collapsed to the 188 * same counter 189 * sum of on disk size - same values tracked by replicas counters 190 * ] 191 * 192 * This tracks on disk fragmentation. 193 */ 194 struct bch_acct_inum { 195 __u64 inum; 196 } __packed; 197 198 /* 199 * Simple counter of the amount of data (on disk sectors) rebalance needs to 200 * move, extents counted here are also in the rebalance_work btree. 201 */ 202 struct bch_acct_rebalance_work { 203 }; 204 205 struct disk_accounting_pos { 206 union { 207 struct { 208 __u8 type; 209 union { 210 struct bch_acct_nr_inodes nr_inodes; 211 struct bch_acct_persistent_reserved persistent_reserved; 212 struct bch_replicas_entry_v1 replicas; 213 struct bch_acct_dev_data_type dev_data_type; 214 struct bch_acct_compression compression; 215 struct bch_acct_snapshot snapshot; 216 struct bch_acct_btree btree; 217 struct bch_acct_rebalance_work rebalance_work; 218 struct bch_acct_inum inum; 219 } __packed; 220 } __packed; 221 struct bpos _pad; 222 }; 223 }; 224 225 #endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ 226