/*
 * fs/logfs/gc.c	- garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 */
#include "logfs.h"
#include <linux/sched.h>
#include <linux/slab.h>

/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big.  A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events.  A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time.  If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES	2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
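
/*
 * For scale: typical NAND flash is rated for on the order of 10^4..10^5
 * erase cycles, so a WL_DELTA of 397 stays comfortably below the
 * "somewhat below 10% of maximum erase count" guideline above.
 */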

static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}

/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}
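
/*
 * Worked example, assuming (purely for illustration) s_ifile_levels == 3
 * and s_iblock_levels == 4: file data at gc_level 0 gets distance
 * 3 + 4 - 0 = 7, the top-most indirect block level (gc_level 3) gets
 * distance 4, and the ifile levels 6..9 map to distances 3..0.  Smaller
 * distance means closer to the root of the tree.
 */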

static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved.  Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}

static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}

/*
 * Returns the bytes consumed by valid objects in this segment.  Object headers
 * are counted, the segment header is not.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}
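
/*
 * Note on the se.ec_level field, as implied by the shifts above: the
 * segment entry packs erase count and gc level into one 32-bit value,
 * roughly
 *
 *	ec_level = (erase_count << 4) | (level & 0xf);
 *
 * so the erase count lives in the upper 28 bits and the level in the
 * low nibble.
 */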

static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}

static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}
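
/*
 * In short, a segment is cleaned by walking its object headers, asking
 * the object tree whether each object is still referenced
 * (logfs_is_valid_block) and rewriting the live ones to a fresh segment
 * via logfs_cleanse_block.  The returned byte count is expected to match
 * logfs_valid_bytes() for the same segment; __logfs_gc_once() asserts as
 * much with BUG_ON(cleaned != valid).
 */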

static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}
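
/*
 * add_list() keeps candidates in an rbtree sorted either by erase count
 * (wear-leveling lists) or by valid bytes (GC lists).  The list size is
 * bounded by maxcount (give or take one, due to the <= check above);
 * once full, inserting a new candidate evicts the right-most entry, i.e.
 * the worst one, which is handed back to the caller so it can be tried
 * on another list or freed.
 */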

static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}

static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}

u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}

/*
 * We have several lists to manage segments with.  The reserve_list is used to
 * deal with bad blocks.  We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage.  It usually gets the
 * second pick after the reserve_list.  But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list.  If we have
 * to run garbage collection, we pick a candidate from there.  All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while.  We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}

static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}

static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}

static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}

static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}
/*
 * Find the best segment for garbage collection.  Main criterion is
 * the segment requiring the least effort to clean.  Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least-effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort.  Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}
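
/*
 * Example of the comparison above (numbers are illustrative only): if the
 * candidate found so far has 20000 valid bytes, a segment from a level
 * closer to the root only replaces it when its own valid byte count plus
 * LOGFS_MAX_OBJECTSIZE still stays at or below 20000, i.e. when it is
 * cheaper to clean by more than one maximum-sized object.  Smaller
 * differences keep the lower-level candidate, as described in the comment
 * above get_candidate().
 */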

static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}

static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}

/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop.  We want to read a single
			 * block from the segment size on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}
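
/*
 * s_sweeper is a persistent cursor into the segment space: each call scans
 * up to SCAN_RATIO segments following the cursor and reports a wrap back
 * to segment 0 through the return value, which __logfs_gc_pass() below
 * uses to count complete passes over the medium against SCAN_ROUNDS.
 */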

/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data.  LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed.  An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	/*
	 * Doing too many changes to the segfile at once would result
	 * in a large number of aliases.  Write the journal before
	 * things get out of hand.
	 */
	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
		logfs_write_anchor(sb);

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it.  GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary.  However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here.  It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small.  If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}

static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}
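
/*
 * The ratelimit is expressed in units of s_gec (apparently a global erase
 * counter) rather than wall-clock time, so the "minimum time between two
 * wear level events" from the comment at the top of this file effectively
 * means at most one event per WL_RATELIMIT erases.
 */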

static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}

/*
 * The journal needs wear leveling as well.  But moving the journal is an
 * expensive operation so we try to avoid it as much as possible.  And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations.  First we check whether moving the journal would be a
 * significant improvement.  That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal.  Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling.  Journal data is read and rewritten rapidly, comparatively.  So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}
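
/*
 * Worked example for the final check above: with WL_DELTA = 397 the
 * journal is only moved once its least-worn segment (min_journal_ec)
 * exceeds max_reserve_ec by more than 2 * 397 = 794 erases, i.e. twice
 * the slack used for ordinary ostore wear leveling, for the reasons given
 * in the comment above logfs_journal_wl_pass().
 */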

void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(sb);
	__logfs_gc_pass(sb, super->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}
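
/*
 * logfs_gc_pass() is the entry point used by the rest of the filesystem:
 * it flushes the journal when dirty bytes threaten to exhaust free space,
 * runs regular GC with a target of s_total_levels free segments, and
 * finally gives both ostore and journal wear leveling a chance to run.
 */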

static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	gc_level_t gc_level;
	u32 cleaned, valid, ec;
	u32 segno = area->a_segno;
	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);

	if (!area->a_is_open)
		return 0;

	if (super->s_devops->can_write_buf(sb, ofs) == 0)
		return 0;

	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
	/*
	 * The device cannot write back the write buffer.  Most likely the
	 * wbuf was already written out and the system crashed at some point
	 * before the journal commit happened.  In that case we wouldn't have
	 * to do anything.  But if the crash happened before the wbuf was
	 * written out correctly, we must GC this segment.  So assume the
	 * worst and always do the GC run.
	 */
	area->a_is_open = 0;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	cleaned = logfs_gc_segment(sb, segno);
	if (cleaned != valid)
		return -EIO;
	return 0;
}

int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}

static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}

int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}

static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}

void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node.  So we
	 * call the grim visitor to clean up that mess.  Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}