/*
 * fs/logfs/gc.c	- garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 */
#include "logfs.h"
#include <linux/sched.h>
#include <linux/slab.h>

/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big.  A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events.  A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time.  If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES	2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */

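/* Returns the number of segments currently on the free list */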
static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}

/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}

static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved.  Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}

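/*
 * Bad segment handling is not implemented yet; running into a segment with
 * a broken header or object header is treated as a fatal bug.
 */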
static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}

/*
 * Returns the bytes consumed by valid objects in this segment.  Object headers
 * are counted, the segment header is not.  Bad or reserved segments return
 * RESERVED instead.  The segment entry's ec_level field packs the erase count
 * in the upper bits and the gc level in the low nibble; both are returned
 * through *ec and *gc_level.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}

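/*
 * Move one still-valid object out of the segment being collected by
 * rewriting the corresponding block; the copy in the old segment becomes
 * obsolete as a result.
 */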
static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}

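/*
 * Garbage-collect a single segment: walk the object headers from the start
 * of the segment, rewrite every object that is still valid and count the
 * bytes that were freed.  Returns the number of cleaned bytes, which should
 * match the segment's valid byte count.
 */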
static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}

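/*
 * Insert a candidate into the list's rb-tree, ordered either by erase count
 * or by valid bytes.  If the list has grown beyond its maximum size, the
 * worst (last) candidate is evicted and returned to the caller; otherwise
 * NULL is returned.
 */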
static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}

static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}

static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}

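/*
 * Remove the best (first) candidate from a list and return its segment
 * number.  If @ec is non-NULL, the candidate's erase count is returned
 * through it as well.
 */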
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}

/*
 * We have several lists to manage segments with.  The reserve_list is used to
 * deal with bad blocks.  We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage.  It usually gets the
 * second pick after the reserve_list.  But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list.  If we have
 * to run garbage collection, we pick a candidate from there.  All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while.  We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}

static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}

static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}

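/*
 * Refresh the candidate information for one segment: drop any stale entry,
 * re-read valid bytes, erase count and gc level, and re-add the segment to
 * the appropriate candidate lists.  Reserved segments are skipped.
 */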
static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}

static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}

/*
 * Find the best segment for garbage collection.  Main criterion is
 * the segment requiring the least effort to clean.  Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort.  Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}

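/*
 * Garbage-collect a single candidate segment and verify that all of its
 * valid bytes were cleaned.  Returns 1 if a segment was collected, 0 if
 * no candidate was available.
 */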
static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}

static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}

/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop.  We want to read a single
			 * block from the segment file on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}

/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data.  LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed.  An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	/*
	 * Doing too many changes to the segfile at once would result
	 * in a large number of aliases.  Write the journal before
	 * things get out of hand.
	 */
	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
		logfs_write_anchor(sb);

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it.  GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary.  However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here.  It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small.  If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}

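/*
 * Rate-limit wear leveling by the global erase count.  Returns 0 when a
 * wear level event is due and schedules the next one WL_RATELIMIT erases
 * later; returns 1 when the caller should skip this pass.
 */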
static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}

static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}

/*
 * The journal needs wear leveling as well.  But moving the journal is an
 * expensive operation so we try to avoid it as much as possible.  And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations.  First we check whether moving the journal would be a
 * significant improvement.  That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal.  Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling.  Journal data is read and rewritten rapidly, comparatively.  So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}

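/*
 * Main garbage collection entry point.  Writes the journal if dirty data is
 * about to saturate the remaining free space, frees segments until the
 * target of s_total_levels is reached, and finally runs the wear leveling
 * passes for the ostore and the journal.
 */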
void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(sb);
	__logfs_gc_pass(sb, super->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}

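/*
 * After a crash, an open area may contain a write that never fully reached
 * the medium.  If the write buffer for the area can no longer be written,
 * assume the worst and garbage-collect the affected segment.
 */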
static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	gc_level_t gc_level;
	u32 cleaned, valid, ec;
	u32 segno = area->a_segno;
	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);

	if (!area->a_is_open)
		return 0;

	if (super->s_devops->can_write_buf(sb, ofs) == 0)
		return 0;

	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
	/*
	 * The device cannot write back the write buffer.  Most likely the
	 * wbuf was already written out and the system crashed at some point
	 * before the journal commit happened.  In that case we wouldn't have
	 * to do anything.  But if the crash happened before the wbuf was
	 * written out correctly, we must GC this segment.  So assume the
	 * worst and always do the GC run.
	 */
	area->a_is_open = 0;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	cleaned = logfs_gc_segment(sb, segno);
	if (cleaned != valid)
		return -EIO;
	return 0;
}

int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}

static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}

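/*
 * Set up the candidate btree and the candidate lists.  The free list gets
 * extra room (LIST_SIZE + SCAN_RATIO) compared to the per-level low lists
 * and the ec list; the reserve list is sized to the bad segment reserve.
 */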
int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}

static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}

void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node.  So we
	 * call the grim visitor to clean up that mess.  Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}