| Artem Bityutskiy | 1e51764 | 2008-07-14 19:08:37 +0300 | [diff] [blame] | 1 | /* | 
 | 2 |  * This file is part of UBIFS. | 
 | 3 |  * | 
 | 4 |  * Copyright (C) 2006-2008 Nokia Corporation. | 
 | 5 |  * | 
 | 6 |  * This program is free software; you can redistribute it and/or modify it | 
 | 7 |  * under the terms of the GNU General Public License version 2 as published by | 
 | 8 |  * the Free Software Foundation. | 
 | 9 |  * | 
 | 10 |  * This program is distributed in the hope that it will be useful, but WITHOUT | 
 | 11 |  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | 
 | 12 |  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for | 
 | 13 |  * more details. | 
 | 14 |  * | 
 | 15 |  * You should have received a copy of the GNU General Public License along with | 
 | 16 |  * this program; if not, write to the Free Software Foundation, Inc., 51 | 
 | 17 |  * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 
 | 18 |  * | 
 | 19 |  * Authors: Artem Bityutskiy (Битюцкий Артём) | 
 | 20 |  *          Adrian Hunter | 
 | 21 |  */ | 
 | 22 |  | 
 | 23 | /* | 
 | 24 |  * This file implements UBIFS shrinker which evicts clean znodes from the TNC | 
 | 25 |  * tree when Linux VM needs more RAM. | 
 | 26 |  * | 
 | 27 |  * We do not implement any LRU lists to find oldest znodes to free because it | 
 | 28 |  * would add additional overhead to the file system fast paths. So the shrinker | 
 | 29 |  * just walks the TNC tree when searching for znodes to free. | 
 | 30 |  * | 
 | 31 |  * If the root of a TNC sub-tree is clean and old enough, then the children are | 
 | 32 |  * also clean and old enough. So the shrinker walks the TNC in level order and | 
 | 33 |  * dumps entire sub-trees. | 
 | 34 |  * | 
 | 35 |  * The age of znodes is just the time-stamp when they were last looked at. | 
 | 36 |  * The current shrinker first tries to evict old znodes, then young ones. | 
 | 37 |  * | 
 | 38 |  * Since the shrinker is global, it has to protect against races with FS | 
 | 39 |  * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. | 
 | 40 |  */ | 
 | 41 |  | 
 | 42 | #include "ubifs.h" | 
 | 43 |  | 
 | 44 | /* List of all UBIFS file-system instances */ | 
 | 45 | LIST_HEAD(ubifs_infos); | 
 | 46 |  | 
 | 47 | /* | 
 | 48 |  * We number each shrinker run and record the number on the ubifs_info structure | 
 | 49 |  * so that we can easily work out which ubifs_info structures have already been | 
 | 50 |  * done by the current run. | 
 | 51 |  */ | 
 | 52 | static unsigned int shrinker_run_no; | 
 | 53 |  | 
 | 54 | /* Protects 'ubifs_infos' list */ | 
 | 55 | DEFINE_SPINLOCK(ubifs_infos_lock); | 
 | 56 |  | 
 | 57 | /* Global clean znode counter (for all mounted UBIFS instances) */ | 
 | 58 | atomic_long_t ubifs_clean_zn_cnt; | 
 | 59 |  | 
 | 60 | /** | 
 | 61 |  * shrink_tnc - shrink TNC tree. | 
 | 62 |  * @c: UBIFS file-system description object | 
 | 63 |  * @nr: number of znodes to free | 
 | 64 |  * @age: the age of znodes to free | 
 | 65 |  * @contention: if any contention, this is set to %1 | 
 | 66 |  * | 
 | 67 |  * This function traverses TNC tree and frees clean znodes. It does not free | 
 | 68 |  * clean znodes which younger then @age. Returns number of freed znodes. | 
 | 69 |  */ | 
 | 70 | static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) | 
 | 71 | { | 
 | 72 | 	int total_freed = 0; | 
 | 73 | 	struct ubifs_znode *znode, *zprev; | 
 | 74 | 	int time = get_seconds(); | 
 | 75 |  | 
 | 76 | 	ubifs_assert(mutex_is_locked(&c->umount_mutex)); | 
 | 77 | 	ubifs_assert(mutex_is_locked(&c->tnc_mutex)); | 
 | 78 |  | 
 | 79 | 	if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) | 
 | 80 | 		return 0; | 
 | 81 |  | 
 | 82 | 	/* | 
 | 83 | 	 * Traverse the TNC tree in levelorder manner, so that it is possible | 
 | 84 | 	 * to destroy large sub-trees. Indeed, if a znode is old, then all its | 
 | 85 | 	 * children are older or of the same age. | 
 | 86 | 	 * | 
 | 87 | 	 * Note, we are holding 'c->tnc_mutex', so we do not have to lock the | 
 | 88 | 	 * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is | 
 | 89 | 	 * changed only when the 'c->tnc_mutex' is held. | 
 | 90 | 	 */ | 
 | 91 | 	zprev = NULL; | 
 | 92 | 	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); | 
 | 93 | 	while (znode && total_freed < nr && | 
 | 94 | 	       atomic_long_read(&c->clean_zn_cnt) > 0) { | 
 | 95 | 		int freed; | 
 | 96 |  | 
 | 97 | 		/* | 
 | 98 | 		 * If the znode is clean, but it is in the 'c->cnext' list, this | 
 | 99 | 		 * means that this znode has just been written to flash as a | 
 | 100 | 		 * part of commit and was marked clean. They will be removed | 
 | 101 | 		 * from the list at end commit. We cannot change the list, | 
 | 102 | 		 * because it is not protected by any mutex (design decision to | 
 | 103 | 		 * make commit really independent and parallel to main I/O). So | 
 | 104 | 		 * we just skip these znodes. | 
 | 105 | 		 * | 
 | 106 | 		 * Note, the 'clean_zn_cnt' counters are not updated until | 
 | 107 | 		 * after the commit, so the UBIFS shrinker does not report | 
 | 108 | 		 * the znodes which are in the 'c->cnext' list as freeable. | 
 | 109 | 		 * | 
 | 110 | 		 * Also note, if the root of a sub-tree is not in 'c->cnext', | 
 | 111 | 		 * then the whole sub-tree is not in 'c->cnext' as well, so it | 
 | 112 | 		 * is safe to dump whole sub-tree. | 
 | 113 | 		 */ | 
 | 114 |  | 
 | 115 | 		if (znode->cnext) { | 
 | 116 | 			/* | 
 | 117 | 			 * Very soon these znodes will be removed from the list | 
 | 118 | 			 * and become freeable. | 
 | 119 | 			 */ | 
 | 120 | 			*contention = 1; | 
 | 121 | 		} else if (!ubifs_zn_dirty(znode) && | 
 | 122 | 			   abs(time - znode->time) >= age) { | 
 | 123 | 			if (znode->parent) | 
 | 124 | 				znode->parent->zbranch[znode->iip].znode = NULL; | 
 | 125 | 			else | 
 | 126 | 				c->zroot.znode = NULL; | 
 | 127 |  | 
 | 128 | 			freed = ubifs_destroy_tnc_subtree(znode); | 
 | 129 | 			atomic_long_sub(freed, &ubifs_clean_zn_cnt); | 
 | 130 | 			atomic_long_sub(freed, &c->clean_zn_cnt); | 
 | 131 | 			ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); | 
 | 132 | 			total_freed += freed; | 
 | 133 | 			znode = zprev; | 
 | 134 | 		} | 
 | 135 |  | 
 | 136 | 		if (unlikely(!c->zroot.znode)) | 
 | 137 | 			break; | 
 | 138 |  | 
 | 139 | 		zprev = znode; | 
 | 140 | 		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); | 
 | 141 | 		cond_resched(); | 
 | 142 | 	} | 
 | 143 |  | 
 | 144 | 	return total_freed; | 
 | 145 | } | 
 | 146 |  | 
 | 147 | /** | 
 | 148 |  * shrink_tnc_trees - shrink UBIFS TNC trees. | 
 | 149 |  * @nr: number of znodes to free | 
 | 150 |  * @age: the age of znodes to free | 
 | 151 |  * @contention: if any contention, this is set to %1 | 
 | 152 |  * | 
 | 153 |  * This function walks the list of mounted UBIFS file-systems and frees clean | 
 | 154 |  * znodes which are older then @age, until at least @nr znodes are freed. | 
 | 155 |  * Returns the number of freed znodes. | 
 | 156 |  */ | 
 | 157 | static int shrink_tnc_trees(int nr, int age, int *contention) | 
 | 158 | { | 
 | 159 | 	struct ubifs_info *c; | 
 | 160 | 	struct list_head *p; | 
 | 161 | 	unsigned int run_no; | 
 | 162 | 	int freed = 0; | 
 | 163 |  | 
 | 164 | 	spin_lock(&ubifs_infos_lock); | 
 | 165 | 	do { | 
 | 166 | 		run_no = ++shrinker_run_no; | 
 | 167 | 	} while (run_no == 0); | 
 | 168 | 	/* Iterate over all mounted UBIFS file-systems and try to shrink them */ | 
 | 169 | 	p = ubifs_infos.next; | 
 | 170 | 	while (p != &ubifs_infos) { | 
 | 171 | 		c = list_entry(p, struct ubifs_info, infos_list); | 
 | 172 | 		/* | 
 | 173 | 		 * We move the ones we do to the end of the list, so we stop | 
 | 174 | 		 * when we see one we have already done. | 
 | 175 | 		 */ | 
 | 176 | 		if (c->shrinker_run_no == run_no) | 
 | 177 | 			break; | 
 | 178 | 		if (!mutex_trylock(&c->umount_mutex)) { | 
 | 179 | 			/* Some un-mount is in progress, try next FS */ | 
 | 180 | 			*contention = 1; | 
 | 181 | 			p = p->next; | 
 | 182 | 			continue; | 
 | 183 | 		} | 
 | 184 | 		/* | 
 | 185 | 		 * We're holding 'c->umount_mutex', so the file-system won't go | 
 | 186 | 		 * away. | 
 | 187 | 		 */ | 
 | 188 | 		if (!mutex_trylock(&c->tnc_mutex)) { | 
 | 189 | 			mutex_unlock(&c->umount_mutex); | 
 | 190 | 			*contention = 1; | 
 | 191 | 			p = p->next; | 
 | 192 | 			continue; | 
 | 193 | 		} | 
 | 194 | 		spin_unlock(&ubifs_infos_lock); | 
 | 195 | 		/* | 
 | 196 | 		 * OK, now we have TNC locked, the file-system cannot go away - | 
 | 197 | 		 * it is safe to reap the cache. | 
 | 198 | 		 */ | 
 | 199 | 		c->shrinker_run_no = run_no; | 
 | 200 | 		freed += shrink_tnc(c, nr, age, contention); | 
 | 201 | 		mutex_unlock(&c->tnc_mutex); | 
 | 202 | 		spin_lock(&ubifs_infos_lock); | 
 | 203 | 		/* Get the next list element before we move this one */ | 
 | 204 | 		p = p->next; | 
 | 205 | 		/* | 
 | 206 | 		 * Move this one to the end of the list to provide some | 
 | 207 | 		 * fairness. | 
 | 208 | 		 */ | 
 | 209 | 		list_del(&c->infos_list); | 
 | 210 | 		list_add_tail(&c->infos_list, &ubifs_infos); | 
 | 211 | 		mutex_unlock(&c->umount_mutex); | 
 | 212 | 		if (freed >= nr) | 
 | 213 | 			break; | 
 | 214 | 	} | 
 | 215 | 	spin_unlock(&ubifs_infos_lock); | 
 | 216 | 	return freed; | 
 | 217 | } | 
 | 218 |  | 
 | 219 | /** | 
 | 220 |  * kick_a_thread - kick a background thread to start commit. | 
 | 221 |  * | 
 | 222 |  * This function kicks a background thread to start background commit. Returns | 
 | 223 |  * %-1 if a thread was kicked or there is another reason to assume the memory | 
 | 224 |  * will soon be freed or become freeable. If there are no dirty znodes, returns | 
 | 225 |  * %0. | 
 | 226 |  */ | 
 | 227 | static int kick_a_thread(void) | 
 | 228 | { | 
 | 229 | 	int i; | 
 | 230 | 	struct ubifs_info *c; | 
 | 231 |  | 
 | 232 | 	/* | 
 | 233 | 	 * Iterate over all mounted UBIFS file-systems and find out if there is | 
 | 234 | 	 * already an ongoing commit operation there. If no, then iterate for | 
 | 235 | 	 * the second time and initiate background commit. | 
 | 236 | 	 */ | 
 | 237 | 	spin_lock(&ubifs_infos_lock); | 
 | 238 | 	for (i = 0; i < 2; i++) { | 
 | 239 | 		list_for_each_entry(c, &ubifs_infos, infos_list) { | 
 | 240 | 			long dirty_zn_cnt; | 
 | 241 |  | 
 | 242 | 			if (!mutex_trylock(&c->umount_mutex)) { | 
 | 243 | 				/* | 
 | 244 | 				 * Some un-mount is in progress, it will | 
 | 245 | 				 * certainly free memory, so just return. | 
 | 246 | 				 */ | 
 | 247 | 				spin_unlock(&ubifs_infos_lock); | 
 | 248 | 				return -1; | 
 | 249 | 			} | 
 | 250 |  | 
 | 251 | 			dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); | 
 | 252 |  | 
 | 253 | 			if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || | 
 | 254 | 			    c->ro_media) { | 
 | 255 | 				mutex_unlock(&c->umount_mutex); | 
 | 256 | 				continue; | 
 | 257 | 			} | 
 | 258 |  | 
 | 259 | 			if (c->cmt_state != COMMIT_RESTING) { | 
 | 260 | 				spin_unlock(&ubifs_infos_lock); | 
 | 261 | 				mutex_unlock(&c->umount_mutex); | 
 | 262 | 				return -1; | 
 | 263 | 			} | 
 | 264 |  | 
 | 265 | 			if (i == 1) { | 
 | 266 | 				list_del(&c->infos_list); | 
 | 267 | 				list_add_tail(&c->infos_list, &ubifs_infos); | 
 | 268 | 				spin_unlock(&ubifs_infos_lock); | 
 | 269 |  | 
 | 270 | 				ubifs_request_bg_commit(c); | 
 | 271 | 				mutex_unlock(&c->umount_mutex); | 
 | 272 | 				return -1; | 
 | 273 | 			} | 
 | 274 | 			mutex_unlock(&c->umount_mutex); | 
 | 275 | 		} | 
 | 276 | 	} | 
 | 277 | 	spin_unlock(&ubifs_infos_lock); | 
 | 278 |  | 
 | 279 | 	return 0; | 
 | 280 | } | 
 | 281 |  | 
 | 282 | int ubifs_shrinker(int nr, gfp_t gfp_mask) | 
 | 283 | { | 
 | 284 | 	int freed, contention = 0; | 
 | 285 | 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); | 
 | 286 |  | 
 | 287 | 	if (nr == 0) | 
 | 288 | 		return clean_zn_cnt; | 
 | 289 |  | 
 | 290 | 	if (!clean_zn_cnt) { | 
 | 291 | 		/* | 
 | 292 | 		 * No clean znodes, nothing to reap. All we can do in this case | 
 | 293 | 		 * is to kick background threads to start commit, which will | 
 | 294 | 		 * probably make clean znodes which, in turn, will be freeable. | 
 | 295 | 		 * And we return -1 which means will make VM call us again | 
 | 296 | 		 * later. | 
 | 297 | 		 */ | 
 | 298 | 		dbg_tnc("no clean znodes, kick a thread"); | 
 | 299 | 		return kick_a_thread(); | 
 | 300 | 	} | 
 | 301 |  | 
 | 302 | 	freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); | 
 | 303 | 	if (freed >= nr) | 
 | 304 | 		goto out; | 
 | 305 |  | 
 | 306 | 	dbg_tnc("not enough old znodes, try to free young ones"); | 
 | 307 | 	freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); | 
 | 308 | 	if (freed >= nr) | 
 | 309 | 		goto out; | 
 | 310 |  | 
 | 311 | 	dbg_tnc("not enough young znodes, free all"); | 
 | 312 | 	freed += shrink_tnc_trees(nr - freed, 0, &contention); | 
 | 313 |  | 
 | 314 | 	if (!freed && contention) { | 
 | 315 | 		dbg_tnc("freed nothing, but contention"); | 
 | 316 | 		return -1; | 
 | 317 | 	} | 
 | 318 |  | 
 | 319 | out: | 
 | 320 | 	dbg_tnc("%d znodes were freed, requested %d", freed, nr); | 
 | 321 | 	return freed; | 
 | 322 | } |