/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/version.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

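/*
 * Worked example (illustrative only, not part of the original code): on a
 * hypothetical 64-bit machine with 4 GiB of directly-mapped memory, the
 * default cache size computed from DM_BUFIO_MEMORY_PERCENT is about
 *
 *	4 GiB * 2 / 100 = ~82 MiB
 *
 * shared by all clients.  A single client using 4 KiB blocks would then be
 * limited to roughly 82 MiB / 4 KiB = ~21000 buffers, and background
 * writeback would start at DM_BUFIO_WRITEBACK_PERCENT of that, i.e. at
 * roughly 15700 dirty buffers.  The exact figures are computed at run time
 * in __cache_size_refresh() and __get_memory_limit() below.
 */
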
/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the I/O.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

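/*
 * Worked example (illustrative only): with DM_BUFIO_HASH_BITS = 20, bits
 * 20..39 of the block number are folded into the low 20 bits, e.g.
 *
 *	DM_BUFIO_HASH(0x123456789)
 *	= ((0x123456789 >> 20) ^ 0x123456789) & 0xfffff
 *	= (0x1234 ^ 0x123456789) & 0xfffff
 *	= 0x1234 ^ 0x56789
 *	= 0x575bd
 *
 * so blocks that share the same low bits still spread across the 2^20
 * hash heads.
 */
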
/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to cache_hash with their hash_list field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too.  They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
#  define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
#  define dm_bufio_cond_resched()		do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = dm_bufio_cache_size;

	barrier();

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;
	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer to the head of dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_del(&b->lru_list);
	list_add(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (error) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is a previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	submit_io(b, WRITE, b->block, write_endio);
}

/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (dm_bufio_cache_size != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over watermark.
 * If we are over threshold_buffers, start freeing buffers.
 * If we're over "limit_buffers", block until we get under the limit.
 */
static void __check_watermark(struct dm_bufio_client *c)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1);
}

/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;
	struct hlist_node *hn;

	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2
};

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, struct dm_buffer **bp,
				     int *need_submit)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b) {
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c);

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	__check_watermark(c);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, bp, &need_submit);
	dm_bufio_unlock(c);

	if (!b || IS_ERR(b))
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

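/*
 * Usage sketch (illustrative only, not part of the original code): a typical
 * caller reads a block, inspects the data and drops its reference.  "c" is a
 * client created with dm_bufio_client_create() and SUPERBLOCK_LOCATION is a
 * hypothetical block number.
 *
 *	struct dm_buffer *buf;
 *	void *data;
 *
 *	data = dm_bufio_read(c, SUPERBLOCK_LOCATION, &buf);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *
 *	... examine the block_size bytes at "data" ...
 *
 *	dm_bufio_release(buf);
 */
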
void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));
	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us.  In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

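/*
 * Usage sketch (illustrative only): committing a modified block.  "buf",
 * "new_contents" and "len" are assumed to come from the caller; the buffer
 * is marked dirty while still held, released, and all dirty buffers are then
 * written out and the disk cache flushed in one call.
 *
 *	memcpy(dm_bufio_get_block_data(buf), new_contents, len);
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *
 *	r = dm_bufio_write_dirty_buffers(c);
 *	if (r)
 *		return r;
 *
 * r carries any asynchronous write error as well as the flush result.
 */
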
/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = REQ_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but not relink it, because that other user needs to have the buffer
 * at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

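/*
 * Usage sketch (illustrative only): a client that passed a non-zero aux_size
 * to dm_bufio_client_create() gets aux_size bytes of private, per-buffer
 * space right after struct dm_buffer.  "struct my_aux" and "buf" are
 * hypothetical.
 *
 *	struct my_aux { unsigned validated; };
 *
 *	c = dm_bufio_client_create(bdev, block_size, 1,
 *				   sizeof(struct my_aux), NULL, NULL);
 *	...
 *	struct my_aux *aux = dm_bufio_get_aux_data(buf);
 *	aux->validated = 1;
 */
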
struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and commit it.
 * If noio is set, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted
 * to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 1;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 1;
	}

	if (b->hold_count)
		return 1;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 0;
}

static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   struct shrink_control *sc)
{
	int l;
	struct dm_buffer *b, *tmp;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
			    !--nr_to_scan)
				return;
		dm_bufio_cond_resched();
	}
}

static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	struct dm_bufio_client *c =
	    container_of(shrinker, struct dm_bufio_client, shrinker);
	unsigned long r;
	unsigned long nr_to_scan = sc->nr_to_scan;

	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return !nr_to_scan ? 0 : -1;

	if (nr_to_scan)
		__scan(c, nr_to_scan, sc);

	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	if (r > INT_MAX)
		r = INT_MAX;

	dm_bufio_unlock(c);

	return r;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.shrink = shrink;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

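/*
 * Usage sketch (illustrative only): creating and tearing down a client for a
 * target that keeps its metadata in 4 KiB blocks.  "metadata_bdev" is a
 * hypothetical struct block_device pointer; block_size must be a power of
 * two and at least 512 bytes, and one buffer is kept reserved here.
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(metadata_bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 *	... dm_bufio_read(), dm_bufio_write_dirty_buffers(), ...
 *
 *	dm_bufio_client_destroy(c);
 */
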
|  | 1492 | /* | 
|  | 1493 | * Free the buffering interface. | 
|  | 1494 | * It is required that there are no references on any buffers. | 
|  | 1495 | */ | 
|  | 1496 | void dm_bufio_client_destroy(struct dm_bufio_client *c) | 
|  | 1497 | { | 
|  | 1498 | unsigned i; | 
|  | 1499 |  | 
|  | 1500 | drop_buffers(c); | 
|  | 1501 |  | 
|  | 1502 | unregister_shrinker(&c->shrinker); | 
|  | 1503 |  | 
|  | 1504 | mutex_lock(&dm_bufio_clients_lock); | 
|  | 1505 |  | 
|  | 1506 | list_del(&c->client_list); | 
|  | 1507 | dm_bufio_client_count--; | 
|  | 1508 | __cache_size_refresh(); | 
|  | 1509 |  | 
|  | 1510 | mutex_unlock(&dm_bufio_clients_lock); | 
|  | 1511 |  | 
|  | 1512 | for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) | 
|  | 1513 | BUG_ON(!hlist_empty(&c->cache_hash[i])); | 
|  | 1514 |  | 
|  | 1515 | BUG_ON(c->need_reserved_buffers); | 
|  | 1516 |  | 
|  | 1517 | while (!list_empty(&c->reserved_buffers)) { | 
|  | 1518 | struct dm_buffer *b = list_entry(c->reserved_buffers.next, | 
|  | 1519 | struct dm_buffer, lru_list); | 
|  | 1520 | list_del(&b->lru_list); | 
|  | 1521 | free_buffer(b); | 
|  | 1522 | } | 
|  | 1523 |  | 
|  | 1524 | for (i = 0; i < LIST_SIZE; i++) | 
|  | 1525 | if (c->n_buffers[i]) | 
|  | 1526 | DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); | 
|  | 1527 |  | 
|  | 1528 | for (i = 0; i < LIST_SIZE; i++) | 
|  | 1529 | BUG_ON(c->n_buffers[i]); | 
|  | 1530 |  | 
|  | 1531 | dm_io_client_destroy(c->dm_io); | 
|  | 1532 | vfree(c->cache_hash); | 
|  | 1533 | kfree(c); | 
|  | 1534 | } | 
|  | 1535 | EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); | 
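
/*
 * Illustrative sketch (not compiled, not part of the driver): how a caller
 * such as a device-mapper target is expected to use this interface.  The
 * function name, the "example_bdev" argument and the 4KiB block size are
 * placeholders chosen for the example, not anything this file defines.
 */
#if 0
static int example_use_of_dm_bufio(struct block_device *example_bdev)
{
	struct dm_bufio_client *c;
	struct dm_buffer *b;
	void *data;

	/* One client per device: 4KiB blocks, one reserved buffer, no aux data or callbacks. */
	c = dm_bufio_client_create(example_bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/* Read block 0 through the cache; the buffer must be released before the client is destroyed. */
	data = dm_bufio_read(c, 0, &b);
	if (!IS_ERR_OR_NULL(data)) {
		/* ... use 'data' ... */
		dm_bufio_release(b);
	}

	/* Only legal once every buffer reference has been released. */
	dm_bufio_client_destroy(c);
	return 0;
}
#endif
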
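/*
 * Walk all clients and, for each one whose lock can be taken without
 * blocking, free clean buffers that have not been used for dm_bufio_max_age
 * seconds.  The clean LRU is scanned from its oldest end and the scan stops
 * at the first buffer that is still too young or still in use.
 */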
static void cleanup_old_buffers(void)
{
	unsigned long max_age = dm_bufio_max_age;
	struct dm_bufio_client *c;

	barrier();

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

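/*
 * Periodic housekeeping: age out old clean buffers, then re-arm the delayed
 * work so it runs again in DM_BUFIO_WORK_TIMER_SECS seconds.
 */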
static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif
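	/*
	 * Worked example of the limit computed above (illustrative numbers,
	 * assuming 4KiB pages): with 4GiB of low memory,
	 * totalram_pages - totalhigh_pages is about 1048576 pages, so the
	 * DM_BUFIO_MEMORY_PERCENT (2%) limit comes to roughly 80MiB.  On a
	 * 32-bit machine with a 128MiB vmalloc arena, the
	 * DM_BUFIO_VMALLOC_PERCENT (25%) cap is only 32MiB and wins instead.
	 */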

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

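	/*
	 * Every client must have been destroyed and all buffer memory
	 * returned before the module is unloaded; any non-zero counter below
	 * is a leak, so report it and BUG() to make the failure loud.
	 */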
	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

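/*
 * Tunables and accounting counters exported as module parameters (visible,
 * for example, under /sys/module/dm_bufio/parameters/).  The parameters
 * marked S_IWUSR can be changed at runtime.
 */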
module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");