Blame - fs/buffer.c - android_kernel_htc_msm8960

blob: 517860f2d75b0d575e65af71f54745840a3a28ad [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required on older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
				27	#include <linux/smp_lock.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	28	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	29	#include <linux/blkdev.h>
				30	#include <linux/file.h>
				31	#include <linux/quotaops.h>
				32	#include <linux/highmem.h>
				33	#include <linux/module.h>
				34	#include <linux/writeback.h>
				35	#include <linux/hash.h>
				36	#include <linux/suspend.h>
				37	#include <linux/buffer_head.h>
				38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	43	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	44
				45	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
				46	static void invalidate_bh_lrus(void);
				47
				48	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				49
				50	inline void
				51	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				52	{
				53	bh->b_end_io = handler;
				54	bh->b_private = private;
				55	}
				56
				57	static int sync_buffer(void *word)
				58	{
				59	struct block_device *bd;
				60	struct buffer_head *bh
				61	= container_of(word, struct buffer_head, b_state);
				62
				63	smp_mb();
				64	bd = bh->b_bdev;
				65	if (bd)
				66	blk_run_address_space(bd->bd_inode->i_mapping);
				67	io_schedule();
				68	return 0;
				69	}
				70
				71	void fastcall __lock_buffer(struct buffer_head *bh)
				72	{
				73	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				74	TASK_UNINTERRUPTIBLE);
				75	}
				76	EXPORT_SYMBOL(__lock_buffer);
				77
				78	void fastcall unlock_buffer(struct buffer_head *bh)
				79	{
				80	clear_buffer_locked(bh);
				81	smp_mb__after_clear_bit();
				82	wake_up_bit(&bh->b_state, BH_Lock);
				83	}
				84
				85	/*
				86	* Block until a buffer comes unlocked. This doesn't stop it
				87	* from becoming locked again - you have to lock it yourself
				88	* if you want to preserve its state.
				89	*/
				90	void __wait_on_buffer(struct buffer_head * bh)
				91	{
				92	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				93	}
				94
				95	static void
				96	__clear_page_buffers(struct page *page)
				97	{
				98	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	99	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	100	page_cache_release(page);
				101	}
				102
				103	static void buffer_io_error(struct buffer_head *bh)
				104	{
				105	char b[BDEVNAME_SIZE];
				106
				107	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				108	bdevname(bh->b_bdev, b),
				109	(unsigned long long)bh->b_blocknr);
				110	}
				111
				112	/*
				113	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				114	* unlock the buffer. This is what ll_rw_block uses too.
				115	*/
				116	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				117	{
				118	if (uptodate) {
				119	set_buffer_uptodate(bh);
				120	} else {
				121	/* This happens, due to failed READA attempts. */
				122	clear_buffer_uptodate(bh);
				123	}
				124	unlock_buffer(bh);
				125	put_bh(bh);
				126	}
				127
				128	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				129	{
				130	char b[BDEVNAME_SIZE];
				131
				132	if (uptodate) {
				133	set_buffer_uptodate(bh);
				134	} else {
				135	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				136	buffer_io_error(bh);
				137	printk(KERN_WARNING "lost page write due to "
				138	"I/O error on %s\n",
				139	bdevname(bh->b_bdev, b));
				140	}
				141	set_buffer_write_io_error(bh);
				142	clear_buffer_uptodate(bh);
				143	}
				144	unlock_buffer(bh);
				145	put_bh(bh);
				146	}
				147
				148	/*
				149	* Write out and wait upon all the dirty data associated with a block
				150	* device via its mapping. Does not take the superblock lock.
				151	*/
				152	int sync_blockdev(struct block_device *bdev)
				153	{
				154	int ret = 0;
				155
OGAWA Hirofumi	28fd129	2006-01-08 01:02:14 -0800	[diff] [blame]	156	if (bdev)
				157	ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	158	return ret;
				159	}
				160	EXPORT_SYMBOL(sync_blockdev);
				161
/*
 * Write out and wait upon all dirty data associated with this device:
 * filesystem data as well as the underlying block device.  Takes the
 * superblock lock.
 *
 * When a superblock is found on @bdev it is synced with fsync_super() and
 * released again; with no superblock we fall back to sync_blockdev().
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res;

	if (!sb)
		return sync_blockdev(bdev);

	res = fsync_super(sb);
	drop_super(sb);
	return res;
}
				177
				178	/**
				179	* freeze_bdev -- lock a filesystem and force it into a consistent state
				180	* @bdev: blockdevice to lock
				181	*
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	182	* This takes the block device bd_mount_mutex to make sure no new mounts
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	183	* happen on bdev until thaw_bdev() is called.
				184	* If a superblock is found on this device, we take the s_umount semaphore
				185	* on it to make sure nobody unmounts until the snapshot creation is done.
				186	*/
				187	struct super_block freeze_bdev(struct block_device bdev)
				188	{
				189	struct super_block *sb;
				190
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	191	mutex_lock(&bdev->bd_mount_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	192	sb = get_super(bdev);
				193	if (sb && !(sb->s_flags & MS_RDONLY)) {
				194	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	195	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	196
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	197	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	198
				199	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	200	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	201
				202	sync_blockdev(sb->s_bdev);
				203
				204	if (sb->s_op->write_super_lockfs)
				205	sb->s_op->write_super_lockfs(sb);
				206	}
				207
				208	sync_blockdev(bdev);
				209	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				210	}
				211	EXPORT_SYMBOL(freeze_bdev);
				212
				213	/**
				214	* thaw_bdev -- unlock filesystem
				215	* @bdev: blockdevice to unlock
				216	* @sb: associated superblock
				217	*
				218	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				219	*/
				220	void thaw_bdev(struct block_device bdev, struct super_block sb)
				221	{
				222	if (sb) {
				223	BUG_ON(sb->s_bdev != bdev);
				224
				225	if (sb->s_op->unlockfs)
				226	sb->s_op->unlockfs(sb);
				227	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	228	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	229	wake_up(&sb->s_wait_unfrozen);
				230	drop_super(sb);
				231	}
				232
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	233	mutex_unlock(&bdev->bd_mount_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	234	}
				235	EXPORT_SYMBOL(thaw_bdev);
				236
				237	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	238	* Various filesystems appear to want __find_get_block to be non-blocking.
				239	* But it's the page lock which protects the buffers. To get around this,
				240	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				241	* private_lock.
				242	*
				243	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				244	* may be quite high. This code could TryLock the page, and if that
				245	* succeeds, there is no need to take private_lock. (But if
				246	* private_lock is contended then so is mapping->tree_lock).
				247	*/
				248	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	249	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	250	{
				251	struct inode *bd_inode = bdev->bd_inode;
				252	struct address_space *bd_mapping = bd_inode->i_mapping;
				253	struct buffer_head *ret = NULL;
				254	pgoff_t index;
				255	struct buffer_head *bh;
				256	struct buffer_head *head;
				257	struct page *page;
				258	int all_mapped = 1;
				259
				260	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				261	page = find_get_page(bd_mapping, index);
				262	if (!page)
				263	goto out;
				264
				265	spin_lock(&bd_mapping->private_lock);
				266	if (!page_has_buffers(page))
				267	goto out_unlock;
				268	head = page_buffers(page);
				269	bh = head;
				270	do {
				271	if (bh->b_blocknr == block) {
				272	ret = bh;
				273	get_bh(bh);
				274	goto out_unlock;
				275	}
				276	if (!buffer_mapped(bh))
				277	all_mapped = 0;
				278	bh = bh->b_this_page;
				279	} while (bh != head);
				280
				281	/* we might be here because some of the buffers on this page are
				282	* not mapped. This is due to various races between
				283	* file io on the block device and getblk. It gets dealt with
				284	* elsewhere, don't buffer_error if we had some unmapped buffers
				285	*/
				286	if (all_mapped) {
				287	printk("__find_get_block_slow() failed. "
				288	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	289	(unsigned long long)block,
				290	(unsigned long long)bh->b_blocknr);
				291	printk("b_state=0x%08lx, b_size=%zu\n",
				292	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	293	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				294	}
				295	out_unlock:
				296	spin_unlock(&bd_mapping->private_lock);
				297	page_cache_release(page);
				298	out:
				299	return ret;
				300	}
				301
				302	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				303	of fs corruption is going on. Trashing dirty data always imply losing
				304	information that was supposed to be just stored on the physical layer
				305	by the user.
				306
				307	Thus invalidate_buffers in general usage is not allowed to trash
				308	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				309	be preserved. These buffers are simply skipped.
				310
				311	We also skip buffers which are still in use. For example this can
				312	happen if a userspace program is reading the block device.
				313
				314	NOTE: In the case where the user removed a removable-media-disk even if
				315	there's still dirty data not synced on disk (due to a bug in the device driver
				316	or due to an error of the user), by not destroying the dirty buffers we could
				317	generate corruption also on the next media inserted, thus a parameter is
				318	necessary to handle this case in the most safe way possible (trying
				319	to not corrupt also the new disk inserted with the data belonging to
				320	the old now corrupted disk). Also for the ramdisk the natural thing
				321	to do in order to release the ramdisk memory is to destroy dirty buffers.
				322
				323	These are two special cases. Normal usage imply the device driver
				324	to issue a sync on the device (without waiting I/O completion) and
				325	then an invalidate_buffers call that doesn't trash dirty buffers.
				326
				327	For handling cache coherency with the blkdev pagecache the 'update' case
				328	has been introduced. It is needed to re-read from disk any pinned
				329	buffer. NOTE: re-reading from disk is destructive so we can do it only
				330	when we assume nobody is changing the buffercache under our I/O and when
				331	we think the disk contains more recent information than the buffercache.
				332	The update == 1 pass marks the buffers we need to update, the update == 2
				333	pass does the actual I/O. */
				334	void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
				335	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	336	struct address_space *mapping = bdev->bd_inode->i_mapping;
				337
				338	if (mapping->nrpages == 0)
				339	return;
				340
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	341	invalidate_bh_lrus();
				342	/*
				343	* FIXME: what about destroy_dirty_buffers?
				344	* We really want to use invalidate_inode_pages2() for
				345	* that, but not until that's cleaned up.
				346	*/
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	347	invalidate_inode_pages(mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348	}
				349
				350	/*
				351	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				352	*/
				353	static void free_more_memory(void)
				354	{
				355	struct zone **zones;
				356	pg_data_t *pgdat;
				357
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	358	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	359	yield();
				360
KAMEZAWA Hiroyuki	ec936fc	2006-03-27 01:15:59 -0800	[diff] [blame]	361	for_each_online_pgdat(pgdat) {
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	362	zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	363	if (*zones)
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	364	try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	365	}
				366	}
				367
				368	/*
				369	* I/O completion handler for block_read_full_page() - pages
				370	* which come unlocked at the end of I/O.
				371	*/
				372	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				373	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	374	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	375	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	376	struct buffer_head *tmp;
				377	struct page *page;
				378	int page_uptodate = 1;
				379
				380	BUG_ON(!buffer_async_read(bh));
				381
				382	page = bh->b_page;
				383	if (uptodate) {
				384	set_buffer_uptodate(bh);
				385	} else {
				386	clear_buffer_uptodate(bh);
				387	if (printk_ratelimit())
				388	buffer_io_error(bh);
				389	SetPageError(page);
				390	}
				391
				392	/*
				393	* Be _very_ careful from here on. Bad things can happen if
				394	* two buffer heads end IO at almost the same time and both
				395	* decide that the page is now completely done.
				396	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	397	first = page_buffers(page);
				398	local_irq_save(flags);
				399	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	400	clear_buffer_async_read(bh);
				401	unlock_buffer(bh);
				402	tmp = bh;
				403	do {
				404	if (!buffer_uptodate(tmp))
				405	page_uptodate = 0;
				406	if (buffer_async_read(tmp)) {
				407	BUG_ON(!buffer_locked(tmp));
				408	goto still_busy;
				409	}
				410	tmp = tmp->b_this_page;
				411	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	412	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				413	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	414
				415	/*
				416	* If none of the buffers had errors and they are all
				417	* uptodate then we can set the page uptodate.
				418	*/
				419	if (page_uptodate && !PageError(page))
				420	SetPageUptodate(page);
				421	unlock_page(page);
				422	return;
				423
				424	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	425	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				426	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	427	return;
				428	}
				429
				430	/*
				431	* Completion handler for block_write_full_page() - pages which are unlocked
				432	* during I/O, and which have PageWriteback cleared upon I/O completion.
				433	*/
Adrian Bunk	b6cd0b7	2006-06-27 02:53:54 -0700	[diff] [blame]	434	static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	435	{
				436	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	437	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	438	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	439	struct buffer_head *tmp;
				440	struct page *page;
				441
				442	BUG_ON(!buffer_async_write(bh));
				443
				444	page = bh->b_page;
				445	if (uptodate) {
				446	set_buffer_uptodate(bh);
				447	} else {
				448	if (printk_ratelimit()) {
				449	buffer_io_error(bh);
				450	printk(KERN_WARNING "lost page write due to "
				451	"I/O error on %s\n",
				452	bdevname(bh->b_bdev, b));
				453	}
				454	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	455	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	456	clear_buffer_uptodate(bh);
				457	SetPageError(page);
				458	}
				459
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	460	first = page_buffers(page);
				461	local_irq_save(flags);
				462	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				463
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	464	clear_buffer_async_write(bh);
				465	unlock_buffer(bh);
				466	tmp = bh->b_this_page;
				467	while (tmp != bh) {
				468	if (buffer_async_write(tmp)) {
				469	BUG_ON(!buffer_locked(tmp));
				470	goto still_busy;
				471	}
				472	tmp = tmp->b_this_page;
				473	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	474	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				475	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	476	end_page_writeback(page);
				477	return;
				478
				479	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	480	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				481	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	482	return;
				483	}
				484
				485	/*
				486	* If a page's buffers are under async readin (end_buffer_async_read
				487	* completion) then there is a possibility that another thread of
				488	* control could lock one of the buffers after it has completed
				489	* but while some of the other buffers have not completed. This
				490	* locked buffer would confuse end_buffer_async_read() into not unlocking
				491	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				492	* that this buffer is not under async I/O.
				493	*
				494	* The page comes unlocked when it has no locked buffer_async buffers
				495	* left.
				496	*
				497	* PageLocked prevents anyone starting new async I/O reads any of
				498	* the buffers.
				499	*
				500	* PageWriteback is used to prevent simultaneous writeout of the same
				501	* page.
				502	*
				503	* PageLocked prevents anyone from starting writeback of a page which is
				504	* under read I/O (PageWriteback is only ever set against a locked page).
				505	*/
				506	static void mark_buffer_async_read(struct buffer_head *bh)
				507	{
				508	bh->b_end_io = end_buffer_async_read;
				509	set_buffer_async_read(bh);
				510	}
				511
				512	void mark_buffer_async_write(struct buffer_head *bh)
				513	{
				514	bh->b_end_io = end_buffer_async_write;
				515	set_buffer_async_write(bh);
				516	}
				517	EXPORT_SYMBOL(mark_buffer_async_write);
				518
				519
				520	/*
				521	* fs/buffer.c contains helper functions for buffer-backed address space's
				522	* fsync functions. A common requirement for buffer-based filesystems is
				523	* that certain data from the backing blockdev needs to be written out for
				524	* a successful fsync(). For example, ext2 indirect blocks need to be
				525	* written back and waited upon before fsync() returns.
				526	*
				527	* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
				528	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				529	* management of a list of dependent buffers at ->i_mapping->private_list.
				530	*
				531	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				532	* from their controlling inode's queue when they are being freed. But
				533	* try_to_free_buffers() will be operating against the blockdev mapping
				534	* at the time, not against the S_ISREG file which depends on those buffers.
				535	* So the locking for private_list is via the private_lock in the address_space
				536	* which backs the buffers. Which is different from the address_space
				537	* against which the buffers are listed. So for a particular address_space,
				538	* mapping->private_lock does not protect mapping->private_list! In fact,
				539	* mapping->private_list will always be protected by the backing blockdev's
				540	* ->private_lock.
				541	*
				542	* Which introduces a requirement: all buffers on an address_space's
				543	* ->private_list must be from the same address_space: the blockdev's.
				544	*
				545	* address_spaces which do not place buffers at ->private_list via these
				546	* utility functions are free to use private_lock and private_list for
				547	* whatever they want. The only requirement is that list_empty(private_list)
				548	* be true at clear_inode() time.
				549	*
				550	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				551	* filesystems should do that. invalidate_inode_buffers() should just go
				552	* BUG_ON(!list_empty).
				553	*
				554	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				555	* take an address_space, not an inode. And it should be called
				556	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				557	* queued up.
				558	*
				559	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				560	* list if it is already on a list. Because if the buffer is on a list,
				561	* it must already be on the right one. If not, the filesystem is being
				562	* silly. This will save a ton of locking. But first we have to ensure
				563	* that buffers are taken off the old inode's list when they are freed
				564	* (presumably in truncate). That requires careful auditing of all
				565	* filesystems (do it inside bforget()). It could also be done by bringing
				566	* b_inode back.
				567	*/
				568
				569	/*
				570	* The buffer's backing address_space's private_lock must be held
				571	*/
				572	static inline void __remove_assoc_queue(struct buffer_head *bh)
				573	{
				574	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	575	WARN_ON(!bh->b_assoc_map);
				576	if (buffer_write_io_error(bh))
				577	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				578	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	579	}
				580
				581	int inode_has_buffers(struct inode *inode)
				582	{
				583	return !list_empty(&inode->i_data.private_list);
				584	}
				585
				586	/*
				587	* osync is designed to support O_SYNC io. It waits synchronously for
				588	* all already-submitted IO to complete, but does not queue any new
				589	* writes to the disk.
				590	*
				591	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				592	* you dirty the buffers, and then use osync_inode_buffers to wait for
				593	* completion. Any other dirty buffers which are not yet queued for
				594	* write will not be flushed to disk by the osync.
				595	*/
				596	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				597	{
				598	struct buffer_head *bh;
				599	struct list_head *p;
				600	int err = 0;
				601
				602	spin_lock(lock);
				603	repeat:
				604	list_for_each_prev(p, list) {
				605	bh = BH_ENTRY(p);
				606	if (buffer_locked(bh)) {
				607	get_bh(bh);
				608	spin_unlock(lock);
				609	wait_on_buffer(bh);
				610	if (!buffer_uptodate(bh))
				611	err = -EIO;
				612	brelse(bh);
				613	spin_lock(lock);
				614	goto repeat;
				615	}
				616	}
				617	spin_unlock(lock);
				618	return err;
				619	}
				620
				621	/**
				622	* sync_mapping_buffers - write out and wait upon a mapping's "associated"
				623	* buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	624	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	625	*
				626	* Starts I/O against the buffers at mapping->private_list, and waits upon
				627	* that I/O.
				628	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	629	* Basically, this is a convenience function for fsync().
				630	* @mapping is a file or directory which needs those buffers to be written for
				631	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	632	*/
				633	int sync_mapping_buffers(struct address_space *mapping)
				634	{
				635	struct address_space *buffer_mapping = mapping->assoc_mapping;
				636
				637	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				638	return 0;
				639
				640	return fsync_buffers_list(&buffer_mapping->private_lock,
				641	&mapping->private_list);
				642	}
				643	EXPORT_SYMBOL(sync_mapping_buffers);
				644
				645	/*
				646	* Called when we've recently written block `bblock', and it is known that
				647	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				648	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				649	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				650	*/
				651	void write_boundary_block(struct block_device *bdev,
				652	sector_t bblock, unsigned blocksize)
				653	{
				654	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				655	if (bh) {
				656	if (buffer_dirty(bh))
				657	ll_rw_block(WRITE, 1, &bh);
				658	put_bh(bh);
				659	}
				660	}
				661
				662	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				663	{
				664	struct address_space *mapping = inode->i_mapping;
				665	struct address_space *buffer_mapping = bh->b_page->mapping;
				666
				667	mark_buffer_dirty(bh);
				668	if (!mapping->assoc_mapping) {
				669	mapping->assoc_mapping = buffer_mapping;
				670	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	671	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	672	}
				673	if (list_empty(&bh->b_assoc_buffers)) {
				674	spin_lock(&buffer_mapping->private_lock);
				675	list_move_tail(&bh->b_assoc_buffers,
				676	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	677	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	678	spin_unlock(&buffer_mapping->private_lock);
				679	}
				680	}
				681	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				682
				683	/*
				684	* Add a page to the dirty page list.
				685	*
				686	* It is a sad fact of life that this function is called from several places
				687	* deeply under spinlocking. It may not sleep.
				688	*
				689	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				690	* dirty-state coherency between the page and the buffers. It the page does
				691	* not have buffers then when they are later attached they will all be set
				692	* dirty.
				693	*
				694	* The buffers are dirtied before the page is dirtied. There's a small race
				695	* window in which a writepage caller may see the page cleanness but not the
				696	* buffer dirtiness. That's fine. If this code were to set the page dirty
				697	* before the buffers, a concurrent writepage caller could clear the page dirty
				698	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				699	* page on the dirty page list.
				700	*
				701	* We use private_lock to lock against try_to_free_buffers while using the
				702	* page's buffer list. Also use this to protect against clean buffers being
				703	* added to the page after it was set dirty.
				704	*
				705	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				706	* address_space though.
				707	*/
				708	int __set_page_dirty_buffers(struct page *page)
				709	{
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	710	struct address_space * const mapping = page_mapping(page);
				711
				712	if (unlikely(!mapping))
				713	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	714
				715	spin_lock(&mapping->private_lock);
				716	if (page_has_buffers(page)) {
				717	struct buffer_head *head = page_buffers(page);
				718	struct buffer_head *bh = head;
				719
				720	do {
				721	set_buffer_dirty(bh);
				722	bh = bh->b_this_page;
				723	} while (bh != head);
				724	}
				725	spin_unlock(&mapping->private_lock);
				726
				727	if (!TestSetPageDirty(page)) {
				728	write_lock_irq(&mapping->tree_lock);
				729	if (page->mapping) { /* Race with truncate? */
				730	if (mapping_cap_account_dirty(mapping))
Christoph Lameter	b1e7a8f	2006-06-30 01:55:39 -0700	[diff] [blame]	731	__inc_zone_page_state(page, NR_FILE_DIRTY);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	732	radix_tree_tag_set(&mapping->page_tree,
				733	page_index(page),
				734	PAGECACHE_TAG_DIRTY);
				735	}
				736	write_unlock_irq(&mapping->tree_lock);
				737	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Andrew Morton	4741c9f	2006-03-24 03:18:11 -0800	[diff] [blame]	738	return 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	739	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	740	return 0;
				741	}
				742	EXPORT_SYMBOL(__set_page_dirty_buffers);
				743
				744	/*
				745	* Write out and wait upon a list of buffers.
				746	*
				747	* We have conflicting pressures: we want to make sure that all
				748	* initially dirty buffers get waited on, but that any subsequently
				749	* dirtied buffers don't. After all, we don't want fsync to last
				750	* forever if somebody is actively writing to the file.
				751	*
				752	* Do this in two main stages: first we copy dirty buffers to a
				753	* temporary inode list, queueing the writes as we go. Then we clean
				754	* up, waiting for those writes to complete.
				755	*
				756	* During this second stage, any subsequent updates to the file may end
				757	* up refiling the buffer on the original inode's dirty list again, so
				758	* there is a chance we will end up with a buffer queued for write but
				759	* not yet completed on that list. So, as a final cleanup we go through
				760	* the osync code to catch these locked, dirty buffers without requeuing
				761	* any newly dirty buffers for write.
				762	*/
				763	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				764	{
				765	struct buffer_head *bh;
				766	struct list_head tmp;
				767	int err = 0, err2;
				768
				769	INIT_LIST_HEAD(&tmp);
				770
				771	spin_lock(lock);
				772	while (!list_empty(list)) {
				773	bh = BH_ENTRY(list->next);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	774	__remove_assoc_queue(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	775	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				776	list_add(&bh->b_assoc_buffers, &tmp);
				777	if (buffer_dirty(bh)) {
				778	get_bh(bh);
				779	spin_unlock(lock);
				780	/*
				781	* Ensure any pending I/O completes so that
				782	* ll_rw_block() actually writes the current
				783	* contents - it is a noop if I/O is still in
				784	* flight on potentially older contents.
				785	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	786	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	787	brelse(bh);
				788	spin_lock(lock);
				789	}
				790	}
				791	}
				792
				793	while (!list_empty(&tmp)) {
				794	bh = BH_ENTRY(tmp.prev);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	795	list_del_init(&bh->b_assoc_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	796	get_bh(bh);
				797	spin_unlock(lock);
				798	wait_on_buffer(bh);
				799	if (!buffer_uptodate(bh))
				800	err = -EIO;
				801	brelse(bh);
				802	spin_lock(lock);
				803	}
				804
				805	spin_unlock(lock);
				806	err2 = osync_buffers_list(lock, list);
				807	if (err)
				808	return err;
				809	else
				810	return err2;
				811	}
				812
				813	/*
				814	* Invalidate any and all dirty buffers on a given inode. We are
				815	* probably unmounting the fs, but that doesn't mean we have already
				816	* done a sync(). Just drop the buffers from the inode list.
				817	*
				818	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				819	* assumes that all the buffers are against the blockdev. Not true
				820	* for reiserfs.
				821	*/
				822	void invalidate_inode_buffers(struct inode *inode)
				823	{
				824	if (inode_has_buffers(inode)) {
				825	struct address_space *mapping = &inode->i_data;
				826	struct list_head *list = &mapping->private_list;
				827	struct address_space *buffer_mapping = mapping->assoc_mapping;
				828
				829	spin_lock(&buffer_mapping->private_lock);
				830	while (!list_empty(list))
				831	__remove_assoc_queue(BH_ENTRY(list->next));
				832	spin_unlock(&buffer_mapping->private_lock);
				833	}
				834	}
				835
				836	/*
				837	* Remove any clean buffers from the inode's buffer list. This is called
				838	* when we're trying to free the inode itself. Those buffers can pin it.
				839	*
				840	* Returns true if all buffers were removed.
				841	*/
				842	int remove_inode_buffers(struct inode *inode)
				843	{
				844	int ret = 1;
				845
				846	if (inode_has_buffers(inode)) {
				847	struct address_space *mapping = &inode->i_data;
				848	struct list_head *list = &mapping->private_list;
				849	struct address_space *buffer_mapping = mapping->assoc_mapping;
				850
				851	spin_lock(&buffer_mapping->private_lock);
				852	while (!list_empty(list)) {
				853	struct buffer_head *bh = BH_ENTRY(list->next);
				854	if (buffer_dirty(bh)) {
				855	ret = 0;
				856	break;
				857	}
				858	__remove_assoc_queue(bh);
				859	}
				860	spin_unlock(&buffer_mapping->private_lock);
				861	}
				862	return ret;
				863	}
				864
				865	/*
				866	* Create the appropriate buffers when given a page for data area and
				867	* the size of each buffer.. Use the bh->b_this_page linked list to
				868	* follow the buffers created. Return NULL if unable to create more
				869	* buffers.
				870	*
				871	* The retry flag is used to differentiate async IO (paging, swapping)
				872	* which may not fail from ordinary buffer allocations.
				873	*/
				874	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				875	int retry)
				876	{
				877	struct buffer_head bh, head;
				878	long offset;
				879
				880	try_again:
				881	head = NULL;
				882	offset = PAGE_SIZE;
				883	while ((offset -= size) >= 0) {
				884	bh = alloc_buffer_head(GFP_NOFS);
				885	if (!bh)
				886	goto no_grow;
				887
				888	bh->b_bdev = NULL;
				889	bh->b_this_page = head;
				890	bh->b_blocknr = -1;
				891	head = bh;
				892
				893	bh->b_state = 0;
				894	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	895	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	896	bh->b_size = size;
				897
				898	/* Link the buffer to its page */
				899	set_bh_page(bh, page, offset);
				900
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	901	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	902	}
				903	return head;
				904	/*
				905	* In case anything failed, we just free everything we got.
				906	*/
				907	no_grow:
				908	if (head) {
				909	do {
				910	bh = head;
				911	head = head->b_this_page;
				912	free_buffer_head(bh);
				913	} while (head);
				914	}
				915
				916	/*
				917	* Return failure for non-async IO requests. Async IO requests
				918	* are not allowed to fail, so we have to wait until buffer heads
				919	* become available. But we don't want tasks sleeping with
				920	* partially complete buffers, so all were released above.
				921	*/
				922	if (!retry)
				923	return NULL;
				924
				925	/* We're _really_ low on memory. Now we just
				926	* wait for old buffer heads to become free due to
				927	* finishing IO. Since this is an async request and
				928	* the reserve list is empty, we're sure there are
				929	* async buffer heads in use.
				930	*/
				931	free_more_memory();
				932	goto try_again;
				933	}
				934	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				935
				936	static inline void
				937	link_dev_buffers(struct page page, struct buffer_head head)
				938	{
				939	struct buffer_head bh, tail;
				940
				941	bh = head;
				942	do {
				943	tail = bh;
				944	bh = bh->b_this_page;
				945	} while (bh);
				946	tail->b_this_page = head;
				947	attach_page_buffers(page, head);
				948	}
				949
				950	/*
				951	* Initialise the state of a blockdev page's buffers.
				952	*/
				953	static void
				954	init_page_buffers(struct page page, struct block_device bdev,
				955	sector_t block, int size)
				956	{
				957	struct buffer_head *head = page_buffers(page);
				958	struct buffer_head *bh = head;
				959	int uptodate = PageUptodate(page);
				960
				961	do {
				962	if (!buffer_mapped(bh)) {
				963	init_buffer(bh, NULL, NULL);
				964	bh->b_bdev = bdev;
				965	bh->b_blocknr = block;
				966	if (uptodate)
				967	set_buffer_uptodate(bh);
				968	set_buffer_mapped(bh);
				969	}
				970	block++;
				971	bh = bh->b_this_page;
				972	} while (bh != head);
				973	}
				974
				975	/*
				976	* Create the page-cache page that contains the requested block.
				977	*
				978	* This is user purely for blockdev mappings.
				979	*/
				980	static struct page *
				981	grow_dev_page(struct block_device *bdev, sector_t block,
				982	pgoff_t index, int size)
				983	{
				984	struct inode *inode = bdev->bd_inode;
				985	struct page *page;
				986	struct buffer_head *bh;
				987
				988	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
				989	if (!page)
				990	return NULL;
				991
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	992	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	993
				994	if (page_has_buffers(page)) {
				995	bh = page_buffers(page);
				996	if (bh->b_size == size) {
				997	init_page_buffers(page, bdev, block, size);
				998	return page;
				999	}
				1000	if (!try_to_free_buffers(page))
				1001	goto failed;
				1002	}
				1003
				1004	/*
				1005	* Allocate some buffers for this page
				1006	*/
				1007	bh = alloc_page_buffers(page, size, 0);
				1008	if (!bh)
				1009	goto failed;
				1010
				1011	/*
				1012	* Link the page to the buffers and initialise them. Take the
				1013	* lock to be atomic wrt __find_get_block(), which does not
				1014	* run under the page lock.
				1015	*/
				1016	spin_lock(&inode->i_mapping->private_lock);
				1017	link_dev_buffers(page, bh);
				1018	init_page_buffers(page, bdev, block, size);
				1019	spin_unlock(&inode->i_mapping->private_lock);
				1020	return page;
				1021
				1022	failed:
				1023	BUG();
				1024	unlock_page(page);
				1025	page_cache_release(page);
				1026	return NULL;
				1027	}
				1028
				1029	/*
				1030	* Create buffers for the specified block device block's page. If
				1031	* that page was dirty, the buffers are set dirty also.
				1032	*
				1033	* Except that's a bug. Attaching dirty buffers to a dirty
				1034	* blockdev's page can result in filesystem corruption, because
				1035	* some of those buffers may be aliases of filesystem data.
				1036	* grow_dev_page() will go BUG() if this happens.
				1037	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1038	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1039	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1040	{
				1041	struct page *page;
				1042	pgoff_t index;
				1043	int sizebits;
				1044
				1045	sizebits = -1;
				1046	do {
				1047	sizebits++;
				1048	} while ((size << sizebits) < PAGE_SIZE);
				1049
				1050	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1051
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1052	/*
				1053	* Check for a block which wants to lie outside our maximum possible
				1054	* pagecache index. (this comparison is done using sector_t types).
				1055	*/
				1056	if (unlikely(index != block >> sizebits)) {
				1057	char b[BDEVNAME_SIZE];
				1058
				1059	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1060	"device %s\n",
				1061	__FUNCTION__, (unsigned long long)block,
				1062	bdevname(bdev, b));
				1063	return -EIO;
				1064	}
				1065	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1066	/* Create a page with the proper size buffers.. */
				1067	page = grow_dev_page(bdev, block, index, size);
				1068	if (!page)
				1069	return 0;
				1070	unlock_page(page);
				1071	page_cache_release(page);
				1072	return 1;
				1073	}
				1074
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1075	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1076	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1077	{
				1078	/* Size must be multiple of hard sectorsize */
				1079	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1080	(size < 512 \|\| size > PAGE_SIZE))) {
				1081	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1082	size);
				1083	printk(KERN_ERR "hardsect size: %d\n",
				1084	bdev_hardsect_size(bdev));
				1085
				1086	dump_stack();
				1087	return NULL;
				1088	}
				1089
				1090	for (;;) {
				1091	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1092	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1093
				1094	bh = __find_get_block(bdev, block, size);
				1095	if (bh)
				1096	return bh;
				1097
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1098	ret = grow_buffers(bdev, block, size);
				1099	if (ret < 0)
				1100	return NULL;
				1101	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1102	free_more_memory();
				1103	}
				1104	}
				1105
				1106	/*
				1107	* The relationship between dirty buffers and dirty pages:
				1108	*
				1109	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1110	* the page is tagged dirty in its radix tree.
				1111	*
				1112	* At all times, the dirtiness of the buffers represents the dirtiness of
				1113	* subsections of the page. If the page has buffers, the page dirty bit is
				1114	* merely a hint about the true dirty state.
				1115	*
				1116	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1117	* (if the page has buffers).
				1118	*
				1119	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1120	* buffers are not.
				1121	*
				1122	* Also. When blockdev buffers are explicitly read with bread(), they
				1123	* individually become uptodate. But their backing page remains not
				1124	* uptodate - even if all of its buffers are uptodate. A subsequent
				1125	* block_read_full_page() against that page will discover all the uptodate
				1126	* buffers, will set the page uptodate and will perform no I/O.
				1127	*/
				1128
				1129	/**
				1130	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1131	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1132	*
				1133	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1134	* backing page dirty, then tag the page as dirty in its address_space's radix
				1135	* tree and then attach the address_space's inode to its superblock's dirty
				1136	* inode list.
				1137	*
				1138	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1139	* mapping->tree_lock and the global inode_lock.
				1140	*/
				1141	void fastcall mark_buffer_dirty(struct buffer_head *bh)
				1142	{
				1143	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
				1144	__set_page_dirty_nobuffers(bh->b_page);
				1145	}
				1146
				1147	/*
				1148	* Decrement a buffer_head's reference count. If all buffers against a page
				1149	* have zero reference count, are clean and unlocked, and if the page is clean
				1150	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1151	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1152	* a page but it ends up not being freed, and buffers may later be reattached).
				1153	*/
				1154	void __brelse(struct buffer_head * buf)
				1155	{
				1156	if (atomic_read(&buf->b_count)) {
				1157	put_bh(buf);
				1158	return;
				1159	}
				1160	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1161	WARN_ON(1);
				1162	}
				1163
				1164	/*
				1165	* bforget() is like brelse(), except it discards any
				1166	* potentially dirty data.
				1167	*/
				1168	void __bforget(struct buffer_head *bh)
				1169	{
				1170	clear_buffer_dirty(bh);
				1171	if (!list_empty(&bh->b_assoc_buffers)) {
				1172	struct address_space *buffer_mapping = bh->b_page->mapping;
				1173
				1174	spin_lock(&buffer_mapping->private_lock);
				1175	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1176	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1177	spin_unlock(&buffer_mapping->private_lock);
				1178	}
				1179	__brelse(bh);
				1180	}
				1181
				1182	static struct buffer_head __bread_slow(struct buffer_head bh)
				1183	{
				1184	lock_buffer(bh);
				1185	if (buffer_uptodate(bh)) {
				1186	unlock_buffer(bh);
				1187	return bh;
				1188	} else {
				1189	get_bh(bh);
				1190	bh->b_end_io = end_buffer_read_sync;
				1191	submit_bh(READ, bh);
				1192	wait_on_buffer(bh);
				1193	if (buffer_uptodate(bh))
				1194	return bh;
				1195	}
				1196	brelse(bh);
				1197	return NULL;
				1198	}
				1199
				1200	/*
				1201	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1202	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1203	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1204	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1205	* CPU's LRUs at the same time.
				1206	*
				1207	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1208	* sb_find_get_block().
				1209	*
				1210	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1211	* a local interrupt disable for that.
				1212	*/
				1213
				1214	#define BH_LRU_SIZE 8
				1215
				1216	struct bh_lru {
				1217	struct buffer_head *bhs[BH_LRU_SIZE];
				1218	};
				1219
				1220	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1221
				1222	#ifdef CONFIG_SMP
				1223	#define bh_lru_lock() local_irq_disable()
				1224	#define bh_lru_unlock() local_irq_enable()
				1225	#else
				1226	#define bh_lru_lock() preempt_disable()
				1227	#define bh_lru_unlock() preempt_enable()
				1228	#endif
				1229
				1230	static inline void check_irqs_on(void)
				1231	{
				1232	#ifdef irqs_disabled
				1233	BUG_ON(irqs_disabled());
				1234	#endif
				1235	}
				1236
				1237	/*
				1238	* The LRU management algorithm is dopey-but-simple. Sorry.
				1239	*/
				1240	static void bh_lru_install(struct buffer_head *bh)
				1241	{
				1242	struct buffer_head *evictee = NULL;
				1243	struct bh_lru *lru;
				1244
				1245	check_irqs_on();
				1246	bh_lru_lock();
				1247	lru = &__get_cpu_var(bh_lrus);
				1248	if (lru->bhs[0] != bh) {
				1249	struct buffer_head *bhs[BH_LRU_SIZE];
				1250	int in;
				1251	int out = 0;
				1252
				1253	get_bh(bh);
				1254	bhs[out++] = bh;
				1255	for (in = 0; in < BH_LRU_SIZE; in++) {
				1256	struct buffer_head *bh2 = lru->bhs[in];
				1257
				1258	if (bh2 == bh) {
				1259	__brelse(bh2);
				1260	} else {
				1261	if (out >= BH_LRU_SIZE) {
				1262	BUG_ON(evictee != NULL);
				1263	evictee = bh2;
				1264	} else {
				1265	bhs[out++] = bh2;
				1266	}
				1267	}
				1268	}
				1269	while (out < BH_LRU_SIZE)
				1270	bhs[out++] = NULL;
				1271	memcpy(lru->bhs, bhs, sizeof(bhs));
				1272	}
				1273	bh_lru_unlock();
				1274
				1275	if (evictee)
				1276	__brelse(evictee);
				1277	}
				1278
				1279	/*
				1280	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1281	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1282	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1283	lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
				1284	{
				1285	struct buffer_head *ret = NULL;
				1286	struct bh_lru *lru;
				1287	int i;
				1288
				1289	check_irqs_on();
				1290	bh_lru_lock();
				1291	lru = &__get_cpu_var(bh_lrus);
				1292	for (i = 0; i < BH_LRU_SIZE; i++) {
				1293	struct buffer_head *bh = lru->bhs[i];
				1294
				1295	if (bh && bh->b_bdev == bdev &&
				1296	bh->b_blocknr == block && bh->b_size == size) {
				1297	if (i) {
				1298	while (i) {
				1299	lru->bhs[i] = lru->bhs[i - 1];
				1300	i--;
				1301	}
				1302	lru->bhs[0] = bh;
				1303	}
				1304	get_bh(bh);
				1305	ret = bh;
				1306	break;
				1307	}
				1308	}
				1309	bh_lru_unlock();
				1310	return ret;
				1311	}
				1312
				1313	/*
				1314	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1315	* it in the LRU and mark it as accessed. If it is not present then return
				1316	* NULL
				1317	*/
				1318	struct buffer_head *
				1319	__find_get_block(struct block_device *bdev, sector_t block, int size)
				1320	{
				1321	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1322
				1323	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1324	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1325	if (bh)
				1326	bh_lru_install(bh);
				1327	}
				1328	if (bh)
				1329	touch_buffer(bh);
				1330	return bh;
				1331	}
				1332	EXPORT_SYMBOL(__find_get_block);
				1333
				1334	/*
				1335	* __getblk will locate (and, if necessary, create) the buffer_head
				1336	* which corresponds to the passed block_device, block and size. The
				1337	* returned buffer has its reference count incremented.
				1338	*
				1339	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1340	* illegal block number, __getblk() will happily return a buffer_head
				1341	* which represents the non-existent block. Very weird.
				1342	*
				1343	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1344	* attempt is failing. FIXME, perhaps?
				1345	*/
				1346	struct buffer_head *
				1347	__getblk(struct block_device *bdev, sector_t block, int size)
				1348	{
				1349	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1350
				1351	might_sleep();
				1352	if (bh == NULL)
				1353	bh = __getblk_slow(bdev, block, size);
				1354	return bh;
				1355	}
				1356	EXPORT_SYMBOL(__getblk);
				1357
				1358	/*
				1359	* Do async read-ahead on a buffer..
				1360	*/
				1361	void __breadahead(struct block_device *bdev, sector_t block, int size)
				1362	{
				1363	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1364	if (likely(bh)) {
				1365	ll_rw_block(READA, 1, &bh);
				1366	brelse(bh);
				1367	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1368	}
				1369	EXPORT_SYMBOL(__breadahead);
				1370
				1371	/**
				1372	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1373	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1374	* @block: number of block
				1375	* @size: size (in bytes) to read
				1376	*
				1377	* Reads a specified block, and returns buffer head that contains it.
				1378	* It returns NULL if the block was unreadable.
				1379	*/
				1380	struct buffer_head *
				1381	__bread(struct block_device *bdev, sector_t block, int size)
				1382	{
				1383	struct buffer_head *bh = __getblk(bdev, block, size);
				1384
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1385	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1386	bh = __bread_slow(bh);
				1387	return bh;
				1388	}
				1389	EXPORT_SYMBOL(__bread);
				1390
				1391	/*
				1392	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1393	* This doesn't race because it runs in each cpu either in irq
				1394	* or with preempt disabled.
				1395	*/
				1396	static void invalidate_bh_lru(void *arg)
				1397	{
				1398	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1399	int i;
				1400
				1401	for (i = 0; i < BH_LRU_SIZE; i++) {
				1402	brelse(b->bhs[i]);
				1403	b->bhs[i] = NULL;
				1404	}
				1405	put_cpu_var(bh_lrus);
				1406	}
				1407
				1408	static void invalidate_bh_lrus(void)
				1409	{
				1410	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1411	}
				1412
				1413	void set_bh_page(struct buffer_head *bh,
				1414	struct page *page, unsigned long offset)
				1415	{
				1416	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1417	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1418	if (PageHighMem(page))
				1419	/*
				1420	* This catches illegal uses and preserves the offset:
				1421	*/
				1422	bh->b_data = (char *)(0 + offset);
				1423	else
				1424	bh->b_data = page_address(page) + offset;
				1425	}
				1426	EXPORT_SYMBOL(set_bh_page);
				1427
				1428	/*
				1429	* Called when truncating a buffer on a page completely.
				1430	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1431	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1432	{
				1433	lock_buffer(bh);
				1434	clear_buffer_dirty(bh);
				1435	bh->b_bdev = NULL;
				1436	clear_buffer_mapped(bh);
				1437	clear_buffer_req(bh);
				1438	clear_buffer_new(bh);
				1439	clear_buffer_delay(bh);
				1440	unlock_buffer(bh);
				1441	}
				1442
				1443	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1444	* block_invalidatepage - invalidate part of all of a buffer-backed page
				1445	*
				1446	* @page: the page which is affected
				1447	* @offset: the index of the truncation point
				1448	*
				1449	* block_invalidatepage() is called when all or part of the page has become
				1450	* invalidatedby a truncate operation.
				1451	*
				1452	* block_invalidatepage() does not have to release all buffers, but it must
				1453	* ensure that no dirty buffer is left outside @offset and that no I/O
				1454	* is underway against any of the blocks which are outside the truncation
				1455	* point. Because the caller is about to free (and possibly reuse) those
				1456	* blocks on-disk.
				1457	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1458	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1459	{
				1460	struct buffer_head head, bh, *next;
				1461	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1462
				1463	BUG_ON(!PageLocked(page));
				1464	if (!page_has_buffers(page))
				1465	goto out;
				1466
				1467	head = page_buffers(page);
				1468	bh = head;
				1469	do {
				1470	unsigned int next_off = curr_off + bh->b_size;
				1471	next = bh->b_this_page;
				1472
				1473	/*
				1474	* is this block fully invalidated?
				1475	*/
				1476	if (offset <= curr_off)
				1477	discard_buffer(bh);
				1478	curr_off = next_off;
				1479	bh = next;
				1480	} while (bh != head);
				1481
				1482	/*
				1483	* We release buffers only if the entire page is being invalidated.
				1484	* The get_block cached value has been unconditionally invalidated,
				1485	* so real IO is not possible anymore.
				1486	*/
				1487	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1488	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1489	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1490	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1491	}
				1492	EXPORT_SYMBOL(block_invalidatepage);
				1493
				1494	/*
				1495	* We attach and possibly dirty the buffers atomically wrt
				1496	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1497	* is already excluded via the page lock.
				1498	*/
				1499	void create_empty_buffers(struct page *page,
				1500	unsigned long blocksize, unsigned long b_state)
				1501	{
				1502	struct buffer_head bh, head, *tail;
				1503
				1504	head = alloc_page_buffers(page, blocksize, 1);
				1505	bh = head;
				1506	do {
				1507	bh->b_state \|= b_state;
				1508	tail = bh;
				1509	bh = bh->b_this_page;
				1510	} while (bh);
				1511	tail->b_this_page = head;
				1512
				1513	spin_lock(&page->mapping->private_lock);
				1514	if (PageUptodate(page) \|\| PageDirty(page)) {
				1515	bh = head;
				1516	do {
				1517	if (PageDirty(page))
				1518	set_buffer_dirty(bh);
				1519	if (PageUptodate(page))
				1520	set_buffer_uptodate(bh);
				1521	bh = bh->b_this_page;
				1522	} while (bh != head);
				1523	}
				1524	attach_page_buffers(page, head);
				1525	spin_unlock(&page->mapping->private_lock);
				1526	}
				1527	EXPORT_SYMBOL(create_empty_buffers);
				1528
				1529	/*
				1530	* We are taking a block for data and we don't want any output from any
				1531	* buffer-cache aliases starting from return from that function and
				1532	* until the moment when something will explicitly mark the buffer
				1533	* dirty (hopefully that will not happen until we will free that block ;-)
				1534	* We don't even need to mark it not-uptodate - nobody can expect
				1535	* anything from a newly allocated buffer anyway. We used to used
				1536	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1537	* don't want to mark the alias unmapped, for example - it would confuse
				1538	* anyone who might pick it with bread() afterwards...
				1539	*
				1540	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1541	* be writeout I/O going on against recently-freed buffers. We don't
				1542	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1543	* only if we really need to. That happens here.
				1544	*/
				1545	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1546	{
				1547	struct buffer_head *old_bh;
				1548
				1549	might_sleep();
				1550
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1551	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1552	if (old_bh) {
				1553	clear_buffer_dirty(old_bh);
				1554	wait_on_buffer(old_bh);
				1555	clear_buffer_req(old_bh);
				1556	__brelse(old_bh);
				1557	}
				1558	}
				1559	EXPORT_SYMBOL(unmap_underlying_metadata);
				1560
				1561	/*
				1562	* NOTE! All mapped/uptodate combinations are valid:
				1563	*
				1564	* Mapped Uptodate Meaning
				1565	*
				1566	* No No "unknown" - must do get_block()
				1567	* No Yes "hole" - zero-filled
				1568	* Yes No "allocated" - allocated on disk, not read in
				1569	* Yes Yes "valid" - allocated and up-to-date in memory.
				1570	*
				1571	* "Dirty" is valid only with the last case (mapped+uptodate).
				1572	*/
				1573
				1574	/*
				1575	* While block_write_full_page is writing back the dirty buffers under
				1576	* the page lock, whoever dirtied the buffers may decide to clean them
				1577	* again at any time. We handle that by only looking at the buffer
				1578	* state inside lock_buffer().
				1579	*
				1580	* If block_write_full_page() is called for regular writeback
				1581	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1582	* locked buffer. This only can happen if someone has written the buffer
				1583	* directly, with submit_bh(). At the address_space level PageWriteback
				1584	* prevents this contention from occurring.
				1585	*/
				1586	static int __block_write_full_page(struct inode inode, struct page page,
				1587	get_block_t get_block, struct writeback_control wbc)
				1588	{
				1589	int err;
				1590	sector_t block;
				1591	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1592	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1593	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1594	int nr_underway = 0;
				1595
				1596	BUG_ON(!PageLocked(page));
				1597
				1598	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1599
				1600	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1601	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1602	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1603	}
				1604
				1605	/*
				1606	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1607	* here, and the (potentially unmapped) buffers may become dirty at
				1608	* any time. If a buffer becomes dirty here after we've inspected it
				1609	* then we just miss that fact, and the page stays dirty.
				1610	*
				1611	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1612	* handle that here by just cleaning them.
				1613	*/
				1614
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1615	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1616	head = page_buffers(page);
				1617	bh = head;
				1618
				1619	/*
				1620	* Get all the dirty buffers mapped to disk addresses and
				1621	* handle any aliases from the underlying blockdev's mapping.
				1622	*/
				1623	do {
				1624	if (block > last_block) {
				1625	/*
				1626	* mapped buffers outside i_size will occur, because
				1627	* this page can be outside i_size when there is a
				1628	* truncate in progress.
				1629	*/
				1630	/*
				1631	* The buffer was zeroed by block_write_full_page()
				1632	*/
				1633	clear_buffer_dirty(bh);
				1634	set_buffer_uptodate(bh);
				1635	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1636	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1637	err = get_block(inode, block, bh, 1);
				1638	if (err)
				1639	goto recover;
				1640	if (buffer_new(bh)) {
				1641	/* blockdev mappings never come here */
				1642	clear_buffer_new(bh);
				1643	unmap_underlying_metadata(bh->b_bdev,
				1644	bh->b_blocknr);
				1645	}
				1646	}
				1647	bh = bh->b_this_page;
				1648	block++;
				1649	} while (bh != head);
				1650
				1651	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1652	if (!buffer_mapped(bh))
				1653	continue;
				1654	/*
				1655	* If it's a fully non-blocking write attempt and we cannot
				1656	* lock the buffer then redirty the page. Note that this can
				1657	* potentially cause a busy-wait loop from pdflush and kswapd
				1658	* activity, but those code paths have their own higher-level
				1659	* throttling.
				1660	*/
				1661	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1662	lock_buffer(bh);
				1663	} else if (test_set_buffer_locked(bh)) {
				1664	redirty_page_for_writepage(wbc, page);
				1665	continue;
				1666	}
				1667	if (test_clear_buffer_dirty(bh)) {
				1668	mark_buffer_async_write(bh);
				1669	} else {
				1670	unlock_buffer(bh);
				1671	}
				1672	} while ((bh = bh->b_this_page) != head);
				1673
				1674	/*
				1675	* The page and its buffers are protected by PageWriteback(), so we can
				1676	* drop the bh refcounts early.
				1677	*/
				1678	BUG_ON(PageWriteback(page));
				1679	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1680
				1681	do {
				1682	struct buffer_head *next = bh->b_this_page;
				1683	if (buffer_async_write(bh)) {
				1684	submit_bh(WRITE, bh);
				1685	nr_underway++;
				1686	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1687	bh = next;
				1688	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1689	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1690
				1691	err = 0;
				1692	done:
				1693	if (nr_underway == 0) {
				1694	/*
				1695	* The page was marked dirty, but the buffers were
				1696	* clean. Someone wrote them back by hand with
				1697	* ll_rw_block/submit_bh. A rare case.
				1698	*/
				1699	int uptodate = 1;
				1700	do {
				1701	if (!buffer_uptodate(bh)) {
				1702	uptodate = 0;
				1703	break;
				1704	}
				1705	bh = bh->b_this_page;
				1706	} while (bh != head);
				1707	if (uptodate)
				1708	SetPageUptodate(page);
				1709	end_page_writeback(page);
				1710	/*
				1711	* The page and buffer_heads can be released at any time from
				1712	* here on.
				1713	*/
				1714	wbc->pages_skipped++; /* We didn't write this page */
				1715	}
				1716	return err;
				1717
				1718	recover:
				1719	/*
				1720	* ENOSPC, or some other error. We may already have added some
				1721	* blocks to the file, so we need to write these out to avoid
				1722	* exposing stale data.
				1723	* The page is currently locked and not marked for writeback
				1724	*/
				1725	bh = head;
				1726	/* Recovery: lock and submit the mapped buffers */
				1727	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1728	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1729	lock_buffer(bh);
				1730	mark_buffer_async_write(bh);
				1731	} else {
				1732	/*
				1733	* The buffer may have been set dirty during
				1734	* attachment to a dirty page.
				1735	*/
				1736	clear_buffer_dirty(bh);
				1737	}
				1738	} while ((bh = bh->b_this_page) != head);
				1739	SetPageError(page);
				1740	BUG_ON(PageWriteback(page));
				1741	set_page_writeback(page);
				1742	unlock_page(page);
				1743	do {
				1744	struct buffer_head *next = bh->b_this_page;
				1745	if (buffer_async_write(bh)) {
				1746	clear_buffer_dirty(bh);
				1747	submit_bh(WRITE, bh);
				1748	nr_underway++;
				1749	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1750	bh = next;
				1751	} while (bh != head);
				1752	goto done;
				1753	}
				1754
				1755	static int __block_prepare_write(struct inode inode, struct page page,
				1756	unsigned from, unsigned to, get_block_t *get_block)
				1757	{
				1758	unsigned block_start, block_end;
				1759	sector_t block;
				1760	int err = 0;
				1761	unsigned blocksize, bbits;
				1762	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1763
				1764	BUG_ON(!PageLocked(page));
				1765	BUG_ON(from > PAGE_CACHE_SIZE);
				1766	BUG_ON(to > PAGE_CACHE_SIZE);
				1767	BUG_ON(from > to);
				1768
				1769	blocksize = 1 << inode->i_blkbits;
				1770	if (!page_has_buffers(page))
				1771	create_empty_buffers(page, blocksize, 0);
				1772	head = page_buffers(page);
				1773
				1774	bbits = inode->i_blkbits;
				1775	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1776
				1777	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1778	block++, block_start=block_end, bh = bh->b_this_page) {
				1779	block_end = block_start + blocksize;
				1780	if (block_end <= from \|\| block_start >= to) {
				1781	if (PageUptodate(page)) {
				1782	if (!buffer_uptodate(bh))
				1783	set_buffer_uptodate(bh);
				1784	}
				1785	continue;
				1786	}
				1787	if (buffer_new(bh))
				1788	clear_buffer_new(bh);
				1789	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1790	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1791	err = get_block(inode, block, bh, 1);
				1792	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1793	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1794	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1795	unmap_underlying_metadata(bh->b_bdev,
				1796	bh->b_blocknr);
				1797	if (PageUptodate(page)) {
				1798	set_buffer_uptodate(bh);
				1799	continue;
				1800	}
				1801	if (block_end > to \|\| block_start < from) {
				1802	void *kaddr;
				1803
				1804	kaddr = kmap_atomic(page, KM_USER0);
				1805	if (block_end > to)
				1806	memset(kaddr+to, 0,
				1807	block_end-to);
				1808	if (block_start < from)
				1809	memset(kaddr+block_start,
				1810	0, from-block_start);
				1811	flush_dcache_page(page);
				1812	kunmap_atomic(kaddr, KM_USER0);
				1813	}
				1814	continue;
				1815	}
				1816	}
				1817	if (PageUptodate(page)) {
				1818	if (!buffer_uptodate(bh))
				1819	set_buffer_uptodate(bh);
				1820	continue;
				1821	}
				1822	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
				1823	(block_start < from \|\| block_end > to)) {
				1824	ll_rw_block(READ, 1, &bh);
				1825	*wait_bh++=bh;
				1826	}
				1827	}
				1828	/*
				1829	* If we issued read requests - let them complete.
				1830	*/
				1831	while(wait_bh > wait) {
				1832	wait_on_buffer(*--wait_bh);
				1833	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1834	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1835	}
Anton Altaparmakov	152becd	2005-06-23 00:10:21 -0700	[diff] [blame]	1836	if (!err) {
				1837	bh = head;
				1838	do {
				1839	if (buffer_new(bh))
				1840	clear_buffer_new(bh);
				1841	} while ((bh = bh->b_this_page) != head);
				1842	return 0;
				1843	}
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1844	/* Error case: */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1845	/*
				1846	* Zero out any newly allocated blocks to avoid exposing stale
				1847	* data. If BH_New is set, we know that the block was newly
				1848	* allocated in the above loop.
				1849	*/
				1850	bh = head;
				1851	block_start = 0;
				1852	do {
				1853	block_end = block_start+blocksize;
				1854	if (block_end <= from)
				1855	goto next_bh;
				1856	if (block_start >= to)
				1857	break;
				1858	if (buffer_new(bh)) {
				1859	void *kaddr;
				1860
				1861	clear_buffer_new(bh);
				1862	kaddr = kmap_atomic(page, KM_USER0);
				1863	memset(kaddr+block_start, 0, bh->b_size);
Monakhov Dmitriy	8c58165	2006-10-11 01:22:00 -0700	[diff] [blame]	1864	flush_dcache_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1865	kunmap_atomic(kaddr, KM_USER0);
				1866	set_buffer_uptodate(bh);
				1867	mark_buffer_dirty(bh);
				1868	}
				1869	next_bh:
				1870	block_start = block_end;
				1871	bh = bh->b_this_page;
				1872	} while (bh != head);
				1873	return err;
				1874	}
				1875
				1876	static int __block_commit_write(struct inode inode, struct page page,
				1877	unsigned from, unsigned to)
				1878	{
				1879	unsigned block_start, block_end;
				1880	int partial = 0;
				1881	unsigned blocksize;
				1882	struct buffer_head bh, head;
				1883
				1884	blocksize = 1 << inode->i_blkbits;
				1885
				1886	for(bh = head = page_buffers(page), block_start = 0;
				1887	bh != head \|\| !block_start;
				1888	block_start=block_end, bh = bh->b_this_page) {
				1889	block_end = block_start + blocksize;
				1890	if (block_end <= from \|\| block_start >= to) {
				1891	if (!buffer_uptodate(bh))
				1892	partial = 1;
				1893	} else {
				1894	set_buffer_uptodate(bh);
				1895	mark_buffer_dirty(bh);
				1896	}
				1897	}
				1898
				1899	/*
				1900	* If this is a partial write which happened to make all buffers
				1901	* uptodate then we can optimize away a bogus readpage() for
				1902	* the next read(). Here we 'discover' whether the page went
				1903	* uptodate as a result of this (potentially partial) write.
				1904	*/
				1905	if (!partial)
				1906	SetPageUptodate(page);
				1907	return 0;
				1908	}
				1909
				1910	/*
				1911	* Generic "read page" function for block devices that have the normal
				1912	* get_block functionality. This is most of the block device filesystems.
				1913	* Reads the page asynchronously --- the unlock_buffer() and
				1914	* set/clear_buffer_uptodate() functions propagate buffer state into the
				1915	* page struct once IO has completed.
				1916	*/
				1917	int block_read_full_page(struct page page, get_block_t get_block)
				1918	{
				1919	struct inode *inode = page->mapping->host;
				1920	sector_t iblock, lblock;
				1921	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				1922	unsigned int blocksize;
				1923	int nr, i;
				1924	int fully_mapped = 1;
				1925
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	1926	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1927	blocksize = 1 << inode->i_blkbits;
				1928	if (!page_has_buffers(page))
				1929	create_empty_buffers(page, blocksize, 0);
				1930	head = page_buffers(page);
				1931
				1932	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				1933	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				1934	bh = head;
				1935	nr = 0;
				1936	i = 0;
				1937
				1938	do {
				1939	if (buffer_uptodate(bh))
				1940	continue;
				1941
				1942	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1943	int err = 0;
				1944
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1945	fully_mapped = 0;
				1946	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1947	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1948	err = get_block(inode, iblock, bh, 0);
				1949	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1950	SetPageError(page);
				1951	}
				1952	if (!buffer_mapped(bh)) {
				1953	void *kaddr = kmap_atomic(page, KM_USER0);
				1954	memset(kaddr + i * blocksize, 0, blocksize);
				1955	flush_dcache_page(page);
				1956	kunmap_atomic(kaddr, KM_USER0);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1957	if (!err)
				1958	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1959	continue;
				1960	}
				1961	/*
				1962	* get_block() might have updated the buffer
				1963	* synchronously
				1964	*/
				1965	if (buffer_uptodate(bh))
				1966	continue;
				1967	}
				1968	arr[nr++] = bh;
				1969	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				1970
				1971	if (fully_mapped)
				1972	SetPageMappedToDisk(page);
				1973
				1974	if (!nr) {
				1975	/*
				1976	* All buffers are uptodate - we can set the page uptodate
				1977	* as well. But not if get_block() returned an error.
				1978	*/
				1979	if (!PageError(page))
				1980	SetPageUptodate(page);
				1981	unlock_page(page);
				1982	return 0;
				1983	}
				1984
				1985	/* Stage two: lock the buffers */
				1986	for (i = 0; i < nr; i++) {
				1987	bh = arr[i];
				1988	lock_buffer(bh);
				1989	mark_buffer_async_read(bh);
				1990	}
				1991
				1992	/*
				1993	* Stage 3: start the IO. Check for uptodateness
				1994	* inside the buffer lock in case another process reading
				1995	* the underlying blockdev brought it uptodate (the sct fix).
				1996	*/
				1997	for (i = 0; i < nr; i++) {
				1998	bh = arr[i];
				1999	if (buffer_uptodate(bh))
				2000	end_buffer_async_read(bh, 1);
				2001	else
				2002	submit_bh(READ, bh);
				2003	}
				2004	return 0;
				2005	}
				2006
				2007	/* utility function for filesystems that need to do work on expanding
				2008	* truncates. Uses prepare/commit_write to allow the filesystem to
				2009	* deal with the hole.
				2010	*/
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2011	static int __generic_cont_expand(struct inode *inode, loff_t size,
				2012	pgoff_t index, unsigned int offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2013	{
				2014	struct address_space *mapping = inode->i_mapping;
				2015	struct page *page;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2016	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2017	int err;
				2018
				2019	err = -EFBIG;
				2020	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2021	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2022	send_sig(SIGXFSZ, current, 0);
				2023	goto out;
				2024	}
				2025	if (size > inode->i_sb->s_maxbytes)
				2026	goto out;
				2027
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2028	err = -ENOMEM;
				2029	page = grab_cache_page(mapping, index);
				2030	if (!page)
				2031	goto out;
				2032	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2033	if (err) {
				2034	/*
				2035	* ->prepare_write() may have instantiated a few blocks
				2036	* outside i_size. Trim these off again.
				2037	*/
				2038	unlock_page(page);
				2039	page_cache_release(page);
				2040	vmtruncate(inode, inode->i_size);
				2041	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2042	}
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2043
				2044	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
				2045
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2046	unlock_page(page);
				2047	page_cache_release(page);
				2048	if (err > 0)
				2049	err = 0;
				2050	out:
				2051	return err;
				2052	}
				2053
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2054	int generic_cont_expand(struct inode *inode, loff_t size)
				2055	{
				2056	pgoff_t index;
				2057	unsigned int offset;
				2058
				2059	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
				2060
				2061	/* ugh. in prepare/commit_write, if from==to==start of block, we
				2062	** skip the prepare. make sure we never send an offset for the start
				2063	** of a block
				2064	*/
				2065	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				2066	/* caller must handle this extra byte. */
				2067	offset++;
				2068	}
				2069	index = size >> PAGE_CACHE_SHIFT;
				2070
				2071	return __generic_cont_expand(inode, size, index, offset);
				2072	}
				2073
				2074	int generic_cont_expand_simple(struct inode *inode, loff_t size)
				2075	{
				2076	loff_t pos = size - 1;
				2077	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
				2078	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
				2079
				2080	/* prepare/commit_write can handle even if from==to==start of block. */
				2081	return __generic_cont_expand(inode, size, index, offset);
				2082	}
				2083
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2084	/*
				2085	* For moronic filesystems that do not allow holes in file.
				2086	* We may have to extend the file.
				2087	*/
				2088
				2089	int cont_prepare_write(struct page *page, unsigned offset,
				2090	unsigned to, get_block_t get_block, loff_t bytes)
				2091	{
				2092	struct address_space *mapping = page->mapping;
				2093	struct inode *inode = mapping->host;
				2094	struct page *new_page;
				2095	pgoff_t pgpos;
				2096	long status;
				2097	unsigned zerofrom;
				2098	unsigned blocksize = 1 << inode->i_blkbits;
				2099	void *kaddr;
				2100
				2101	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
				2102	status = -ENOMEM;
				2103	new_page = grab_cache_page(mapping, pgpos);
				2104	if (!new_page)
				2105	goto out;
				2106	/* we might sleep */
				2107	if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
				2108	unlock_page(new_page);
				2109	page_cache_release(new_page);
				2110	continue;
				2111	}
				2112	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2113	if (zerofrom & (blocksize-1)) {
				2114	*bytes \|= (blocksize-1);
				2115	(*bytes)++;
				2116	}
				2117	status = __block_prepare_write(inode, new_page, zerofrom,
				2118	PAGE_CACHE_SIZE, get_block);
				2119	if (status)
				2120	goto out_unmap;
				2121	kaddr = kmap_atomic(new_page, KM_USER0);
				2122	memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
				2123	flush_dcache_page(new_page);
				2124	kunmap_atomic(kaddr, KM_USER0);
				2125	generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
				2126	unlock_page(new_page);
				2127	page_cache_release(new_page);
				2128	}
				2129
				2130	if (page->index < pgpos) {
				2131	/* completely inside the area */
				2132	zerofrom = offset;
				2133	} else {
				2134	/* page covers the boundary, find the boundary offset */
				2135	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2136
				2137	/* if we will expand the thing last block will be filled */
				2138	if (to > zerofrom && (zerofrom & (blocksize-1))) {
				2139	*bytes \|= (blocksize-1);
				2140	(*bytes)++;
				2141	}
				2142
				2143	/* starting below the boundary? Nothing to zero out */
				2144	if (offset <= zerofrom)
				2145	zerofrom = offset;
				2146	}
				2147	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
				2148	if (status)
				2149	goto out1;
				2150	if (zerofrom < offset) {
				2151	kaddr = kmap_atomic(page, KM_USER0);
				2152	memset(kaddr+zerofrom, 0, offset-zerofrom);
				2153	flush_dcache_page(page);
				2154	kunmap_atomic(kaddr, KM_USER0);
				2155	__block_commit_write(inode, page, zerofrom, offset);
				2156	}
				2157	return 0;
				2158	out1:
				2159	ClearPageUptodate(page);
				2160	return status;
				2161
				2162	out_unmap:
				2163	ClearPageUptodate(new_page);
				2164	unlock_page(new_page);
				2165	page_cache_release(new_page);
				2166	out:
				2167	return status;
				2168	}
				2169
				2170	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2171	get_block_t *get_block)
				2172	{
				2173	struct inode *inode = page->mapping->host;
				2174	int err = __block_prepare_write(inode, page, from, to, get_block);
				2175	if (err)
				2176	ClearPageUptodate(page);
				2177	return err;
				2178	}
				2179
				2180	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2181	{
				2182	struct inode *inode = page->mapping->host;
				2183	__block_commit_write(inode,page,from,to);
				2184	return 0;
				2185	}
				2186
				2187	int generic_commit_write(struct file file, struct page page,
				2188	unsigned from, unsigned to)
				2189	{
				2190	struct inode *inode = page->mapping->host;
				2191	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2192	__block_commit_write(inode,page,from,to);
				2193	/*
				2194	* No need to use i_size_read() here, the i_size
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	2195	* cannot change under us because we hold i_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2196	*/
				2197	if (pos > inode->i_size) {
				2198	i_size_write(inode, pos);
				2199	mark_inode_dirty(inode);
				2200	}
				2201	return 0;
				2202	}
				2203
				2204
				2205	/*
				2206	* nobh_prepare_write()'s prereads are special: the buffer_heads are freed
				2207	* immediately, while under the page lock. So it needs a special end_io
				2208	* handler which does not touch the bh after unlocking it.
				2209	*
				2210	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				2211	* a race there is benign: unlock_buffer() only use the bh's address for
				2212	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				2213	* itself.
				2214	*/
				2215	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2216	{
				2217	if (uptodate) {
				2218	set_buffer_uptodate(bh);
				2219	} else {
				2220	/* This happens, due to failed READA attempts. */
				2221	clear_buffer_uptodate(bh);
				2222	}
				2223	unlock_buffer(bh);
				2224	}
				2225
				2226	/*
				2227	* On entry, the page is fully not uptodate.
				2228	* On exit the page is fully uptodate in the areas outside (from,to)
				2229	*/
				2230	int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
				2231	get_block_t *get_block)
				2232	{
				2233	struct inode *inode = page->mapping->host;
				2234	const unsigned blkbits = inode->i_blkbits;
				2235	const unsigned blocksize = 1 << blkbits;
				2236	struct buffer_head map_bh;
				2237	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
				2238	unsigned block_in_page;
				2239	unsigned block_start;
				2240	sector_t block_in_file;
				2241	char *kaddr;
				2242	int nr_reads = 0;
				2243	int i;
				2244	int ret = 0;
				2245	int is_mapped_to_disk = 1;
				2246	int dirtied_it = 0;
				2247
				2248	if (PageMappedToDisk(page))
				2249	return 0;
				2250
				2251	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
				2252	map_bh.b_page = page;
				2253
				2254	/*
				2255	* We loop across all blocks in the page, whether or not they are
				2256	* part of the affected region. This is so we can discover if the
				2257	* page is fully mapped-to-disk.
				2258	*/
				2259	for (block_start = 0, block_in_page = 0;
				2260	block_start < PAGE_CACHE_SIZE;
				2261	block_in_page++, block_start += blocksize) {
				2262	unsigned block_end = block_start + blocksize;
				2263	int create;
				2264
				2265	map_bh.b_state = 0;
				2266	create = 1;
				2267	if (block_start >= to)
				2268	create = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2269	map_bh.b_size = blocksize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2270	ret = get_block(inode, block_in_file + block_in_page,
				2271	&map_bh, create);
				2272	if (ret)
				2273	goto failed;
				2274	if (!buffer_mapped(&map_bh))
				2275	is_mapped_to_disk = 0;
				2276	if (buffer_new(&map_bh))
				2277	unmap_underlying_metadata(map_bh.b_bdev,
				2278	map_bh.b_blocknr);
				2279	if (PageUptodate(page))
				2280	continue;
				2281	if (buffer_new(&map_bh) \|\| !buffer_mapped(&map_bh)) {
				2282	kaddr = kmap_atomic(page, KM_USER0);
				2283	if (block_start < from) {
				2284	memset(kaddr+block_start, 0, from-block_start);
				2285	dirtied_it = 1;
				2286	}
				2287	if (block_end > to) {
				2288	memset(kaddr + to, 0, block_end - to);
				2289	dirtied_it = 1;
				2290	}
				2291	flush_dcache_page(page);
				2292	kunmap_atomic(kaddr, KM_USER0);
				2293	continue;
				2294	}
				2295	if (buffer_uptodate(&map_bh))
				2296	continue; /* reiserfs does this */
				2297	if (block_start < from \|\| block_end > to) {
				2298	struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
				2299
				2300	if (!bh) {
				2301	ret = -ENOMEM;
				2302	goto failed;
				2303	}
				2304	bh->b_state = map_bh.b_state;
				2305	atomic_set(&bh->b_count, 0);
				2306	bh->b_this_page = NULL;
				2307	bh->b_page = page;
				2308	bh->b_blocknr = map_bh.b_blocknr;
				2309	bh->b_size = blocksize;
				2310	bh->b_data = (char *)(long)block_start;
				2311	bh->b_bdev = map_bh.b_bdev;
				2312	bh->b_private = NULL;
				2313	read_bh[nr_reads++] = bh;
				2314	}
				2315	}
				2316
				2317	if (nr_reads) {
				2318	struct buffer_head *bh;
				2319
				2320	/*
				2321	* The page is locked, so these buffers are protected from
				2322	* any VM or truncate activity. Hence we don't need to care
				2323	* for the buffer_head refcounts.
				2324	*/
				2325	for (i = 0; i < nr_reads; i++) {
				2326	bh = read_bh[i];
				2327	lock_buffer(bh);
				2328	bh->b_end_io = end_buffer_read_nobh;
				2329	submit_bh(READ, bh);
				2330	}
				2331	for (i = 0; i < nr_reads; i++) {
				2332	bh = read_bh[i];
				2333	wait_on_buffer(bh);
				2334	if (!buffer_uptodate(bh))
				2335	ret = -EIO;
				2336	free_buffer_head(bh);
				2337	read_bh[i] = NULL;
				2338	}
				2339	if (ret)
				2340	goto failed;
				2341	}
				2342
				2343	if (is_mapped_to_disk)
				2344	SetPageMappedToDisk(page);
				2345	SetPageUptodate(page);
				2346
				2347	/*
				2348	* Setting the page dirty here isn't necessary for the prepare_write
				2349	* function - commit_write will do that. But if/when this function is
				2350	* used within the pagefault handler to ensure that all mmapped pages
				2351	* have backing space in the filesystem, we will need to dirty the page
				2352	* if its contents were altered.
				2353	*/
				2354	if (dirtied_it)
				2355	set_page_dirty(page);
				2356
				2357	return 0;
				2358
				2359	failed:
				2360	for (i = 0; i < nr_reads; i++) {
				2361	if (read_bh[i])
				2362	free_buffer_head(read_bh[i]);
				2363	}
				2364
				2365	/*
				2366	* Error recovery is pretty slack. Clear the page and mark it dirty
				2367	* so we'll later zero out any blocks which _were_ allocated.
				2368	*/
				2369	kaddr = kmap_atomic(page, KM_USER0);
				2370	memset(kaddr, 0, PAGE_CACHE_SIZE);
Monakhov Dmitriy	8c58165	2006-10-11 01:22:00 -0700	[diff] [blame]	2371	flush_dcache_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2372	kunmap_atomic(kaddr, KM_USER0);
				2373	SetPageUptodate(page);
				2374	set_page_dirty(page);
				2375	return ret;
				2376	}
				2377	EXPORT_SYMBOL(nobh_prepare_write);
				2378
				2379	int nobh_commit_write(struct file file, struct page page,
				2380	unsigned from, unsigned to)
				2381	{
				2382	struct inode *inode = page->mapping->host;
				2383	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2384
				2385	set_page_dirty(page);
				2386	if (pos > inode->i_size) {
				2387	i_size_write(inode, pos);
				2388	mark_inode_dirty(inode);
				2389	}
				2390	return 0;
				2391	}
				2392	EXPORT_SYMBOL(nobh_commit_write);
				2393
				2394	/*
				2395	* nobh_writepage() - based on block_full_write_page() except
				2396	* that it tries to operate without attaching bufferheads to
				2397	* the page.
				2398	*/
				2399	int nobh_writepage(struct page page, get_block_t get_block,
				2400	struct writeback_control *wbc)
				2401	{
				2402	struct inode * const inode = page->mapping->host;
				2403	loff_t i_size = i_size_read(inode);
				2404	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2405	unsigned offset;
				2406	void *kaddr;
				2407	int ret;
				2408
				2409	/* Is the page fully inside i_size? */
				2410	if (page->index < end_index)
				2411	goto out;
				2412
				2413	/* Is the page fully outside i_size? (truncate in progress) */
				2414	offset = i_size & (PAGE_CACHE_SIZE-1);
				2415	if (page->index >= end_index+1 \|\| !offset) {
				2416	/*
				2417	* The page may have dirty, unmapped buffers. For example,
				2418	* they may have been added in ext3_writepage(). Make them
				2419	* freeable here, so the page does not leak.
				2420	*/
				2421	#if 0
				2422	/* Not really sure about this - do we need this ? */
				2423	if (page->mapping->a_ops->invalidatepage)
				2424	page->mapping->a_ops->invalidatepage(page, offset);
				2425	#endif
				2426	unlock_page(page);
				2427	return 0; /* don't care */
				2428	}
				2429
				2430	/*
				2431	* The page straddles i_size. It must be zeroed out on each and every
				2432	* writepage invocation because it may be mmapped. "A file is mapped
				2433	* in multiples of the page size. For a file that is not a multiple of
				2434	* the page size, the remaining memory is zeroed when mapped, and
				2435	* writes to that region are not written out to the file."
				2436	*/
				2437	kaddr = kmap_atomic(page, KM_USER0);
				2438	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2439	flush_dcache_page(page);
				2440	kunmap_atomic(kaddr, KM_USER0);
				2441	out:
				2442	ret = mpage_writepage(page, get_block, wbc);
				2443	if (ret == -EAGAIN)
				2444	ret = __block_write_full_page(inode, page, get_block, wbc);
				2445	return ret;
				2446	}
				2447	EXPORT_SYMBOL(nobh_writepage);
				2448
				2449	/*
				2450	* This function assumes that ->prepare_write() uses nobh_prepare_write().
				2451	*/
				2452	int nobh_truncate_page(struct address_space *mapping, loff_t from)
				2453	{
				2454	struct inode *inode = mapping->host;
				2455	unsigned blocksize = 1 << inode->i_blkbits;
				2456	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2457	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2458	unsigned to;
				2459	struct page *page;
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	2460	const struct address_space_operations *a_ops = mapping->a_ops;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2461	char *kaddr;
				2462	int ret = 0;
				2463
				2464	if ((offset & (blocksize - 1)) == 0)
				2465	goto out;
				2466
				2467	ret = -ENOMEM;
				2468	page = grab_cache_page(mapping, index);
				2469	if (!page)
				2470	goto out;
				2471
				2472	to = (offset + blocksize) & ~(blocksize - 1);
				2473	ret = a_ops->prepare_write(NULL, page, offset, to);
				2474	if (ret == 0) {
				2475	kaddr = kmap_atomic(page, KM_USER0);
				2476	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2477	flush_dcache_page(page);
				2478	kunmap_atomic(kaddr, KM_USER0);
				2479	set_page_dirty(page);
				2480	}
				2481	unlock_page(page);
				2482	page_cache_release(page);
				2483	out:
				2484	return ret;
				2485	}
				2486	EXPORT_SYMBOL(nobh_truncate_page);
				2487
				2488	int block_truncate_page(struct address_space *mapping,
				2489	loff_t from, get_block_t *get_block)
				2490	{
				2491	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2492	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2493	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2494	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2495	unsigned length, pos;
				2496	struct inode *inode = mapping->host;
				2497	struct page *page;
				2498	struct buffer_head *bh;
				2499	void *kaddr;
				2500	int err;
				2501
				2502	blocksize = 1 << inode->i_blkbits;
				2503	length = offset & (blocksize - 1);
				2504
				2505	/* Block boundary? Nothing to do */
				2506	if (!length)
				2507	return 0;
				2508
				2509	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2510	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2511
				2512	page = grab_cache_page(mapping, index);
				2513	err = -ENOMEM;
				2514	if (!page)
				2515	goto out;
				2516
				2517	if (!page_has_buffers(page))
				2518	create_empty_buffers(page, blocksize, 0);
				2519
				2520	/* Find the buffer that contains "offset" */
				2521	bh = page_buffers(page);
				2522	pos = blocksize;
				2523	while (offset >= pos) {
				2524	bh = bh->b_this_page;
				2525	iblock++;
				2526	pos += blocksize;
				2527	}
				2528
				2529	err = 0;
				2530	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2531	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2532	err = get_block(inode, iblock, bh, 0);
				2533	if (err)
				2534	goto unlock;
				2535	/* unmapped? It's a hole - nothing to do */
				2536	if (!buffer_mapped(bh))
				2537	goto unlock;
				2538	}
				2539
				2540	/* Ok, it's mapped. Make sure it's up-to-date */
				2541	if (PageUptodate(page))
				2542	set_buffer_uptodate(bh);
				2543
				2544	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
				2545	err = -EIO;
				2546	ll_rw_block(READ, 1, &bh);
				2547	wait_on_buffer(bh);
				2548	/* Uhhuh. Read error. Complain and punt. */
				2549	if (!buffer_uptodate(bh))
				2550	goto unlock;
				2551	}
				2552
				2553	kaddr = kmap_atomic(page, KM_USER0);
				2554	memset(kaddr + offset, 0, length);
				2555	flush_dcache_page(page);
				2556	kunmap_atomic(kaddr, KM_USER0);
				2557
				2558	mark_buffer_dirty(bh);
				2559	err = 0;
				2560
				2561	unlock:
				2562	unlock_page(page);
				2563	page_cache_release(page);
				2564	out:
				2565	return err;
				2566	}
				2567
				2568	/*
				2569	* The generic ->writepage function for buffer-backed address_spaces
				2570	*/
				2571	int block_write_full_page(struct page page, get_block_t get_block,
				2572	struct writeback_control *wbc)
				2573	{
				2574	struct inode * const inode = page->mapping->host;
				2575	loff_t i_size = i_size_read(inode);
				2576	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2577	unsigned offset;
				2578	void *kaddr;
				2579
				2580	/* Is the page fully inside i_size? */
				2581	if (page->index < end_index)
				2582	return __block_write_full_page(inode, page, get_block, wbc);
				2583
				2584	/* Is the page fully outside i_size? (truncate in progress) */
				2585	offset = i_size & (PAGE_CACHE_SIZE-1);
				2586	if (page->index >= end_index+1 \|\| !offset) {
				2587	/*
				2588	* The page may have dirty, unmapped buffers. For example,
				2589	* they may have been added in ext3_writepage(). Make them
				2590	* freeable here, so the page does not leak.
				2591	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2592	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2593	unlock_page(page);
				2594	return 0; /* don't care */
				2595	}
				2596
				2597	/*
				2598	* The page straddles i_size. It must be zeroed out on each and every
				2599	* writepage invokation because it may be mmapped. "A file is mapped
				2600	* in multiples of the page size. For a file that is not a multiple of
				2601	* the page size, the remaining memory is zeroed when mapped, and
				2602	* writes to that region are not written out to the file."
				2603	*/
				2604	kaddr = kmap_atomic(page, KM_USER0);
				2605	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2606	flush_dcache_page(page);
				2607	kunmap_atomic(kaddr, KM_USER0);
				2608	return __block_write_full_page(inode, page, get_block, wbc);
				2609	}
				2610
				2611	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2612	get_block_t *get_block)
				2613	{
				2614	struct buffer_head tmp;
				2615	struct inode *inode = mapping->host;
				2616	tmp.b_state = 0;
				2617	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2618	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2619	get_block(inode, block, &tmp, 0);
				2620	return tmp.b_blocknr;
				2621	}
				2622
				2623	static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
				2624	{
				2625	struct buffer_head *bh = bio->bi_private;
				2626
				2627	if (bio->bi_size)
				2628	return 1;
				2629
				2630	if (err == -EOPNOTSUPP) {
				2631	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2632	set_bit(BH_Eopnotsupp, &bh->b_state);
				2633	}
				2634
				2635	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2636	bio_put(bio);
				2637	return 0;
				2638	}
				2639
				2640	int submit_bh(int rw, struct buffer_head * bh)
				2641	{
				2642	struct bio *bio;
				2643	int ret = 0;
				2644
				2645	BUG_ON(!buffer_locked(bh));
				2646	BUG_ON(!buffer_mapped(bh));
				2647	BUG_ON(!bh->b_end_io);
				2648
				2649	if (buffer_ordered(bh) && (rw == WRITE))
				2650	rw = WRITE_BARRIER;
				2651
				2652	/*
				2653	* Only clear out a write error when rewriting, should this
				2654	* include WRITE_SYNC as well?
				2655	*/
				2656	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2657	clear_buffer_write_io_error(bh);
				2658
				2659	/*
				2660	* from here on down, it's all bio -- do the initial mapping,
				2661	* submit_bio -> generic_make_request may further map this bio around
				2662	*/
				2663	bio = bio_alloc(GFP_NOIO, 1);
				2664
				2665	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2666	bio->bi_bdev = bh->b_bdev;
				2667	bio->bi_io_vec[0].bv_page = bh->b_page;
				2668	bio->bi_io_vec[0].bv_len = bh->b_size;
				2669	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2670
				2671	bio->bi_vcnt = 1;
				2672	bio->bi_idx = 0;
				2673	bio->bi_size = bh->b_size;
				2674
				2675	bio->bi_end_io = end_bio_bh_io_sync;
				2676	bio->bi_private = bh;
				2677
				2678	bio_get(bio);
				2679	submit_bio(rw, bio);
				2680
				2681	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2682	ret = -EOPNOTSUPP;
				2683
				2684	bio_put(bio);
				2685	return ret;
				2686	}
				2687
				2688	/**
				2689	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2690	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2691	* @nr: number of &struct buffer_heads in the array
				2692	* @bhs: array of pointers to &struct buffer_head
				2693	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2694	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2695	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2696	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2697	* are sent to disk. The fourth %READA option is described in the documentation
				2698	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2699	*
				2700	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2701	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2702	* clean when doing a write request, and any buffer that appears to be
				2703	* up-to-date when doing read request. Further it marks as clean buffers that
				2704	* are processed for writing (the buffer cache won't assume that they are
				2705	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2706	*
				2707	* ll_rw_block sets b_end_io to simple completion handler that marks
				2708	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				2709	* any waiters.
				2710	*
				2711	* All of the buffers must be for the same device, and must also be a
				2712	* multiple of the current approved size for the device.
				2713	*/
				2714	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2715	{
				2716	int i;
				2717
				2718	for (i = 0; i < nr; i++) {
				2719	struct buffer_head *bh = bhs[i];
				2720
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2721	if (rw == SWRITE)
				2722	lock_buffer(bh);
				2723	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2724	continue;
				2725
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2726	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2727	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2728	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2729	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2730	submit_bh(WRITE, bh);
				2731	continue;
				2732	}
				2733	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2734	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2735	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2736	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2737	submit_bh(rw, bh);
				2738	continue;
				2739	}
				2740	}
				2741	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2742	}
				2743	}
				2744
				2745	/*
				2746	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2747	* and then start new I/O and then wait upon it. The caller must have a ref on
				2748	* the buffer_head.
				2749	*/
				2750	int sync_dirty_buffer(struct buffer_head *bh)
				2751	{
				2752	int ret = 0;
				2753
				2754	WARN_ON(atomic_read(&bh->b_count) < 1);
				2755	lock_buffer(bh);
				2756	if (test_clear_buffer_dirty(bh)) {
				2757	get_bh(bh);
				2758	bh->b_end_io = end_buffer_write_sync;
				2759	ret = submit_bh(WRITE, bh);
				2760	wait_on_buffer(bh);
				2761	if (buffer_eopnotsupp(bh)) {
				2762	clear_buffer_eopnotsupp(bh);
				2763	ret = -EOPNOTSUPP;
				2764	}
				2765	if (!ret && !buffer_uptodate(bh))
				2766	ret = -EIO;
				2767	} else {
				2768	unlock_buffer(bh);
				2769	}
				2770	return ret;
				2771	}
				2772
				2773	/*
				2774	* try_to_free_buffers() checks if all the buffers on this particular page
				2775	* are unused, and releases them if so.
				2776	*
				2777	* Exclusion against try_to_free_buffers may be obtained by either
				2778	* locking the page or by holding its mapping's private_lock.
				2779	*
				2780	* If the page is dirty but all the buffers are clean then we need to
				2781	* be sure to mark the page clean as well. This is because the page
				2782	* may be against a block device, and a later reattachment of buffers
				2783	* to a dirty page will set all buffers dirty. Which would corrupt
				2784	* filesystem data on the same device.
				2785	*
				2786	* The same applies to regular filesystem pages: if all the buffers are
				2787	* clean then we set the page clean and proceed. To do that, we require
				2788	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				2789	* private_lock.
				2790	*
				2791	* try_to_free_buffers() is non-blocking.
				2792	*/
				2793	static inline int buffer_busy(struct buffer_head *bh)
				2794	{
				2795	return atomic_read(&bh->b_count) \|
				2796	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				2797	}
				2798
				2799	static int
				2800	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				2801	{
				2802	struct buffer_head *head = page_buffers(page);
				2803	struct buffer_head *bh;
				2804
				2805	bh = head;
				2806	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	2807	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2808	set_bit(AS_EIO, &page->mapping->flags);
				2809	if (buffer_busy(bh))
				2810	goto failed;
				2811	bh = bh->b_this_page;
				2812	} while (bh != head);
				2813
				2814	do {
				2815	struct buffer_head *next = bh->b_this_page;
				2816
				2817	if (!list_empty(&bh->b_assoc_buffers))
				2818	__remove_assoc_queue(bh);
				2819	bh = next;
				2820	} while (bh != head);
				2821	*buffers_to_free = head;
				2822	__clear_page_buffers(page);
				2823	return 1;
				2824	failed:
				2825	return 0;
				2826	}
				2827
				2828	int try_to_free_buffers(struct page *page)
				2829	{
				2830	struct address_space * const mapping = page->mapping;
				2831	struct buffer_head *buffers_to_free = NULL;
				2832	int ret = 0;
				2833
				2834	BUG_ON(!PageLocked(page));
				2835	if (PageWriteback(page))
				2836	return 0;
				2837
				2838	if (mapping == NULL) { /* can this still happen? */
				2839	ret = drop_buffers(page, &buffers_to_free);
				2840	goto out;
				2841	}
				2842
				2843	spin_lock(&mapping->private_lock);
				2844	ret = drop_buffers(page, &buffers_to_free);
Peter Zijlstra	d08b385	2006-09-25 23:30:57 -0700	[diff] [blame]	2845	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2846	if (ret) {
				2847	/*
				2848	* If the filesystem writes its buffers by hand (eg ext3)
				2849	* then we can have clean buffers against a dirty page. We
				2850	* clean the page here; otherwise later reattachment of buffers
				2851	* could encounter a non-uptodate page, which is unresolvable.
				2852	* This only applies in the rare case where try_to_free_buffers
				2853	* succeeds but the page is not freed.
				2854	*/
				2855	clear_page_dirty(page);
				2856	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2857	out:
				2858	if (buffers_to_free) {
				2859	struct buffer_head *bh = buffers_to_free;
				2860
				2861	do {
				2862	struct buffer_head *next = bh->b_this_page;
				2863	free_buffer_head(bh);
				2864	bh = next;
				2865	} while (bh != buffers_to_free);
				2866	}
				2867	return ret;
				2868	}
				2869	EXPORT_SYMBOL(try_to_free_buffers);
				2870
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	2871	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2872	{
				2873	struct address_space *mapping;
				2874
				2875	smp_mb();
				2876	mapping = page_mapping(page);
				2877	if (mapping)
				2878	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2879	}
				2880
				2881	/*
				2882	* There are no bdflush tunables left. But distributions are
				2883	* still running obsolete flush daemons, so we terminate them here.
				2884	*
				2885	* Use of bdflush() is deprecated and will be removed in a future kernel.
				2886	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				2887	*/
				2888	asmlinkage long sys_bdflush(int func, long data)
				2889	{
				2890	static int msg_count;
				2891
				2892	if (!capable(CAP_SYS_ADMIN))
				2893	return -EPERM;
				2894
				2895	if (msg_count < 5) {
				2896	msg_count++;
				2897	printk(KERN_INFO
				2898	"warning: process `%s' used the obsolete bdflush"
				2899	" system call\n", current->comm);
				2900	printk(KERN_INFO "Fix your initscripts?\n");
				2901	}
				2902
				2903	if (func == 1)
				2904	do_exit(0);
				2905	return 0;
				2906	}
				2907
				2908	/*
				2909	* Buffer-head allocation
				2910	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	2911	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2912
				2913	/*
				2914	* Once the number of bh's in the machine exceeds this level, we start
				2915	* stripping them in writeback.
				2916	*/
				2917	static int max_buffer_heads;
				2918
				2919	int buffer_heads_over_limit;
				2920
				2921	struct bh_accounting {
				2922	int nr; /* Number of live bh's */
				2923	int ratelimit; /* Limit cacheline bouncing */
				2924	};
				2925
				2926	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				2927
				2928	static void recalc_bh_state(void)
				2929	{
				2930	int i;
				2931	int tot = 0;
				2932
				2933	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				2934	return;
				2935	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	2936	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2937	tot += per_cpu(bh_accounting, i).nr;
				2938	buffer_heads_over_limit = (tot > max_buffer_heads);
				2939	}
				2940
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	2941	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2942	{
				2943	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
				2944	if (ret) {
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2945	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2946	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2947	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2948	}
				2949	return ret;
				2950	}
				2951	EXPORT_SYMBOL(alloc_buffer_head);
				2952
				2953	void free_buffer_head(struct buffer_head *bh)
				2954	{
				2955	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				2956	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2957	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2958	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2959	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2960	}
				2961	EXPORT_SYMBOL(free_buffer_head);
				2962
				2963	static void
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	2964	init_buffer_head(void data, struct kmem_cache cachep, unsigned long flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2965	{
				2966	if ((flags & (SLAB_CTOR_VERIFY\|SLAB_CTOR_CONSTRUCTOR)) ==
				2967	SLAB_CTOR_CONSTRUCTOR) {
				2968	struct buffer_head * bh = (struct buffer_head *)data;
				2969
				2970	memset(bh, 0, sizeof(*bh));
				2971	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				2972	}
				2973	}
				2974
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2975	static void buffer_exit_cpu(int cpu)
				2976	{
				2977	int i;
				2978	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				2979
				2980	for (i = 0; i < BH_LRU_SIZE; i++) {
				2981	brelse(b->bhs[i]);
				2982	b->bhs[i] = NULL;
				2983	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	2984	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				2985	per_cpu(bh_accounting, cpu).nr = 0;
				2986	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2987	}
				2988
				2989	static int buffer_cpu_notify(struct notifier_block *self,
				2990	unsigned long action, void *hcpu)
				2991	{
				2992	if (action == CPU_DEAD)
				2993	buffer_exit_cpu((unsigned long)hcpu);
				2994	return NOTIFY_OK;
				2995	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2996
				2997	void __init buffer_init(void)
				2998	{
				2999	int nrpages;
				3000
				3001	bh_cachep = kmem_cache_create("buffer_head",
Paul Jackson	b019600	2006-03-24 03:16:09 -0800	[diff] [blame]	3002	sizeof(struct buffer_head), 0,
				3003	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3004	SLAB_MEM_SPREAD),
				3005	init_buffer_head,
				3006	NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3007
				3008	/*
				3009	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3010	*/
				3011	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3012	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3013	hotcpu_notifier(buffer_cpu_notify, 0);
				3014	}
				3015
				3016	EXPORT_SYMBOL(__bforget);
				3017	EXPORT_SYMBOL(__brelse);
				3018	EXPORT_SYMBOL(__wait_on_buffer);
				3019	EXPORT_SYMBOL(block_commit_write);
				3020	EXPORT_SYMBOL(block_prepare_write);
				3021	EXPORT_SYMBOL(block_read_full_page);
				3022	EXPORT_SYMBOL(block_sync_page);
				3023	EXPORT_SYMBOL(block_truncate_page);
				3024	EXPORT_SYMBOL(block_write_full_page);
				3025	EXPORT_SYMBOL(cont_prepare_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3026	EXPORT_SYMBOL(end_buffer_read_sync);
				3027	EXPORT_SYMBOL(end_buffer_write_sync);
				3028	EXPORT_SYMBOL(file_fsync);
				3029	EXPORT_SYMBOL(fsync_bdev);
				3030	EXPORT_SYMBOL(generic_block_bmap);
				3031	EXPORT_SYMBOL(generic_commit_write);
				3032	EXPORT_SYMBOL(generic_cont_expand);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3033	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3034	EXPORT_SYMBOL(init_buffer);
				3035	EXPORT_SYMBOL(invalidate_bdev);
				3036	EXPORT_SYMBOL(ll_rw_block);
				3037	EXPORT_SYMBOL(mark_buffer_dirty);
				3038	EXPORT_SYMBOL(submit_bh);
				3039	EXPORT_SYMBOL(sync_dirty_buffer);
				3040	EXPORT_SYMBOL(unlock_buffer);