/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void invalidate_bh_lrus(void);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
	bh->b_end_io = handler;
	bh->b_private = private;
}

static int sync_buffer(void *word)
{
	struct block_device *bd;
	struct buffer_head *bh
		= container_of(word, struct buffer_head, b_state);

	smp_mb();
	bd = bh->b_bdev;
	if (bd)
		blk_run_address_space(bd->bd_inode->i_mapping);
	io_schedule();
	return 0;
}

void fastcall __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void fastcall unlock_buffer(struct buffer_head *bh)
{
	clear_buffer_locked(bh);
	smp_mb__after_clear_bit();
	wake_up_bit(&bh->b_state, BH_Lock);
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
}
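
/*
 * A minimal usage sketch (illustrative only, not called from this file):
 * a reader that needs the block's current contents starts the I/O itself
 * and then waits; coming unlocked alone does not guarantee success, so
 * buffer_uptodate() must be checked afterwards.  "bdev", "block" and
 * "size" are assumed to be supplied by the caller.
 *
 *	struct buffer_head *bh = __getblk(bdev, block, size);
 *
 *	if (!buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh)) {
 *			brelse(bh);
 *			return -EIO;
 *		}
 *	}
 *	... use bh->b_data ...
 *	brelse(bh);
 */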

static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}

static void buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];

	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
			bdevname(bh->b_bdev, b),
			(unsigned long long)bh->b_blocknr);
}

/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer. This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];

	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
				       bdevname(bh->b_bdev, b));
		}
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	int ret = 0;

	if (bdev)
		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
	return ret;
}
EXPORT_SYMBOL(sync_blockdev);

static void __fsync_super(struct super_block *sb)
{
	sync_inodes_sb(sb, 0);
	DQUOT_SYNC(sb);
	lock_super(sb);
	if (sb->s_dirt && sb->s_op->write_super)
		sb->s_op->write_super(sb);
	unlock_super(sb);
	if (sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, 1);
	sync_blockdev(sb->s_bdev);
	sync_inodes_sb(sb, 1);
}

/*
 * Write out and wait upon all dirty data associated with this
 * superblock.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_super(struct super_block *sb)
{
	__fsync_super(sb);
	return sync_blockdev(sb->s_bdev);
}

/*
 * Write out and wait upon all dirty data associated with this
 * device.   Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = fsync_super(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * This takes the block device bd_mount_mutex to make sure no new mounts
 * happen on bdev until thaw_bdev() is called.
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;

	mutex_lock(&bdev->bd_mount_mutex);
	sb = get_super(bdev);
	if (sb && !(sb->s_flags & MS_RDONLY)) {
		sb->s_frozen = SB_FREEZE_WRITE;
		smp_wmb();

		__fsync_super(sb);

		sb->s_frozen = SB_FREEZE_TRANS;
		smp_wmb();

		sync_blockdev(sb->s_bdev);

		if (sb->s_op->write_super_lockfs)
			sb->s_op->write_super_lockfs(sb);
	}

	sync_blockdev(bdev);
	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_mutex */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	if (sb) {
		BUG_ON(sb->s_bdev != bdev);

		if (sb->s_op->unlockfs)
			sb->s_op->unlockfs(sb);
		sb->s_frozen = SB_UNFROZEN;
		smp_wmb();
		wake_up(&sb->s_wait_unfrozen);
		drop_super(sb);
	}

	mutex_unlock(&bdev->bd_mount_mutex);
}
EXPORT_SYMBOL(thaw_bdev);
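
/*
 * Illustrative sketch (not used in this file): a volume-snapshot or backup
 * implementation brackets its snapshot creation with the pair above, with
 * "bdev" being the device to snapshot:
 *
 *	struct super_block *sb;
 *
 *	sb = freeze_bdev(bdev);
 *	... create the snapshot while no new writes can start ...
 *	thaw_bdev(bdev, sb);
 */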

/*
 * sync everything.  Start out by waking pdflush, because that writes back
 * all queues in parallel.
 */
static void do_sync(unsigned long wait)
{
	wakeup_pdflush(0);
	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
	DQUOT_SYNC(NULL);
	sync_supers();		/* Write the superblocks */
	sync_filesystems(0);	/* Start syncing the filesystems */
	sync_filesystems(wait);	/* Waitingly sync the filesystems */
	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
	if (!wait)
		printk("Emergency Sync complete\n");
	if (unlikely(laptop_mode))
		laptop_sync_completion();
}

asmlinkage long sys_sync(void)
{
	do_sync(1);
	return 0;
}

void emergency_sync(void)
{
	pdflush_operation(do_sync, 0);
}

/*
 * Generic function to fsync a file.
 *
 * filp may be NULL if called via the msync of a vma.
 */

int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	int ret, err;

	/* sync the inode to buffers */
	ret = write_inode_now(inode, 0);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	lock_super(sb);
	if (sb->s_op->write_super)
		sb->s_op->write_super(sb);
	unlock_super(sb);

	/* .. finally sync the buffers to disk */
	err = sync_blockdev(sb->s_bdev);
	if (!ret)
		ret = err;
	return ret;
}
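
/*
 * Sketch of typical use (assumed, not from this file): a simple filesystem
 * whose metadata all lives in the blockdev mapping can point its
 * file_operations at this helper instead of writing its own fsync method.
 * "examplefs_file_operations" is a made-up name:
 *
 *	static struct file_operations examplefs_file_operations = {
 *		.read	= generic_file_read,
 *		.write	= generic_file_write,
 *		.mmap	= generic_file_mmap,
 *		.fsync	= file_fsync,
 *	};
 */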

long do_fsync(struct file *file, int datasync)
{
	int ret;
	int err;
	struct address_space *mapping = file->f_mapping;

	if (!file->f_op || !file->f_op->fsync) {
		/* Why?  We can still call filemap_fdatawrite */
		ret = -EINVAL;
		goto out;
	}

	current->flags |= PF_SYNCWRITE;
	ret = filemap_fdatawrite(mapping);

	/*
	 * We need to protect against concurrent writers, which could cause
	 * livelocks in fsync_buffers_list().
	 */
	mutex_lock(&mapping->host->i_mutex);
	err = file->f_op->fsync(file, file->f_dentry, datasync);
	if (!ret)
		ret = err;
	mutex_unlock(&mapping->host->i_mutex);
	err = filemap_fdatawait(mapping);
	if (!ret)
		ret = err;
	current->flags &= ~PF_SYNCWRITE;
out:
	return ret;
}

static long __do_fsync(unsigned int fd, int datasync)
{
	struct file *file;
	int ret = -EBADF;

	file = fget(fd);
	if (file) {
		ret = do_fsync(file, datasync);
		fput(file);
	}
	return ret;
}

asmlinkage long sys_fsync(unsigned int fd)
{
	return __do_fsync(fd, 0);
}

asmlinkage long sys_fdatasync(unsigned int fd)
{
	return __do_fsync(fd, 1);
}
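
/*
 * From userspace these are reached through the fsync(2) and fdatasync(2)
 * system calls; datasync == 1 means inode metadata that is not needed to
 * retrieve the data may be skipped.  Hedged application-side sketch (file
 * name and error handler are made up):
 *
 *	int fd = open("journal.dat", O_WRONLY);
 *
 *	write(fd, buf, len);
 *	if (fdatasync(fd) < 0)		// data (and size change) on disk
 *		handle_error();
 *	...
 *	if (fsync(fd) < 0)		// data plus all inode metadata
 *		handle_error();
 */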

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock. (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *bd_mapping = bd_inode->i_mapping;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int all_mapped = 1;

	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
	page = find_get_page(bd_mapping, index);
	if (!page)
		goto out;

	spin_lock(&bd_mapping->private_lock);
	if (!page_has_buffers(page))
		goto out_unlock;
	head = page_buffers(page);
	bh = head;
	do {
		if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		if (!buffer_mapped(bh))
			all_mapped = 0;
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	if (all_mapped) {
		printk("__find_get_block_slow() failed. "
			"block=%llu, b_blocknr=%llu\n",
			(unsigned long long)block,
			(unsigned long long)bh->b_blocknr);
		printk("b_state=0x%08lx, b_size=%zu\n",
			bh->b_state, bh->b_size);
		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
	}
out_unlock:
	spin_unlock(&bd_mapping->private_lock);
	page_cache_release(page);
out:
	return ret;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.

   We also skip buffers which are still in use.  For example this can
   happen if a userspace program is reading the block device.

   NOTE: In the case where the user removed a removable-media disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or to an error of the user), by not destroying the dirty buffers
   we could generate corruption also on the next media inserted, thus a
   parameter is necessary to handle this case in the safest way possible
   (trying not to corrupt the newly inserted disk with data belonging to
   the old, now corrupted, disk). Also for the ramdisk the natural thing
   to do in order to release the ramdisk memory is to destroy dirty buffers.

   These are two special cases. Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced. It is needed to re-read from disk any pinned
   buffer. NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
{
	invalidate_bh_lrus();
	/*
	 * FIXME: what about destroy_dirty_buffers?
	 * We really want to use invalidate_inode_pages2() for
	 * that, but not until that's cleaned up.
	 */
	invalidate_inode_pages(bdev->bd_inode->i_mapping);
}
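
/*
 * The normal-usage pattern described in the comment above, roughly as the
 * BLKFLSBUF ioctl handler spells it (sketch only; "bdev" is whatever
 * device the caller holds a reference on):
 *
 *	fsync_bdev(bdev);
 *	invalidate_bdev(bdev, 0);	// dirty buffers are preserved/skipped
 */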

/*
 * Kick pdflush then try to free up some ZONE_NORMAL memory.
 */
static void free_more_memory(void)
{
	struct zone **zones;
	pg_data_t *pgdat;

	wakeup_pdflush(1024);
	yield();

	for_each_online_pgdat(pgdat) {
		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
		if (*zones)
			try_to_free_pages(zones, GFP_NOFS);
	}
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;
	int page_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		if (printk_ratelimit())
			buffer_io_error(bh);
		SetPageError(page);
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);

	/*
	 * If none of the buffers had errors and they are all
	 * uptodate then we can set the page uptodate.
	 */
	if (page_uptodate && !PageError(page))
		SetPageUptodate(page);
	unlock_page(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;

	BUG_ON(!buffer_async_write(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (printk_ratelimit()) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
			       bdevname(bh->b_bdev, b));
		}
		set_bit(AS_EIO, &page->mapping->flags);
		clear_buffer_uptodate(bh);
		SetPageError(page);
	}

	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	end_page_writeback(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * If a page's buffers are under async read I/O (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against any
 * of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read;
	set_buffer_async_read(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_write;
	set_buffer_async_write(bh);
}
EXPORT_SYMBOL(mark_buffer_async_write);
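
/*
 * Hedged sketch of how a writepage implementation drives this machinery
 * (greatly simplified from what block_write_full_page() does for each
 * mapped, dirty buffer on the page; error handling omitted):
 *
 *	lock_buffer(bh);
 *	if (test_clear_buffer_dirty(bh))
 *		mark_buffer_async_write(bh);
 *	else
 *		unlock_buffer(bh);
 *	...
 *	set_page_writeback(page);
 *	unlock_page(page);
 *	...
 *	submit_bh(WRITE, bh);	// end_buffer_async_write() runs at completion
 */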


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static inline void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

/**
 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 *                        buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
	struct address_space *buffer_mapping = mapping->assoc_mapping;

	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
		return 0;

	return fsync_buffers_list(&buffer_mapping->private_lock,
					&mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
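
/*
 * Hedged example of the pattern described above, in the shape an ext2-like
 * filesystem might use (names are illustrative): when allocating an
 * indirect block it files the buffer on the inode's associated list, and
 * its fsync method then writes that list out.
 *
 *	// while building the indirect chain:
 *	mark_buffer_dirty_inode(ind_bh, inode);
 *
 *	// in the filesystem's ->fsync():
 *	int examplefs_fsync(struct file *file, struct dentry *dentry,
 *			    int datasync)
 *	{
 *		return sync_mapping_buffers(dentry->d_inode->i_mapping);
 *	}
 */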

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
			sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
	if (bh) {
		if (buffer_dirty(bh))
			ll_rw_block(WRITE, 1, &bh);
		put_bh(bh);
	}
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct address_space *buffer_mapping = bh->b_page->mapping;

	mark_buffer_dirty(bh);
	if (!mapping->assoc_mapping) {
		mapping->assoc_mapping = buffer_mapping;
	} else {
		BUG_ON(mapping->assoc_mapping != buffer_mapping);
	}
	if (list_empty(&bh->b_assoc_buffers)) {
		spin_lock(&buffer_mapping->private_lock);
		list_move_tail(&bh->b_assoc_buffers,
				&mapping->private_list);
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
	struct address_space * const mapping = page->mapping;

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			set_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	spin_unlock(&mapping->private_lock);

	if (!TestSetPageDirty(page)) {
		write_lock_irq(&mapping->tree_lock);
		if (page->mapping) {	/* Race with truncate? */
			if (mapping_cap_account_dirty(mapping))
				inc_page_state(nr_dirty);
			radix_tree_tag_set(&mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		}
		write_unlock_irq(&mapping->tree_lock);
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
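
/*
 * Hedged note on wiring this up: set_page_dirty() falls back to
 * __set_page_dirty_buffers() for pages whose address_space supplies no
 * set_page_dirty method, and a buffer-backed filesystem may also name it
 * explicitly in its aops (illustrative struct, not from this file):
 *
 *	static struct address_space_operations examplefs_aops = {
 *		.readpage	= examplefs_readpage,
 *		.writepage	= examplefs_writepage,
 *		.set_page_dirty	= __set_page_dirty_buffers,
 *	};
 */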

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head tmp;
	int err = 0, err2;

	INIT_LIST_HEAD(&tmp);

	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		list_del_init(&bh->b_assoc_buffers);
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * ll_rw_block() actually writes the current
				 * contents - it is a noop if I/O is still in
				 * flight on potentially older contents.
				 */
				ll_rw_block(SWRITE, 1, &bh);
				brelse(bh);
				spin_lock(lock);
			}
		}
	}

	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		__remove_assoc_queue(bh);
		get_bh(bh);
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}

	spin_unlock(lock);
	err2 = osync_buffers_list(lock, list);
	if (err)
		return err;
	else
		return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list))
			__remove_assoc_queue(BH_ENTRY(list->next));
		spin_unlock(&buffer_mapping->private_lock);
	}
}

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
	int ret = 1;

	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list)) {
			struct buffer_head *bh = BH_ENTRY(list->next);
			if (buffer_dirty(bh)) {
				ret = 0;
				break;
			}
			__remove_assoc_queue(bh);
		}
		spin_unlock(&buffer_mapping->private_lock);
	}
	return ret;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
		int retry)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = alloc_buffer_head(GFP_NOFS);
		if (!bh)
			goto no_grow;

		bh->b_bdev = NULL;
		bh->b_this_page = head;
		bh->b_blocknr = -1;
		head = bh;

		bh->b_state = 0;
		atomic_set(&bh->b_count, 0);
		bh->b_private = NULL;
		bh->b_size = size;

		/* Link the buffer to its page */
		set_bh_page(bh, page, offset);

		init_buffer(bh, NULL, NULL);
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			free_buffer_head(bh);
		} while (head);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!retry)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	free_more_memory();
	goto try_again;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh, *tail;

	bh = head;
	do {
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	attach_page_buffers(page, head);
}

/*
 * Initialise the state of a blockdev page's buffers.
 */
static void
init_page_buffers(struct page *page, struct block_device *bdev,
			sector_t block, int size)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh = head;
	int uptodate = PageUptodate(page);

	do {
		if (!buffer_mapped(bh)) {
			init_buffer(bh, NULL, NULL);
			bh->b_bdev = bdev;
			bh->b_blocknr = block;
			if (uptodate)
				set_buffer_uptodate(bh);
			set_buffer_mapped(bh);
		}
		block++;
		bh = bh->b_this_page;
	} while (bh != head);
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 | 1105 |  */ | 
 | 1106 | static struct page * | 
 | 1107 | grow_dev_page(struct block_device *bdev, sector_t block, | 
 | 1108 | 		pgoff_t index, int size) | 
 | 1109 | { | 
 | 1110 | 	struct inode *inode = bdev->bd_inode; | 
 | 1111 | 	struct page *page; | 
 | 1112 | 	struct buffer_head *bh; | 
 | 1113 |  | 
 | 1114 | 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); | 
 | 1115 | 	if (!page) | 
 | 1116 | 		return NULL; | 
 | 1117 |  | 
| Eric Sesterhenn | e827f92 | 2006-03-26 18:24:46 +0200 | [diff] [blame] | 1118 | 	BUG_ON(!PageLocked(page)); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1119 |  | 
 | 1120 | 	if (page_has_buffers(page)) { | 
 | 1121 | 		bh = page_buffers(page); | 
 | 1122 | 		if (bh->b_size == size) { | 
 | 1123 | 			init_page_buffers(page, bdev, block, size); | 
 | 1124 | 			return page; | 
 | 1125 | 		} | 
 | 1126 | 		if (!try_to_free_buffers(page)) | 
 | 1127 | 			goto failed; | 
 | 1128 | 	} | 
 | 1129 |  | 
 | 1130 | 	/* | 
 | 1131 | 	 * Allocate some buffers for this page | 
 | 1132 | 	 */ | 
 | 1133 | 	bh = alloc_page_buffers(page, size, 0); | 
 | 1134 | 	if (!bh) | 
 | 1135 | 		goto failed; | 
 | 1136 |  | 
 | 1137 | 	/* | 
 | 1138 | 	 * Link the page to the buffers and initialise them.  Take the | 
 | 1139 | 	 * lock to be atomic wrt __find_get_block(), which does not | 
 | 1140 | 	 * run under the page lock. | 
 | 1141 | 	 */ | 
 | 1142 | 	spin_lock(&inode->i_mapping->private_lock); | 
 | 1143 | 	link_dev_buffers(page, bh); | 
 | 1144 | 	init_page_buffers(page, bdev, block, size); | 
 | 1145 | 	spin_unlock(&inode->i_mapping->private_lock); | 
 | 1146 | 	return page; | 
 | 1147 |  | 
 | 1148 | failed: | 
 | 1149 | 	BUG(); | 
 | 1150 | 	unlock_page(page); | 
 | 1151 | 	page_cache_release(page); | 
 | 1152 | 	return NULL; | 
 | 1153 | } | 
 | 1154 |  | 
 | 1155 | /* | 
 | 1156 |  * Create buffers for the specified block device block's page.  If | 
 | 1157 |  * that page was dirty, the buffers are set dirty also. | 
 | 1158 |  * | 
 | 1159 |  * Except that's a bug.  Attaching dirty buffers to a dirty | 
 | 1160 |  * blockdev's page can result in filesystem corruption, because | 
 | 1161 |  * some of those buffers may be aliases of filesystem data. | 
 | 1162 |  * grow_dev_page() will go BUG() if this happens. | 
 | 1163 |  */ | 
| Arjan van de Ven | 858119e | 2006-01-14 13:20:43 -0800 | [diff] [blame] | 1164 | static int | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1165 | grow_buffers(struct block_device *bdev, sector_t block, int size) | 
 | 1166 | { | 
 | 1167 | 	struct page *page; | 
 | 1168 | 	pgoff_t index; | 
 | 1169 | 	int sizebits; | 
 | 1170 |  | 
 | 1171 | 	sizebits = -1; | 
 | 1172 | 	do { | 
 | 1173 | 		sizebits++; | 
 | 1174 | 	} while ((size << sizebits) < PAGE_SIZE); | 
 | 1175 |  | 
 | 1176 | 	index = block >> sizebits; | 
 | 1177 | 	block = index << sizebits; | 
 | 1178 |  | 
 | 1179 | 	/* Create a page with the proper size buffers.. */ | 
 | 1180 | 	page = grow_dev_page(bdev, block, index, size); | 
 | 1181 | 	if (!page) | 
 | 1182 | 		return 0; | 
 | 1183 | 	unlock_page(page); | 
 | 1184 | 	page_cache_release(page); | 
 | 1185 | 	return 1; | 
 | 1186 | } | 
 | 1187 |  | 
| Adrian Bunk | 75c96f8 | 2005-05-05 16:16:09 -0700 | [diff] [blame] | 1188 | static struct buffer_head * | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1189 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 
 | 1190 | { | 
 | 1191 | 	/* Size must be multiple of hard sectorsize */ | 
 | 1192 | 	if (unlikely(size & (bdev_hardsect_size(bdev)-1) || | 
 | 1193 | 			(size < 512 || size > PAGE_SIZE))) { | 
 | 1194 | 		printk(KERN_ERR "getblk(): invalid block size %d requested\n", | 
 | 1195 | 					size); | 
 | 1196 | 		printk(KERN_ERR "hardsect size: %d\n", | 
 | 1197 | 					bdev_hardsect_size(bdev)); | 
 | 1198 |  | 
 | 1199 | 		dump_stack(); | 
 | 1200 | 		return NULL; | 
 | 1201 | 	} | 
 | 1202 |  | 
 | 1203 | 	for (;;) { | 
 | 1204 | 		struct buffer_head * bh; | 
 | 1205 |  | 
 | 1206 | 		bh = __find_get_block(bdev, block, size); | 
 | 1207 | 		if (bh) | 
 | 1208 | 			return bh; | 
 | 1209 |  | 
 | 1210 | 		if (!grow_buffers(bdev, block, size)) | 
 | 1211 | 			free_more_memory(); | 
 | 1212 | 	} | 
 | 1213 | } | 
 | 1214 |  | 
 | 1215 | /* | 
 | 1216 |  * The relationship between dirty buffers and dirty pages: | 
 | 1217 |  * | 
 | 1218 |  * Whenever a page has any dirty buffers, the page's dirty bit is set, and | 
 | 1219 |  * the page is tagged dirty in its radix tree. | 
 | 1220 |  * | 
 | 1221 |  * At all times, the dirtiness of the buffers represents the dirtiness of | 
 | 1222 |  * subsections of the page.  If the page has buffers, the page dirty bit is | 
 | 1223 |  * merely a hint about the true dirty state. | 
 | 1224 |  * | 
 | 1225 |  * When a page is set dirty in its entirety, all its buffers are marked dirty | 
 | 1226 |  * (if the page has buffers). | 
 | 1227 |  * | 
 | 1228 |  * When a buffer is marked dirty, its page is dirtied, but the page's other | 
 | 1229 |  * buffers are not. | 
 | 1230 |  * | 
 | 1231 |  * Also.  When blockdev buffers are explicitly read with bread(), they | 
 | 1232 |  * individually become uptodate.  But their backing page remains not | 
 | 1233 |  * uptodate - even if all of its buffers are uptodate.  A subsequent | 
 | 1234 |  * block_read_full_page() against that page will discover all the uptodate | 
 | 1235 |  * buffers, will set the page uptodate and will perform no I/O. | 
 | 1236 |  */ | 
 | 1237 |  | 
 | 1238 | /** | 
 | 1239 |  * mark_buffer_dirty - mark a buffer_head as needing writeout | 
| Martin Waitz | 67be2dd | 2005-05-01 08:59:26 -0700 | [diff] [blame] | 1240 |  * @bh: the buffer_head to mark dirty | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1241 |  * | 
 | 1242 |  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its | 
 | 1243 |  * backing page dirty, then tag the page as dirty in its address_space's radix | 
 | 1244 |  * tree and then attach the address_space's inode to its superblock's dirty | 
 | 1245 |  * inode list. | 
 | 1246 |  * | 
 | 1247 |  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock, | 
 | 1248 |  * mapping->tree_lock and the global inode_lock. | 
 | 1249 |  */ | 
 | 1250 | void fastcall mark_buffer_dirty(struct buffer_head *bh) | 
 | 1251 | { | 
 | 1252 | 	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) | 
 | 1253 | 		__set_page_dirty_nobuffers(bh->b_page); | 
 | 1254 | } | 
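To make the buffer/page dirtying rules above concrete, here is a minimal illustrative sketch (not part of buffer.c): a hypothetical caller holds the page lock on a buffer-backed page, has modified one block's worth of data, and dirties only that buffer. The helper name and calling convention are assumptions for illustration.

#include <linux/buffer_head.h>

/*
 * Illustrative sketch only - not part of buffer.c.  'nr' selects one of
 * the page's buffers; the caller is assumed to hold the page lock and to
 * have already updated that block's data.
 */
static void example_dirty_one_buffer(struct page *page, int nr)
{
	struct buffer_head *bh = page_buffers(page);

	while (nr--)
		bh = bh->b_this_page;
	/*
	 * Dirtying the buffer also dirties the page and tags it in the
	 * radix tree, but the page's other buffers stay clean, exactly as
	 * the comment above describes.
	 */
	mark_buffer_dirty(bh);
}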
 | 1255 |  | 
 | 1256 | /* | 
 | 1257 |  * Decrement a buffer_head's reference count.  If all buffers against a page | 
 | 1258 |  * have zero reference count, are clean and unlocked, and if the page is clean | 
 | 1259 |  * and unlocked then try_to_free_buffers() may strip the buffers from the page | 
 | 1260 |  * in preparation for freeing it (sometimes, rarely, buffers are removed from | 
 | 1261 |  * a page but it ends up not being freed, and buffers may later be reattached). | 
 | 1262 |  */ | 
 | 1263 | void __brelse(struct buffer_head * buf) | 
 | 1264 | { | 
 | 1265 | 	if (atomic_read(&buf->b_count)) { | 
 | 1266 | 		put_bh(buf); | 
 | 1267 | 		return; | 
 | 1268 | 	} | 
 | 1269 | 	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); | 
 | 1270 | 	WARN_ON(1); | 
 | 1271 | } | 
 | 1272 |  | 
 | 1273 | /* | 
 | 1274 |  * bforget() is like brelse(), except it discards any | 
 | 1275 |  * potentially dirty data. | 
 | 1276 |  */ | 
 | 1277 | void __bforget(struct buffer_head *bh) | 
 | 1278 | { | 
 | 1279 | 	clear_buffer_dirty(bh); | 
 | 1280 | 	if (!list_empty(&bh->b_assoc_buffers)) { | 
 | 1281 | 		struct address_space *buffer_mapping = bh->b_page->mapping; | 
 | 1282 |  | 
 | 1283 | 		spin_lock(&buffer_mapping->private_lock); | 
 | 1284 | 		list_del_init(&bh->b_assoc_buffers); | 
 | 1285 | 		spin_unlock(&buffer_mapping->private_lock); | 
 | 1286 | 	} | 
 | 1287 | 	__brelse(bh); | 
 | 1288 | } | 
 | 1289 |  | 
 | 1290 | static struct buffer_head *__bread_slow(struct buffer_head *bh) | 
 | 1291 | { | 
 | 1292 | 	lock_buffer(bh); | 
 | 1293 | 	if (buffer_uptodate(bh)) { | 
 | 1294 | 		unlock_buffer(bh); | 
 | 1295 | 		return bh; | 
 | 1296 | 	} else { | 
 | 1297 | 		get_bh(bh); | 
 | 1298 | 		bh->b_end_io = end_buffer_read_sync; | 
 | 1299 | 		submit_bh(READ, bh); | 
 | 1300 | 		wait_on_buffer(bh); | 
 | 1301 | 		if (buffer_uptodate(bh)) | 
 | 1302 | 			return bh; | 
 | 1303 | 	} | 
 | 1304 | 	brelse(bh); | 
 | 1305 | 	return NULL; | 
 | 1306 | } | 
 | 1307 |  | 
 | 1308 | /* | 
 | 1309 |  * Per-cpu buffer LRU implementation, used to reduce the cost of __find_get_block(). | 
 | 1310 |  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their | 
 | 1311 |  * refcount elevated by one when they're in an LRU.  A buffer can only appear | 
 | 1312 |  * once in a particular CPU's LRU.  A single buffer can be present in multiple | 
 | 1313 |  * CPU's LRUs at the same time. | 
 | 1314 |  * | 
 | 1315 |  * This is a transparent caching front-end to sb_bread(), sb_getblk() and | 
 | 1316 |  * sb_find_get_block(). | 
 | 1317 |  * | 
 | 1318 |  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use | 
 | 1319 |  * a local interrupt disable for that. | 
 | 1320 |  */ | 
 | 1321 |  | 
 | 1322 | #define BH_LRU_SIZE	8 | 
 | 1323 |  | 
 | 1324 | struct bh_lru { | 
 | 1325 | 	struct buffer_head *bhs[BH_LRU_SIZE]; | 
 | 1326 | }; | 
 | 1327 |  | 
 | 1328 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; | 
 | 1329 |  | 
 | 1330 | #ifdef CONFIG_SMP | 
 | 1331 | #define bh_lru_lock()	local_irq_disable() | 
 | 1332 | #define bh_lru_unlock()	local_irq_enable() | 
 | 1333 | #else | 
 | 1334 | #define bh_lru_lock()	preempt_disable() | 
 | 1335 | #define bh_lru_unlock()	preempt_enable() | 
 | 1336 | #endif | 
 | 1337 |  | 
 | 1338 | static inline void check_irqs_on(void) | 
 | 1339 | { | 
 | 1340 | #ifdef irqs_disabled | 
 | 1341 | 	BUG_ON(irqs_disabled()); | 
 | 1342 | #endif | 
 | 1343 | } | 
 | 1344 |  | 
 | 1345 | /* | 
 | 1346 |  * The LRU management algorithm is dopey-but-simple.  Sorry. | 
 | 1347 |  */ | 
 | 1348 | static void bh_lru_install(struct buffer_head *bh) | 
 | 1349 | { | 
 | 1350 | 	struct buffer_head *evictee = NULL; | 
 | 1351 | 	struct bh_lru *lru; | 
 | 1352 |  | 
 | 1353 | 	check_irqs_on(); | 
 | 1354 | 	bh_lru_lock(); | 
 | 1355 | 	lru = &__get_cpu_var(bh_lrus); | 
 | 1356 | 	if (lru->bhs[0] != bh) { | 
 | 1357 | 		struct buffer_head *bhs[BH_LRU_SIZE]; | 
 | 1358 | 		int in; | 
 | 1359 | 		int out = 0; | 
 | 1360 |  | 
 | 1361 | 		get_bh(bh); | 
 | 1362 | 		bhs[out++] = bh; | 
 | 1363 | 		for (in = 0; in < BH_LRU_SIZE; in++) { | 
 | 1364 | 			struct buffer_head *bh2 = lru->bhs[in]; | 
 | 1365 |  | 
 | 1366 | 			if (bh2 == bh) { | 
 | 1367 | 				__brelse(bh2); | 
 | 1368 | 			} else { | 
 | 1369 | 				if (out >= BH_LRU_SIZE) { | 
 | 1370 | 					BUG_ON(evictee != NULL); | 
 | 1371 | 					evictee = bh2; | 
 | 1372 | 				} else { | 
 | 1373 | 					bhs[out++] = bh2; | 
 | 1374 | 				} | 
 | 1375 | 			} | 
 | 1376 | 		} | 
 | 1377 | 		while (out < BH_LRU_SIZE) | 
 | 1378 | 			bhs[out++] = NULL; | 
 | 1379 | 		memcpy(lru->bhs, bhs, sizeof(bhs)); | 
 | 1380 | 	} | 
 | 1381 | 	bh_lru_unlock(); | 
 | 1382 |  | 
 | 1383 | 	if (evictee) | 
 | 1384 | 		__brelse(evictee); | 
 | 1385 | } | 
 | 1386 |  | 
 | 1387 | /* | 
 | 1388 |  * Look up the bh in this cpu's LRU.  If it's there, move it to the head. | 
 | 1389 |  */ | 
| Arjan van de Ven | 858119e | 2006-01-14 13:20:43 -0800 | [diff] [blame] | 1390 | static struct buffer_head * | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1391 | lookup_bh_lru(struct block_device *bdev, sector_t block, int size) | 
 | 1392 | { | 
 | 1393 | 	struct buffer_head *ret = NULL; | 
 | 1394 | 	struct bh_lru *lru; | 
 | 1395 | 	int i; | 
 | 1396 |  | 
 | 1397 | 	check_irqs_on(); | 
 | 1398 | 	bh_lru_lock(); | 
 | 1399 | 	lru = &__get_cpu_var(bh_lrus); | 
 | 1400 | 	for (i = 0; i < BH_LRU_SIZE; i++) { | 
 | 1401 | 		struct buffer_head *bh = lru->bhs[i]; | 
 | 1402 |  | 
 | 1403 | 		if (bh && bh->b_bdev == bdev && | 
 | 1404 | 				bh->b_blocknr == block && bh->b_size == size) { | 
 | 1405 | 			if (i) { | 
 | 1406 | 				while (i) { | 
 | 1407 | 					lru->bhs[i] = lru->bhs[i - 1]; | 
 | 1408 | 					i--; | 
 | 1409 | 				} | 
 | 1410 | 				lru->bhs[0] = bh; | 
 | 1411 | 			} | 
 | 1412 | 			get_bh(bh); | 
 | 1413 | 			ret = bh; | 
 | 1414 | 			break; | 
 | 1415 | 		} | 
 | 1416 | 	} | 
 | 1417 | 	bh_lru_unlock(); | 
 | 1418 | 	return ret; | 
 | 1419 | } | 
 | 1420 |  | 
 | 1421 | /* | 
 | 1422 |  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh | 
 | 1423 |  * it in the LRU and mark it as accessed.  If it is not present then return | 
 | 1424 |  * NULL. | 
 | 1425 |  */ | 
 | 1426 | struct buffer_head * | 
 | 1427 | __find_get_block(struct block_device *bdev, sector_t block, int size) | 
 | 1428 | { | 
 | 1429 | 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size); | 
 | 1430 |  | 
 | 1431 | 	if (bh == NULL) { | 
| Coywolf Qi Hunt | 385fd4c | 2005-11-07 00:59:39 -0800 | [diff] [blame] | 1432 | 		bh = __find_get_block_slow(bdev, block); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1433 | 		if (bh) | 
 | 1434 | 			bh_lru_install(bh); | 
 | 1435 | 	} | 
 | 1436 | 	if (bh) | 
 | 1437 | 		touch_buffer(bh); | 
 | 1438 | 	return bh; | 
 | 1439 | } | 
 | 1440 | EXPORT_SYMBOL(__find_get_block); | 
 | 1441 |  | 
 | 1442 | /* | 
 | 1443 |  * __getblk will locate (and, if necessary, create) the buffer_head | 
 | 1444 |  * which corresponds to the passed block_device, block and size. The | 
 | 1445 |  * returned buffer has its reference count incremented. | 
 | 1446 |  * | 
 | 1447 |  * __getblk() cannot fail - it just keeps trying.  If you pass it an | 
 | 1448 |  * illegal block number, __getblk() will happily return a buffer_head | 
 | 1449 |  * which represents the non-existent block.  Very weird. | 
 | 1450 |  * | 
 | 1451 |  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() | 
 | 1452 |  * attempt is failing.  FIXME, perhaps? | 
 | 1453 |  */ | 
 | 1454 | struct buffer_head * | 
 | 1455 | __getblk(struct block_device *bdev, sector_t block, int size) | 
 | 1456 | { | 
 | 1457 | 	struct buffer_head *bh = __find_get_block(bdev, block, size); | 
 | 1458 |  | 
 | 1459 | 	might_sleep(); | 
 | 1460 | 	if (bh == NULL) | 
 | 1461 | 		bh = __getblk_slow(bdev, block, size); | 
 | 1462 | 	return bh; | 
 | 1463 | } | 
 | 1464 | EXPORT_SYMBOL(__getblk); | 
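A hypothetical use of __getblk() for writing a block of metadata, shown as a hedged sketch rather than kernel code: since __getblk() cannot fail, the usual pattern is getblk, fill the block under lock_buffer(), mark it dirty, and release it. The function name and the assumption that 'data' holds a full block are illustrative only.

#include <linux/fs.h>
#include <linux/string.h>
#include <linux/buffer_head.h>

/*
 * Hypothetical sketch - not part of buffer.c.  'data' is assumed to be
 * sb->s_blocksize bytes of new contents for on-disk block 'block'.
 */
static int example_write_block(struct super_block *sb, sector_t block,
			       const void *data)
{
	struct buffer_head *bh;
	int ret;

	bh = __getblk(sb->s_bdev, block, sb->s_blocksize);
	lock_buffer(bh);
	memcpy(bh->b_data, data, sb->s_blocksize);
	set_buffer_uptodate(bh);	/* we just filled the whole block */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	/* Push it out now; otherwise normal writeback would pick it up. */
	ret = sync_dirty_buffer(bh);
	brelse(bh);
	return ret;
}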
 | 1465 |  | 
 | 1466 | /* | 
 | 1467 |  * Do async read-ahead on a buffer.. | 
 | 1468 |  */ | 
 | 1469 | void __breadahead(struct block_device *bdev, sector_t block, int size) | 
 | 1470 | { | 
 | 1471 | 	struct buffer_head *bh = __getblk(bdev, block, size); | 
| Andrew Morton | a3e713b | 2005-10-30 15:03:15 -0800 | [diff] [blame] | 1472 | 	if (likely(bh)) { | 
 | 1473 | 		ll_rw_block(READA, 1, &bh); | 
 | 1474 | 		brelse(bh); | 
 | 1475 | 	} | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1476 | } | 
 | 1477 | EXPORT_SYMBOL(__breadahead); | 
 | 1478 |  | 
 | 1479 | /** | 
 | 1480 |  *  __bread() - reads a specified block and returns the bh | 
| Martin Waitz | 67be2dd | 2005-05-01 08:59:26 -0700 | [diff] [blame] | 1481 |  *  @bdev: the block_device to read from | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1482 |  *  @block: number of block | 
 | 1483 |  *  @size: size (in bytes) to read | 
 | 1484 |  *  | 
 | 1485 |  *  Reads a specified block, and returns buffer head that contains it. | 
 | 1486 |  *  It returns NULL if the block was unreadable. | 
 | 1487 |  */ | 
 | 1488 | struct buffer_head * | 
 | 1489 | __bread(struct block_device *bdev, sector_t block, int size) | 
 | 1490 | { | 
 | 1491 | 	struct buffer_head *bh = __getblk(bdev, block, size); | 
 | 1492 |  | 
| Andrew Morton | a3e713b | 2005-10-30 15:03:15 -0800 | [diff] [blame] | 1493 | 	if (likely(bh) && !buffer_uptodate(bh)) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1494 | 		bh = __bread_slow(bh); | 
 | 1495 | 	return bh; | 
 | 1496 | } | 
 | 1497 | EXPORT_SYMBOL(__bread); | 
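The read side is usually reached through sb_bread(), the wrapper that passes sb->s_bdev and sb->s_blocksize to __bread(). Unlike __getblk(), __bread() can return NULL when the device could not be read, so the caller must check. The sketch below is a hypothetical read-modify-write; the function name and the offset assumption are illustrative.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Hypothetical sketch - not part of buffer.c.  Read one metadata block,
 * flip one byte in it, and leave it for writeback.
 */
static int example_toggle_flag(struct super_block *sb, sector_t block,
			       unsigned offset)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;		/* the block was unreadable */
	lock_buffer(bh);
	bh->b_data[offset] ^= 1;	/* assumes offset < sb->s_blocksize */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);			/* drop the ref taken by sb_bread() */
	return 0;
}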
 | 1498 |  | 
 | 1499 | /* | 
 | 1500 |  * invalidate_bh_lrus() is called rarely - but not only at unmount. | 
 | 1501 |  * This doesn't race because it runs on each cpu either in irq | 
 | 1502 |  * or with preempt disabled. | 
 | 1503 |  */ | 
 | 1504 | static void invalidate_bh_lru(void *arg) | 
 | 1505 | { | 
 | 1506 | 	struct bh_lru *b = &get_cpu_var(bh_lrus); | 
 | 1507 | 	int i; | 
 | 1508 |  | 
 | 1509 | 	for (i = 0; i < BH_LRU_SIZE; i++) { | 
 | 1510 | 		brelse(b->bhs[i]); | 
 | 1511 | 		b->bhs[i] = NULL; | 
 | 1512 | 	} | 
 | 1513 | 	put_cpu_var(bh_lrus); | 
 | 1514 | } | 
 | 1515 | 	 | 
 | 1516 | static void invalidate_bh_lrus(void) | 
 | 1517 | { | 
 | 1518 | 	on_each_cpu(invalidate_bh_lru, NULL, 1, 1); | 
 | 1519 | } | 
 | 1520 |  | 
 | 1521 | void set_bh_page(struct buffer_head *bh, | 
 | 1522 | 		struct page *page, unsigned long offset) | 
 | 1523 | { | 
 | 1524 | 	bh->b_page = page; | 
| Eric Sesterhenn | e827f92 | 2006-03-26 18:24:46 +0200 | [diff] [blame] | 1525 | 	BUG_ON(offset >= PAGE_SIZE); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1526 | 	if (PageHighMem(page)) | 
 | 1527 | 		/* | 
 | 1528 | 		 * This catches illegal uses and preserves the offset: | 
 | 1529 | 		 */ | 
 | 1530 | 		bh->b_data = (char *)(0 + offset); | 
 | 1531 | 	else | 
 | 1532 | 		bh->b_data = page_address(page) + offset; | 
 | 1533 | } | 
 | 1534 | EXPORT_SYMBOL(set_bh_page); | 
 | 1535 |  | 
 | 1536 | /* | 
 | 1537 |  * Called when truncating a buffer on a page completely. | 
 | 1538 |  */ | 
| Arjan van de Ven | 858119e | 2006-01-14 13:20:43 -0800 | [diff] [blame] | 1539 | static void discard_buffer(struct buffer_head * bh) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1540 | { | 
 | 1541 | 	lock_buffer(bh); | 
 | 1542 | 	clear_buffer_dirty(bh); | 
 | 1543 | 	bh->b_bdev = NULL; | 
 | 1544 | 	clear_buffer_mapped(bh); | 
 | 1545 | 	clear_buffer_req(bh); | 
 | 1546 | 	clear_buffer_new(bh); | 
 | 1547 | 	clear_buffer_delay(bh); | 
 | 1548 | 	unlock_buffer(bh); | 
 | 1549 | } | 
 | 1550 |  | 
 | 1551 | /** | 
 | 1552 |  * try_to_release_page() - release old fs-specific metadata on a page | 
 | 1553 |  * | 
 | 1554 |  * @page: the page which the kernel is trying to free | 
 | 1555 |  * @gfp_mask: memory allocation flags (and I/O mode) | 
 | 1556 |  * | 
 | 1557 |  * The address_space is to try to release any data against the page | 
 | 1558 |  * (presumably at page->private).  If the release was successful, return `1'. | 
 | 1559 |  * Otherwise return zero. | 
 | 1560 |  * | 
 | 1561 |  * The @gfp_mask argument specifies whether I/O may be performed to release | 
 | 1562 |  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). | 
 | 1563 |  * | 
 | 1564 |  * NOTE: @gfp_mask may go away, and this function may become non-blocking. | 
 | 1565 |  */ | 
| Al Viro | 27496a8 | 2005-10-21 03:20:48 -0400 | [diff] [blame] | 1566 | int try_to_release_page(struct page *page, gfp_t gfp_mask) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1567 | { | 
 | 1568 | 	struct address_space * const mapping = page->mapping; | 
 | 1569 |  | 
 | 1570 | 	BUG_ON(!PageLocked(page)); | 
 | 1571 | 	if (PageWriteback(page)) | 
 | 1572 | 		return 0; | 
 | 1573 | 	 | 
 | 1574 | 	if (mapping && mapping->a_ops->releasepage) | 
 | 1575 | 		return mapping->a_ops->releasepage(page, gfp_mask); | 
 | 1576 | 	return try_to_free_buffers(page); | 
 | 1577 | } | 
 | 1578 | EXPORT_SYMBOL(try_to_release_page); | 
 | 1579 |  | 
 | 1580 | /** | 
 | 1581 |  * block_invalidatepage - invalidate part or all of a buffer-backed page | 
 | 1582 |  * | 
 | 1583 |  * @page: the page which is affected | 
 | 1584 |  * @offset: the index of the truncation point | 
 | 1585 |  * | 
 | 1586 |  * block_invalidatepage() is called when all or part of the page has become | 
 | 1587 |  * invalidated by a truncate operation. | 
 | 1588 |  * | 
 | 1589 |  * block_invalidatepage() does not have to release all buffers, but it must | 
 | 1590 |  * ensure that no dirty buffer is left outside @offset and that no I/O | 
 | 1591 |  * is underway against any of the blocks which are outside the truncation | 
 | 1592 |  * point, because the caller is about to free (and possibly reuse) those | 
 | 1593 |  * blocks on-disk. | 
 | 1594 |  */ | 
| NeilBrown | 2ff28e2 | 2006-03-26 01:37:18 -0800 | [diff] [blame] | 1595 | void block_invalidatepage(struct page *page, unsigned long offset) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1596 | { | 
 | 1597 | 	struct buffer_head *head, *bh, *next; | 
 | 1598 | 	unsigned int curr_off = 0; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1599 |  | 
 | 1600 | 	BUG_ON(!PageLocked(page)); | 
 | 1601 | 	if (!page_has_buffers(page)) | 
 | 1602 | 		goto out; | 
 | 1603 |  | 
 | 1604 | 	head = page_buffers(page); | 
 | 1605 | 	bh = head; | 
 | 1606 | 	do { | 
 | 1607 | 		unsigned int next_off = curr_off + bh->b_size; | 
 | 1608 | 		next = bh->b_this_page; | 
 | 1609 |  | 
 | 1610 | 		/* | 
 | 1611 | 		 * is this block fully invalidated? | 
 | 1612 | 		 */ | 
 | 1613 | 		if (offset <= curr_off) | 
 | 1614 | 			discard_buffer(bh); | 
 | 1615 | 		curr_off = next_off; | 
 | 1616 | 		bh = next; | 
 | 1617 | 	} while (bh != head); | 
 | 1618 |  | 
 | 1619 | 	/* | 
 | 1620 | 	 * We release buffers only if the entire page is being invalidated. | 
 | 1621 | 	 * The get_block cached value has been unconditionally invalidated, | 
 | 1622 | 	 * so real IO is not possible anymore. | 
 | 1623 | 	 */ | 
 | 1624 | 	if (offset == 0) | 
| NeilBrown | 2ff28e2 | 2006-03-26 01:37:18 -0800 | [diff] [blame] | 1625 | 		try_to_release_page(page, 0); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1626 | out: | 
| NeilBrown | 2ff28e2 | 2006-03-26 01:37:18 -0800 | [diff] [blame] | 1627 | 	return; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1628 | } | 
 | 1629 | EXPORT_SYMBOL(block_invalidatepage); | 
 | 1630 |  | 
| NeilBrown | 2ff28e2 | 2006-03-26 01:37:18 -0800 | [diff] [blame] | 1631 | void do_invalidatepage(struct page *page, unsigned long offset) | 
| Jan Kara | aaa4059 | 2005-10-30 15:00:16 -0800 | [diff] [blame] | 1632 | { | 
| NeilBrown | 2ff28e2 | 2006-03-26 01:37:18 -0800 | [diff] [blame] | 1633 | 	void (*invalidatepage)(struct page *, unsigned long); | 
 | 1634 | 	invalidatepage = page->mapping->a_ops->invalidatepage ? : | 
 | 1635 | 		block_invalidatepage; | 
 | 1636 | 	(*invalidatepage)(page, offset); | 
| Jan Kara | aaa4059 | 2005-10-30 15:00:16 -0800 | [diff] [blame] | 1637 | } | 
 | 1638 |  | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1639 | /* | 
 | 1640 |  * We attach and possibly dirty the buffers atomically wrt | 
 | 1641 |  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers | 
 | 1642 |  * is already excluded via the page lock. | 
 | 1643 |  */ | 
 | 1644 | void create_empty_buffers(struct page *page, | 
 | 1645 | 			unsigned long blocksize, unsigned long b_state) | 
 | 1646 | { | 
 | 1647 | 	struct buffer_head *bh, *head, *tail; | 
 | 1648 |  | 
 | 1649 | 	head = alloc_page_buffers(page, blocksize, 1); | 
 | 1650 | 	bh = head; | 
 | 1651 | 	do { | 
 | 1652 | 		bh->b_state |= b_state; | 
 | 1653 | 		tail = bh; | 
 | 1654 | 		bh = bh->b_this_page; | 
 | 1655 | 	} while (bh); | 
 | 1656 | 	tail->b_this_page = head; | 
 | 1657 |  | 
 | 1658 | 	spin_lock(&page->mapping->private_lock); | 
 | 1659 | 	if (PageUptodate(page) || PageDirty(page)) { | 
 | 1660 | 		bh = head; | 
 | 1661 | 		do { | 
 | 1662 | 			if (PageDirty(page)) | 
 | 1663 | 				set_buffer_dirty(bh); | 
 | 1664 | 			if (PageUptodate(page)) | 
 | 1665 | 				set_buffer_uptodate(bh); | 
 | 1666 | 			bh = bh->b_this_page; | 
 | 1667 | 		} while (bh != head); | 
 | 1668 | 	} | 
 | 1669 | 	attach_page_buffers(page, head); | 
 | 1670 | 	spin_unlock(&page->mapping->private_lock); | 
 | 1671 | } | 
 | 1672 | EXPORT_SYMBOL(create_empty_buffers); | 
 | 1673 |  | 
 | 1674 | /* | 
 | 1675 |  * We are taking a block for data and we don't want any output from any | 
 | 1676 |  * buffer-cache aliases starting from the return of this function and | 
 | 1677 |  * until the moment when something explicitly marks the buffer | 
 | 1678 |  * dirty (hopefully that will not happen until we free that block ;-) | 
 | 1679 |  * We don't even need to mark it not-uptodate - nobody can expect | 
 | 1680 |  * anything from a newly allocated buffer anyway. We used to use | 
 | 1681 |  * unmap_buffer() for such invalidation, but that was wrong. We definitely | 
 | 1682 |  * don't want to mark the alias unmapped, for example - it would confuse | 
 | 1683 |  * anyone who might pick it with bread() afterwards... | 
 | 1684 |  * | 
 | 1685 |  * Also..  Note that bforget() doesn't lock the buffer.  So there can | 
 | 1686 |  * be writeout I/O going on against recently-freed buffers.  We don't | 
 | 1687 |  * wait on that I/O in bforget() - it's more efficient to wait on the I/O | 
 | 1688 |  * only if we really need to.  That happens here. | 
 | 1689 |  */ | 
 | 1690 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block) | 
 | 1691 | { | 
 | 1692 | 	struct buffer_head *old_bh; | 
 | 1693 |  | 
 | 1694 | 	might_sleep(); | 
 | 1695 |  | 
| Coywolf Qi Hunt | 385fd4c | 2005-11-07 00:59:39 -0800 | [diff] [blame] | 1696 | 	old_bh = __find_get_block_slow(bdev, block); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1697 | 	if (old_bh) { | 
 | 1698 | 		clear_buffer_dirty(old_bh); | 
 | 1699 | 		wait_on_buffer(old_bh); | 
 | 1700 | 		clear_buffer_req(old_bh); | 
 | 1701 | 		__brelse(old_bh); | 
 | 1702 | 	} | 
 | 1703 | } | 
 | 1704 | EXPORT_SYMBOL(unmap_underlying_metadata); | 
 | 1705 |  | 
 | 1706 | /* | 
 | 1707 |  * NOTE! All mapped/uptodate combinations are valid: | 
 | 1708 |  * | 
 | 1709 |  *	Mapped	Uptodate	Meaning | 
 | 1710 |  * | 
 | 1711 |  *	No	No		"unknown" - must do get_block() | 
 | 1712 |  *	No	Yes		"hole" - zero-filled | 
 | 1713 |  *	Yes	No		"allocated" - allocated on disk, not read in | 
 | 1714 |  *	Yes	Yes		"valid" - allocated and up-to-date in memory. | 
 | 1715 |  * | 
 | 1716 |  * "Dirty" is valid only with the last case (mapped+uptodate). | 
 | 1717 |  */ | 
 | 1718 |  | 
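The table above maps directly onto the buffer state bits; the illustrative helper below (not a kernel API, purely a reading aid) shows how a get_block()-based read path distinguishes the four cases: a hole is satisfied by zeroing in memory, an allocated-but-not-read block needs real I/O.

#include <linux/buffer_head.h>

/* Illustrative only: classify a buffer_head per the table above. */
static const char *example_bh_state(struct buffer_head *bh)
{
	if (!buffer_mapped(bh))
		return buffer_uptodate(bh) ?
			"hole - zero-filled" : "unknown - must do get_block()";
	return buffer_uptodate(bh) ?
		"valid - allocated and up-to-date" : "allocated - needs reading";
}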
 | 1719 | /* | 
 | 1720 |  * While block_write_full_page is writing back the dirty buffers under | 
 | 1721 |  * the page lock, whoever dirtied the buffers may decide to clean them | 
 | 1722 |  * again at any time.  We handle that by only looking at the buffer | 
 | 1723 |  * state inside lock_buffer(). | 
 | 1724 |  * | 
 | 1725 |  * If block_write_full_page() is called for regular writeback | 
 | 1726 |  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a | 
 | 1727 |  * locked buffer.   This only can happen if someone has written the buffer | 
 | 1728 |  * directly, with submit_bh().  At the address_space level PageWriteback | 
 | 1729 |  * prevents this contention from occurring. | 
 | 1730 |  */ | 
 | 1731 | static int __block_write_full_page(struct inode *inode, struct page *page, | 
 | 1732 | 			get_block_t *get_block, struct writeback_control *wbc) | 
 | 1733 | { | 
 | 1734 | 	int err; | 
 | 1735 | 	sector_t block; | 
 | 1736 | 	sector_t last_block; | 
| Andrew Morton | f0fbd5f | 2005-05-05 16:15:48 -0700 | [diff] [blame] | 1737 | 	struct buffer_head *bh, *head; | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 1738 | 	const unsigned blocksize = 1 << inode->i_blkbits; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1739 | 	int nr_underway = 0; | 
 | 1740 |  | 
 | 1741 | 	BUG_ON(!PageLocked(page)); | 
 | 1742 |  | 
 | 1743 | 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; | 
 | 1744 |  | 
 | 1745 | 	if (!page_has_buffers(page)) { | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 1746 | 		create_empty_buffers(page, blocksize, | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1747 | 					(1 << BH_Dirty)|(1 << BH_Uptodate)); | 
 | 1748 | 	} | 
 | 1749 |  | 
 | 1750 | 	/* | 
 | 1751 | 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers | 
 | 1752 | 	 * here, and the (potentially unmapped) buffers may become dirty at | 
 | 1753 | 	 * any time.  If a buffer becomes dirty here after we've inspected it | 
 | 1754 | 	 * then we just miss that fact, and the page stays dirty. | 
 | 1755 | 	 * | 
 | 1756 | 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; | 
 | 1757 | 	 * handle that here by just cleaning them. | 
 | 1758 | 	 */ | 
 | 1759 |  | 
| Andrew Morton | 54b21a7 | 2006-01-08 01:03:05 -0800 | [diff] [blame] | 1760 | 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1761 | 	head = page_buffers(page); | 
 | 1762 | 	bh = head; | 
 | 1763 |  | 
 | 1764 | 	/* | 
 | 1765 | 	 * Get all the dirty buffers mapped to disk addresses and | 
 | 1766 | 	 * handle any aliases from the underlying blockdev's mapping. | 
 | 1767 | 	 */ | 
 | 1768 | 	do { | 
 | 1769 | 		if (block > last_block) { | 
 | 1770 | 			/* | 
 | 1771 | 			 * mapped buffers outside i_size will occur, because | 
 | 1772 | 			 * this page can be outside i_size when there is a | 
 | 1773 | 			 * truncate in progress. | 
 | 1774 | 			 */ | 
 | 1775 | 			/* | 
 | 1776 | 			 * The buffer was zeroed by block_write_full_page() | 
 | 1777 | 			 */ | 
 | 1778 | 			clear_buffer_dirty(bh); | 
 | 1779 | 			set_buffer_uptodate(bh); | 
 | 1780 | 		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) { | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 1781 | 			WARN_ON(bh->b_size != blocksize); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1782 | 			err = get_block(inode, block, bh, 1); | 
 | 1783 | 			if (err) | 
 | 1784 | 				goto recover; | 
 | 1785 | 			if (buffer_new(bh)) { | 
 | 1786 | 				/* blockdev mappings never come here */ | 
 | 1787 | 				clear_buffer_new(bh); | 
 | 1788 | 				unmap_underlying_metadata(bh->b_bdev, | 
 | 1789 | 							bh->b_blocknr); | 
 | 1790 | 			} | 
 | 1791 | 		} | 
 | 1792 | 		bh = bh->b_this_page; | 
 | 1793 | 		block++; | 
 | 1794 | 	} while (bh != head); | 
 | 1795 |  | 
 | 1796 | 	do { | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1797 | 		if (!buffer_mapped(bh)) | 
 | 1798 | 			continue; | 
 | 1799 | 		/* | 
 | 1800 | 		 * If it's a fully non-blocking write attempt and we cannot | 
 | 1801 | 		 * lock the buffer then redirty the page.  Note that this can | 
 | 1802 | 		 * potentially cause a busy-wait loop from pdflush and kswapd | 
 | 1803 | 		 * activity, but those code paths have their own higher-level | 
 | 1804 | 		 * throttling. | 
 | 1805 | 		 */ | 
 | 1806 | 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | 
 | 1807 | 			lock_buffer(bh); | 
 | 1808 | 		} else if (test_set_buffer_locked(bh)) { | 
 | 1809 | 			redirty_page_for_writepage(wbc, page); | 
 | 1810 | 			continue; | 
 | 1811 | 		} | 
 | 1812 | 		if (test_clear_buffer_dirty(bh)) { | 
 | 1813 | 			mark_buffer_async_write(bh); | 
 | 1814 | 		} else { | 
 | 1815 | 			unlock_buffer(bh); | 
 | 1816 | 		} | 
 | 1817 | 	} while ((bh = bh->b_this_page) != head); | 
 | 1818 |  | 
 | 1819 | 	/* | 
 | 1820 | 	 * The page and its buffers are protected by PageWriteback(), so we can | 
 | 1821 | 	 * drop the bh refcounts early. | 
 | 1822 | 	 */ | 
 | 1823 | 	BUG_ON(PageWriteback(page)); | 
 | 1824 | 	set_page_writeback(page); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1825 |  | 
 | 1826 | 	do { | 
 | 1827 | 		struct buffer_head *next = bh->b_this_page; | 
 | 1828 | 		if (buffer_async_write(bh)) { | 
 | 1829 | 			submit_bh(WRITE, bh); | 
 | 1830 | 			nr_underway++; | 
 | 1831 | 		} | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1832 | 		bh = next; | 
 | 1833 | 	} while (bh != head); | 
| Andrew Morton | 05937ba | 2005-05-05 16:15:47 -0700 | [diff] [blame] | 1834 | 	unlock_page(page); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1835 |  | 
 | 1836 | 	err = 0; | 
 | 1837 | done: | 
 | 1838 | 	if (nr_underway == 0) { | 
 | 1839 | 		/* | 
 | 1840 | 		 * The page was marked dirty, but the buffers were | 
 | 1841 | 		 * clean.  Someone wrote them back by hand with | 
 | 1842 | 		 * ll_rw_block/submit_bh.  A rare case. | 
 | 1843 | 		 */ | 
 | 1844 | 		int uptodate = 1; | 
 | 1845 | 		do { | 
 | 1846 | 			if (!buffer_uptodate(bh)) { | 
 | 1847 | 				uptodate = 0; | 
 | 1848 | 				break; | 
 | 1849 | 			} | 
 | 1850 | 			bh = bh->b_this_page; | 
 | 1851 | 		} while (bh != head); | 
 | 1852 | 		if (uptodate) | 
 | 1853 | 			SetPageUptodate(page); | 
 | 1854 | 		end_page_writeback(page); | 
 | 1855 | 		/* | 
 | 1856 | 		 * The page and buffer_heads can be released at any time from | 
 | 1857 | 		 * here on. | 
 | 1858 | 		 */ | 
 | 1859 | 		wbc->pages_skipped++;	/* We didn't write this page */ | 
 | 1860 | 	} | 
 | 1861 | 	return err; | 
 | 1862 |  | 
 | 1863 | recover: | 
 | 1864 | 	/* | 
 | 1865 | 	 * ENOSPC, or some other error.  We may already have added some | 
 | 1866 | 	 * blocks to the file, so we need to write these out to avoid | 
 | 1867 | 	 * exposing stale data. | 
 | 1868 | 	 * The page is currently locked and not marked for writeback | 
 | 1869 | 	 */ | 
 | 1870 | 	bh = head; | 
 | 1871 | 	/* Recovery: lock and submit the mapped buffers */ | 
 | 1872 | 	do { | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1873 | 		if (buffer_mapped(bh) && buffer_dirty(bh)) { | 
 | 1874 | 			lock_buffer(bh); | 
 | 1875 | 			mark_buffer_async_write(bh); | 
 | 1876 | 		} else { | 
 | 1877 | 			/* | 
 | 1878 | 			 * The buffer may have been set dirty during | 
 | 1879 | 			 * attachment to a dirty page. | 
 | 1880 | 			 */ | 
 | 1881 | 			clear_buffer_dirty(bh); | 
 | 1882 | 		} | 
 | 1883 | 	} while ((bh = bh->b_this_page) != head); | 
 | 1884 | 	SetPageError(page); | 
 | 1885 | 	BUG_ON(PageWriteback(page)); | 
 | 1886 | 	set_page_writeback(page); | 
 | 1887 | 	unlock_page(page); | 
 | 1888 | 	do { | 
 | 1889 | 		struct buffer_head *next = bh->b_this_page; | 
 | 1890 | 		if (buffer_async_write(bh)) { | 
 | 1891 | 			clear_buffer_dirty(bh); | 
 | 1892 | 			submit_bh(WRITE, bh); | 
 | 1893 | 			nr_underway++; | 
 | 1894 | 		} | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1895 | 		bh = next; | 
 | 1896 | 	} while (bh != head); | 
 | 1897 | 	goto done; | 
 | 1898 | } | 
 | 1899 |  | 
 | 1900 | static int __block_prepare_write(struct inode *inode, struct page *page, | 
 | 1901 | 		unsigned from, unsigned to, get_block_t *get_block) | 
 | 1902 | { | 
 | 1903 | 	unsigned block_start, block_end; | 
 | 1904 | 	sector_t block; | 
 | 1905 | 	int err = 0; | 
 | 1906 | 	unsigned blocksize, bbits; | 
 | 1907 | 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; | 
 | 1908 |  | 
 | 1909 | 	BUG_ON(!PageLocked(page)); | 
 | 1910 | 	BUG_ON(from > PAGE_CACHE_SIZE); | 
 | 1911 | 	BUG_ON(to > PAGE_CACHE_SIZE); | 
 | 1912 | 	BUG_ON(from > to); | 
 | 1913 |  | 
 | 1914 | 	blocksize = 1 << inode->i_blkbits; | 
 | 1915 | 	if (!page_has_buffers(page)) | 
 | 1916 | 		create_empty_buffers(page, blocksize, 0); | 
 | 1917 | 	head = page_buffers(page); | 
 | 1918 |  | 
 | 1919 | 	bbits = inode->i_blkbits; | 
 | 1920 | 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); | 
 | 1921 |  | 
 | 1922 | 	for(bh = head, block_start = 0; bh != head || !block_start; | 
 | 1923 | 	    block++, block_start=block_end, bh = bh->b_this_page) { | 
 | 1924 | 		block_end = block_start + blocksize; | 
 | 1925 | 		if (block_end <= from || block_start >= to) { | 
 | 1926 | 			if (PageUptodate(page)) { | 
 | 1927 | 				if (!buffer_uptodate(bh)) | 
 | 1928 | 					set_buffer_uptodate(bh); | 
 | 1929 | 			} | 
 | 1930 | 			continue; | 
 | 1931 | 		} | 
 | 1932 | 		if (buffer_new(bh)) | 
 | 1933 | 			clear_buffer_new(bh); | 
 | 1934 | 		if (!buffer_mapped(bh)) { | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 1935 | 			WARN_ON(bh->b_size != blocksize); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1936 | 			err = get_block(inode, block, bh, 1); | 
 | 1937 | 			if (err) | 
| Nick Piggin | f3ddbdc | 2005-05-05 16:15:45 -0700 | [diff] [blame] | 1938 | 				break; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1939 | 			if (buffer_new(bh)) { | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1940 | 				unmap_underlying_metadata(bh->b_bdev, | 
 | 1941 | 							bh->b_blocknr); | 
 | 1942 | 				if (PageUptodate(page)) { | 
 | 1943 | 					set_buffer_uptodate(bh); | 
 | 1944 | 					continue; | 
 | 1945 | 				} | 
 | 1946 | 				if (block_end > to || block_start < from) { | 
 | 1947 | 					void *kaddr; | 
 | 1948 |  | 
 | 1949 | 					kaddr = kmap_atomic(page, KM_USER0); | 
 | 1950 | 					if (block_end > to) | 
 | 1951 | 						memset(kaddr+to, 0, | 
 | 1952 | 							block_end-to); | 
 | 1953 | 					if (block_start < from) | 
 | 1954 | 						memset(kaddr+block_start, | 
 | 1955 | 							0, from-block_start); | 
 | 1956 | 					flush_dcache_page(page); | 
 | 1957 | 					kunmap_atomic(kaddr, KM_USER0); | 
 | 1958 | 				} | 
 | 1959 | 				continue; | 
 | 1960 | 			} | 
 | 1961 | 		} | 
 | 1962 | 		if (PageUptodate(page)) { | 
 | 1963 | 			if (!buffer_uptodate(bh)) | 
 | 1964 | 				set_buffer_uptodate(bh); | 
 | 1965 | 			continue;  | 
 | 1966 | 		} | 
 | 1967 | 		if (!buffer_uptodate(bh) && !buffer_delay(bh) && | 
 | 1968 | 		     (block_start < from || block_end > to)) { | 
 | 1969 | 			ll_rw_block(READ, 1, &bh); | 
 | 1970 | 			*wait_bh++=bh; | 
 | 1971 | 		} | 
 | 1972 | 	} | 
 | 1973 | 	/* | 
 | 1974 | 	 * If we issued read requests - let them complete. | 
 | 1975 | 	 */ | 
 | 1976 | 	while(wait_bh > wait) { | 
 | 1977 | 		wait_on_buffer(*--wait_bh); | 
 | 1978 | 		if (!buffer_uptodate(*wait_bh)) | 
| Nick Piggin | f3ddbdc | 2005-05-05 16:15:45 -0700 | [diff] [blame] | 1979 | 			err = -EIO; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1980 | 	} | 
| Anton Altaparmakov | 152becd | 2005-06-23 00:10:21 -0700 | [diff] [blame] | 1981 | 	if (!err) { | 
 | 1982 | 		bh = head; | 
 | 1983 | 		do { | 
 | 1984 | 			if (buffer_new(bh)) | 
 | 1985 | 				clear_buffer_new(bh); | 
 | 1986 | 		} while ((bh = bh->b_this_page) != head); | 
 | 1987 | 		return 0; | 
 | 1988 | 	} | 
| Nick Piggin | f3ddbdc | 2005-05-05 16:15:45 -0700 | [diff] [blame] | 1989 | 	/* Error case: */ | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1990 | 	/* | 
 | 1991 | 	 * Zero out any newly allocated blocks to avoid exposing stale | 
 | 1992 | 	 * data.  If BH_New is set, we know that the block was newly | 
 | 1993 | 	 * allocated in the above loop. | 
 | 1994 | 	 */ | 
 | 1995 | 	bh = head; | 
 | 1996 | 	block_start = 0; | 
 | 1997 | 	do { | 
 | 1998 | 		block_end = block_start+blocksize; | 
 | 1999 | 		if (block_end <= from) | 
 | 2000 | 			goto next_bh; | 
 | 2001 | 		if (block_start >= to) | 
 | 2002 | 			break; | 
 | 2003 | 		if (buffer_new(bh)) { | 
 | 2004 | 			void *kaddr; | 
 | 2005 |  | 
 | 2006 | 			clear_buffer_new(bh); | 
 | 2007 | 			kaddr = kmap_atomic(page, KM_USER0); | 
 | 2008 | 			memset(kaddr+block_start, 0, bh->b_size); | 
 | 2009 | 			kunmap_atomic(kaddr, KM_USER0); | 
 | 2010 | 			set_buffer_uptodate(bh); | 
 | 2011 | 			mark_buffer_dirty(bh); | 
 | 2012 | 		} | 
 | 2013 | next_bh: | 
 | 2014 | 		block_start = block_end; | 
 | 2015 | 		bh = bh->b_this_page; | 
 | 2016 | 	} while (bh != head); | 
 | 2017 | 	return err; | 
 | 2018 | } | 
 | 2019 |  | 
 | 2020 | static int __block_commit_write(struct inode *inode, struct page *page, | 
 | 2021 | 		unsigned from, unsigned to) | 
 | 2022 | { | 
 | 2023 | 	unsigned block_start, block_end; | 
 | 2024 | 	int partial = 0; | 
 | 2025 | 	unsigned blocksize; | 
 | 2026 | 	struct buffer_head *bh, *head; | 
 | 2027 |  | 
 | 2028 | 	blocksize = 1 << inode->i_blkbits; | 
 | 2029 |  | 
 | 2030 | 	for(bh = head = page_buffers(page), block_start = 0; | 
 | 2031 | 	    bh != head || !block_start; | 
 | 2032 | 	    block_start=block_end, bh = bh->b_this_page) { | 
 | 2033 | 		block_end = block_start + blocksize; | 
 | 2034 | 		if (block_end <= from || block_start >= to) { | 
 | 2035 | 			if (!buffer_uptodate(bh)) | 
 | 2036 | 				partial = 1; | 
 | 2037 | 		} else { | 
 | 2038 | 			set_buffer_uptodate(bh); | 
 | 2039 | 			mark_buffer_dirty(bh); | 
 | 2040 | 		} | 
 | 2041 | 	} | 
 | 2042 |  | 
 | 2043 | 	/* | 
 | 2044 | 	 * If this is a partial write which happened to make all buffers | 
 | 2045 | 	 * uptodate then we can optimize away a bogus readpage() for | 
 | 2046 | 	 * the next read(). Here we 'discover' whether the page went | 
 | 2047 | 	 * uptodate as a result of this (potentially partial) write. | 
 | 2048 | 	 */ | 
 | 2049 | 	if (!partial) | 
 | 2050 | 		SetPageUptodate(page); | 
 | 2051 | 	return 0; | 
 | 2052 | } | 
 | 2053 |  | 
 | 2054 | /* | 
 | 2055 |  * Generic "read page" function for block devices that have the normal | 
 | 2056 |  * get_block functionality. This is most of the block device filesystems. | 
 | 2057 |  * Reads the page asynchronously --- the unlock_buffer() and | 
 | 2058 |  * set/clear_buffer_uptodate() functions propagate buffer state into the | 
 | 2059 |  * page struct once IO has completed. | 
 | 2060 |  */ | 
 | 2061 | int block_read_full_page(struct page *page, get_block_t *get_block) | 
 | 2062 | { | 
 | 2063 | 	struct inode *inode = page->mapping->host; | 
 | 2064 | 	sector_t iblock, lblock; | 
 | 2065 | 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; | 
 | 2066 | 	unsigned int blocksize; | 
 | 2067 | 	int nr, i; | 
 | 2068 | 	int fully_mapped = 1; | 
 | 2069 |  | 
| Matt Mackall | cd7619d | 2005-05-01 08:59:01 -0700 | [diff] [blame] | 2070 | 	BUG_ON(!PageLocked(page)); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2071 | 	blocksize = 1 << inode->i_blkbits; | 
 | 2072 | 	if (!page_has_buffers(page)) | 
 | 2073 | 		create_empty_buffers(page, blocksize, 0); | 
 | 2074 | 	head = page_buffers(page); | 
 | 2075 |  | 
 | 2076 | 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 
 | 2077 | 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; | 
 | 2078 | 	bh = head; | 
 | 2079 | 	nr = 0; | 
 | 2080 | 	i = 0; | 
 | 2081 |  | 
 | 2082 | 	do { | 
 | 2083 | 		if (buffer_uptodate(bh)) | 
 | 2084 | 			continue; | 
 | 2085 |  | 
 | 2086 | 		if (!buffer_mapped(bh)) { | 
| Andrew Morton | c64610b | 2005-05-16 21:53:49 -0700 | [diff] [blame] | 2087 | 			int err = 0; | 
 | 2088 |  | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2089 | 			fully_mapped = 0; | 
 | 2090 | 			if (iblock < lblock) { | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 2091 | 				WARN_ON(bh->b_size != blocksize); | 
| Andrew Morton | c64610b | 2005-05-16 21:53:49 -0700 | [diff] [blame] | 2092 | 				err = get_block(inode, iblock, bh, 0); | 
 | 2093 | 				if (err) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2094 | 					SetPageError(page); | 
 | 2095 | 			} | 
 | 2096 | 			if (!buffer_mapped(bh)) { | 
 | 2097 | 				void *kaddr = kmap_atomic(page, KM_USER0); | 
 | 2098 | 				memset(kaddr + i * blocksize, 0, blocksize); | 
 | 2099 | 				flush_dcache_page(page); | 
 | 2100 | 				kunmap_atomic(kaddr, KM_USER0); | 
| Andrew Morton | c64610b | 2005-05-16 21:53:49 -0700 | [diff] [blame] | 2101 | 				if (!err) | 
 | 2102 | 					set_buffer_uptodate(bh); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2103 | 				continue; | 
 | 2104 | 			} | 
 | 2105 | 			/* | 
 | 2106 | 			 * get_block() might have updated the buffer | 
 | 2107 | 			 * synchronously | 
 | 2108 | 			 */ | 
 | 2109 | 			if (buffer_uptodate(bh)) | 
 | 2110 | 				continue; | 
 | 2111 | 		} | 
 | 2112 | 		arr[nr++] = bh; | 
 | 2113 | 	} while (i++, iblock++, (bh = bh->b_this_page) != head); | 
 | 2114 |  | 
 | 2115 | 	if (fully_mapped) | 
 | 2116 | 		SetPageMappedToDisk(page); | 
 | 2117 |  | 
 | 2118 | 	if (!nr) { | 
 | 2119 | 		/* | 
 | 2120 | 		 * All buffers are uptodate - we can set the page uptodate | 
 | 2121 | 		 * as well. But not if get_block() returned an error. | 
 | 2122 | 		 */ | 
 | 2123 | 		if (!PageError(page)) | 
 | 2124 | 			SetPageUptodate(page); | 
 | 2125 | 		unlock_page(page); | 
 | 2126 | 		return 0; | 
 | 2127 | 	} | 
 | 2128 |  | 
 | 2129 | 	/* Stage two: lock the buffers */ | 
 | 2130 | 	for (i = 0; i < nr; i++) { | 
 | 2131 | 		bh = arr[i]; | 
 | 2132 | 		lock_buffer(bh); | 
 | 2133 | 		mark_buffer_async_read(bh); | 
 | 2134 | 	} | 
 | 2135 |  | 
 | 2136 | 	/* | 
 | 2137 | 	 * Stage 3: start the IO.  Check for uptodateness | 
 | 2138 | 	 * inside the buffer lock in case another process reading | 
 | 2139 | 	 * the underlying blockdev brought it uptodate (the sct fix). | 
 | 2140 | 	 */ | 
 | 2141 | 	for (i = 0; i < nr; i++) { | 
 | 2142 | 		bh = arr[i]; | 
 | 2143 | 		if (buffer_uptodate(bh)) | 
 | 2144 | 			end_buffer_async_read(bh, 1); | 
 | 2145 | 		else | 
 | 2146 | 			submit_bh(READ, bh); | 
 | 2147 | 	} | 
 | 2148 | 	return 0; | 
 | 2149 | } | 
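A filesystem normally reaches block_read_full_page() through its readpage address_space operation, supplying its own get_block callback. The fragment below is a hedged sketch for a hypothetical "examplefs" whose files are assumed to be laid out contiguously; real filesystems look the mapping up in their metadata and may allocate when 'create' is set.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical: map file block 'iblock' onto a disk block. */
static int examplefs_get_block(struct inode *inode, sector_t iblock,
			       struct buffer_head *bh_result, int create)
{
	sector_t start = 0;	/* assumption: kept in examplefs's inode info */

	map_bh(bh_result, inode->i_sb, start + iblock);
	return 0;
}

static int examplefs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, examplefs_get_block);
}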
 | 2150 |  | 
 | 2151 | /* utility function for filesystems that need to do work on expanding | 
 | 2152 |  * truncates.  Uses prepare/commit_write to allow the filesystem to | 
 | 2153 |  * deal with the hole.   | 
 | 2154 |  */ | 
| OGAWA Hirofumi | 05eb0b5 | 2006-01-08 01:02:13 -0800 | [diff] [blame] | 2155 | static int __generic_cont_expand(struct inode *inode, loff_t size, | 
 | 2156 | 				 pgoff_t index, unsigned int offset) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2157 | { | 
 | 2158 | 	struct address_space *mapping = inode->i_mapping; | 
 | 2159 | 	struct page *page; | 
| OGAWA Hirofumi | 05eb0b5 | 2006-01-08 01:02:13 -0800 | [diff] [blame] | 2160 | 	unsigned long limit; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2161 | 	int err; | 
 | 2162 |  | 
 | 2163 | 	err = -EFBIG; | 
 | 2164 | 	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 
 | 2165 | 	if (limit != RLIM_INFINITY && size > (loff_t)limit) { | 
 | 2166 | 		send_sig(SIGXFSZ, current, 0); | 
 | 2167 | 		goto out; | 
 | 2168 | 	} | 
 | 2169 | 	if (size > inode->i_sb->s_maxbytes) | 
 | 2170 | 		goto out; | 
 | 2171 |  | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2172 | 	err = -ENOMEM; | 
 | 2173 | 	page = grab_cache_page(mapping, index); | 
 | 2174 | 	if (!page) | 
 | 2175 | 		goto out; | 
 | 2176 | 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset); | 
| OGAWA Hirofumi | 05eb0b5 | 2006-01-08 01:02:13 -0800 | [diff] [blame] | 2177 | 	if (err) { | 
 | 2178 | 		/* | 
 | 2179 | 		 * ->prepare_write() may have instantiated a few blocks | 
 | 2180 | 		 * outside i_size.  Trim these off again. | 
 | 2181 | 		 */ | 
 | 2182 | 		unlock_page(page); | 
 | 2183 | 		page_cache_release(page); | 
 | 2184 | 		vmtruncate(inode, inode->i_size); | 
 | 2185 | 		goto out; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2186 | 	} | 
| OGAWA Hirofumi | 05eb0b5 | 2006-01-08 01:02:13 -0800 | [diff] [blame] | 2187 |  | 
 | 2188 | 	err = mapping->a_ops->commit_write(NULL, page, offset, offset); | 
 | 2189 |  | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2190 | 	unlock_page(page); | 
 | 2191 | 	page_cache_release(page); | 
 | 2192 | 	if (err > 0) | 
 | 2193 | 		err = 0; | 
 | 2194 | out: | 
 | 2195 | 	return err; | 
 | 2196 | } | 
 | 2197 |  | 
| OGAWA Hirofumi | 05eb0b5 | 2006-01-08 01:02:13 -0800 | [diff] [blame] | 2198 | int generic_cont_expand(struct inode *inode, loff_t size) | 
 | 2199 | { | 
 | 2200 | 	pgoff_t index; | 
 | 2201 | 	unsigned int offset; | 
 | 2202 |  | 
 | 2203 | 	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ | 
 | 2204 |  | 
 | 2205 | 	/* ugh.  in prepare/commit_write, if from==to==start of block, we | 
 | 2206 | 	 * skip the prepare.  make sure we never send an offset for the start | 
 | 2207 | 	 * of a block | 
 | 2208 | 	 */ | 
 | 2209 | 	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { | 
 | 2210 | 		/* caller must handle this extra byte. */ | 
 | 2211 | 		offset++; | 
 | 2212 | 	} | 
 | 2213 | 	index = size >> PAGE_CACHE_SHIFT; | 
 | 2214 |  | 
 | 2215 | 	return __generic_cont_expand(inode, size, index, offset); | 
 | 2216 | } | 
 | 2217 |  | 
 | 2218 | int generic_cont_expand_simple(struct inode *inode, loff_t size) | 
 | 2219 | { | 
 | 2220 | 	loff_t pos = size - 1; | 
 | 2221 | 	pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 
 | 2222 | 	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; | 
 | 2223 |  | 
 | 2224 | 	/* prepare/commit_write can handle even if from==to==start of block. */ | 
 | 2225 | 	return __generic_cont_expand(inode, size, index, offset); | 
 | 2226 | } | 
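Filesystems typically call one of these expand helpers from their setattr path when a truncate grows the file, so the newly exposed tail is zeroed through prepare/commit_write instead of being read back as stale disk contents later. A hypothetical fragment, with the function name assumed for illustration:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical sketch: grow an inode to 'new_size' before updating i_size. */
static int examplefs_expand(struct inode *inode, loff_t new_size)
{
	if (new_size <= i_size_read(inode))
		return 0;			/* nothing to extend */
	return generic_cont_expand_simple(inode, new_size);
}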
 | 2227 |  | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2228 | /* | 
 | 2229 |  * For moronic filesystems that do not allow holes in files. | 
 | 2230 |  * We may have to extend the file. | 
 | 2231 |  */ | 
 | 2232 |  | 
 | 2233 | int cont_prepare_write(struct page *page, unsigned offset, | 
 | 2234 | 		unsigned to, get_block_t *get_block, loff_t *bytes) | 
 | 2235 | { | 
 | 2236 | 	struct address_space *mapping = page->mapping; | 
 | 2237 | 	struct inode *inode = mapping->host; | 
 | 2238 | 	struct page *new_page; | 
 | 2239 | 	pgoff_t pgpos; | 
 | 2240 | 	long status; | 
 | 2241 | 	unsigned zerofrom; | 
 | 2242 | 	unsigned blocksize = 1 << inode->i_blkbits; | 
 | 2243 | 	void *kaddr; | 
 | 2244 |  | 
 | 2245 | 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { | 
 | 2246 | 		status = -ENOMEM; | 
 | 2247 | 		new_page = grab_cache_page(mapping, pgpos); | 
 | 2248 | 		if (!new_page) | 
 | 2249 | 			goto out; | 
 | 2250 | 		/* we might sleep */ | 
 | 2251 | 		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { | 
 | 2252 | 			unlock_page(new_page); | 
 | 2253 | 			page_cache_release(new_page); | 
 | 2254 | 			continue; | 
 | 2255 | 		} | 
 | 2256 | 		zerofrom = *bytes & ~PAGE_CACHE_MASK; | 
 | 2257 | 		if (zerofrom & (blocksize-1)) { | 
 | 2258 | 			*bytes |= (blocksize-1); | 
 | 2259 | 			(*bytes)++; | 
 | 2260 | 		} | 
 | 2261 | 		status = __block_prepare_write(inode, new_page, zerofrom, | 
 | 2262 | 						PAGE_CACHE_SIZE, get_block); | 
 | 2263 | 		if (status) | 
 | 2264 | 			goto out_unmap; | 
 | 2265 | 		kaddr = kmap_atomic(new_page, KM_USER0); | 
 | 2266 | 		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); | 
 | 2267 | 		flush_dcache_page(new_page); | 
 | 2268 | 		kunmap_atomic(kaddr, KM_USER0); | 
 | 2269 | 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); | 
 | 2270 | 		unlock_page(new_page); | 
 | 2271 | 		page_cache_release(new_page); | 
 | 2272 | 	} | 
 | 2273 |  | 
 | 2274 | 	if (page->index < pgpos) { | 
 | 2275 | 		/* completely inside the area */ | 
 | 2276 | 		zerofrom = offset; | 
 | 2277 | 	} else { | 
 | 2278 | 		/* page covers the boundary, find the boundary offset */ | 
 | 2279 | 		zerofrom = *bytes & ~PAGE_CACHE_MASK; | 
 | 2280 |  | 
 | 2281 | 		/* if we expand the file, the last block will be filled */ | 
 | 2282 | 		if (to > zerofrom && (zerofrom & (blocksize-1))) { | 
 | 2283 | 			*bytes |= (blocksize-1); | 
 | 2284 | 			(*bytes)++; | 
 | 2285 | 		} | 
 | 2286 |  | 
 | 2287 | 		/* starting below the boundary? Nothing to zero out */ | 
 | 2288 | 		if (offset <= zerofrom) | 
 | 2289 | 			zerofrom = offset; | 
 | 2290 | 	} | 
 | 2291 | 	status = __block_prepare_write(inode, page, zerofrom, to, get_block); | 
 | 2292 | 	if (status) | 
 | 2293 | 		goto out1; | 
 | 2294 | 	if (zerofrom < offset) { | 
 | 2295 | 		kaddr = kmap_atomic(page, KM_USER0); | 
 | 2296 | 		memset(kaddr+zerofrom, 0, offset-zerofrom); | 
 | 2297 | 		flush_dcache_page(page); | 
 | 2298 | 		kunmap_atomic(kaddr, KM_USER0); | 
 | 2299 | 		__block_commit_write(inode, page, zerofrom, offset); | 
 | 2300 | 	} | 
 | 2301 | 	return 0; | 
 | 2302 | out1: | 
 | 2303 | 	ClearPageUptodate(page); | 
 | 2304 | 	return status; | 
 | 2305 |  | 
 | 2306 | out_unmap: | 
 | 2307 | 	ClearPageUptodate(new_page); | 
 | 2308 | 	unlock_page(new_page); | 
 | 2309 | 	page_cache_release(new_page); | 
 | 2310 | out: | 
 | 2311 | 	return status; | 
 | 2312 | } | 
 | 2313 |  | 
 | 2314 | int block_prepare_write(struct page *page, unsigned from, unsigned to, | 
 | 2315 | 			get_block_t *get_block) | 
 | 2316 | { | 
 | 2317 | 	struct inode *inode = page->mapping->host; | 
 | 2318 | 	int err = __block_prepare_write(inode, page, from, to, get_block); | 
 | 2319 | 	if (err) | 
 | 2320 | 		ClearPageUptodate(page); | 
 | 2321 | 	return err; | 
 | 2322 | } | 
 | 2323 |  | 
 | 2324 | int block_commit_write(struct page *page, unsigned from, unsigned to) | 
 | 2325 | { | 
 | 2326 | 	struct inode *inode = page->mapping->host; | 
 | 2327 | 	__block_commit_write(inode,page,from,to); | 
 | 2328 | 	return 0; | 
 | 2329 | } | 
 | 2330 |  | 
 | 2331 | int generic_commit_write(struct file *file, struct page *page, | 
 | 2332 | 		unsigned from, unsigned to) | 
 | 2333 | { | 
 | 2334 | 	struct inode *inode = page->mapping->host; | 
 | 2335 | 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; | 
 | 2336 | 	__block_commit_write(inode,page,from,to); | 
 | 2337 | 	/* | 
 | 2338 | 	 * No need to use i_size_read() here, the i_size | 
| Jes Sorensen | 1b1dcc1 | 2006-01-09 15:59:24 -0800 | [diff] [blame] | 2339 | 	 * cannot change under us because we hold i_mutex. | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2340 | 	 */ | 
 | 2341 | 	if (pos > inode->i_size) { | 
 | 2342 | 		i_size_write(inode, pos); | 
 | 2343 | 		mark_inode_dirty(inode); | 
 | 2344 | 	} | 
 | 2345 | 	return 0; | 
 | 2346 | } | 
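In the common case a block-based filesystem simply wires these helpers into its address_space_operations. The sketch below reuses the hypothetical examplefs_get_block() and examplefs_readpage() from the readpage sketch earlier; all examplefs_* names are assumptions, not kernel symbols.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>

/* examplefs_get_block() and examplefs_readpage() assumed from the sketch above */

static int examplefs_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, examplefs_get_block);
}

static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, examplefs_get_block, wbc);
}

static struct address_space_operations examplefs_aops = {
	.readpage	= examplefs_readpage,
	.writepage	= examplefs_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= examplefs_prepare_write,
	.commit_write	= generic_commit_write,
};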
 | 2347 |  | 
 | 2348 |  | 
 | 2349 | /* | 
 | 2350 |  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed | 
 | 2351 |  * immediately, while under the page lock.  So it needs a special end_io | 
 | 2352 |  * handler which does not touch the bh after unlocking it. | 
 | 2353 |  * | 
 | 2354 |  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but | 
 | 2355 |  * a race there is benign: unlock_buffer() only uses the bh's address for | 
 | 2356 |  * hashing after unlocking the buffer, so it doesn't actually touch the bh | 
 | 2357 |  * itself. | 
 | 2358 |  */ | 
 | 2359 | static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) | 
 | 2360 | { | 
 | 2361 | 	if (uptodate) { | 
 | 2362 | 		set_buffer_uptodate(bh); | 
 | 2363 | 	} else { | 
 | 2364 | 		/* This happens due to failed READA attempts. */ | 
 | 2365 | 		clear_buffer_uptodate(bh); | 
 | 2366 | 	} | 
 | 2367 | 	unlock_buffer(bh); | 
 | 2368 | } | 
 | 2369 |  | 
 | 2370 | /* | 
 | 2371 |  * On entry, the page is fully not uptodate. | 
 | 2372 |  * On exit the page is fully uptodate in the areas outside (from,to) | 
 | 2373 |  */ | 
 | 2374 | int nobh_prepare_write(struct page *page, unsigned from, unsigned to, | 
 | 2375 | 			get_block_t *get_block) | 
 | 2376 | { | 
 | 2377 | 	struct inode *inode = page->mapping->host; | 
 | 2378 | 	const unsigned blkbits = inode->i_blkbits; | 
 | 2379 | 	const unsigned blocksize = 1 << blkbits; | 
 | 2380 | 	struct buffer_head map_bh; | 
 | 2381 | 	struct buffer_head *read_bh[MAX_BUF_PER_PAGE]; | 
 | 2382 | 	unsigned block_in_page; | 
 | 2383 | 	unsigned block_start; | 
 | 2384 | 	sector_t block_in_file; | 
 | 2385 | 	char *kaddr; | 
 | 2386 | 	int nr_reads = 0; | 
 | 2387 | 	int i; | 
 | 2388 | 	int ret = 0; | 
 | 2389 | 	int is_mapped_to_disk = 1; | 
 | 2390 | 	int dirtied_it = 0; | 
 | 2391 |  | 
 | 2392 | 	if (PageMappedToDisk(page)) | 
 | 2393 | 		return 0; | 
 | 2394 |  | 
 | 2395 | 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); | 
 | 2396 | 	map_bh.b_page = page; | 
 | 2397 |  | 
 | 2398 | 	/* | 
 | 2399 | 	 * We loop across all blocks in the page, whether or not they are | 
 | 2400 | 	 * part of the affected region.  This is so we can discover if the | 
 | 2401 | 	 * page is fully mapped-to-disk. | 
 | 2402 | 	 */ | 
 | 2403 | 	for (block_start = 0, block_in_page = 0; | 
 | 2404 | 		  block_start < PAGE_CACHE_SIZE; | 
 | 2405 | 		  block_in_page++, block_start += blocksize) { | 
 | 2406 | 		unsigned block_end = block_start + blocksize; | 
 | 2407 | 		int create; | 
 | 2408 |  | 
 | 2409 | 		map_bh.b_state = 0; | 
 | 2410 | 		create = 1; | 
 | 2411 | 		if (block_start >= to) | 
 | 2412 | 			create = 0; | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 2413 | 		map_bh.b_size = blocksize; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2414 | 		ret = get_block(inode, block_in_file + block_in_page, | 
 | 2415 | 					&map_bh, create); | 
 | 2416 | 		if (ret) | 
 | 2417 | 			goto failed; | 
 | 2418 | 		if (!buffer_mapped(&map_bh)) | 
 | 2419 | 			is_mapped_to_disk = 0; | 
 | 2420 | 		if (buffer_new(&map_bh)) | 
 | 2421 | 			unmap_underlying_metadata(map_bh.b_bdev, | 
 | 2422 | 							map_bh.b_blocknr); | 
 | 2423 | 		if (PageUptodate(page)) | 
 | 2424 | 			continue; | 
 | 2425 | 		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) { | 
 | 2426 | 			kaddr = kmap_atomic(page, KM_USER0); | 
 | 2427 | 			if (block_start < from) { | 
 | 2428 | 				memset(kaddr+block_start, 0, from-block_start); | 
 | 2429 | 				dirtied_it = 1; | 
 | 2430 | 			} | 
 | 2431 | 			if (block_end > to) { | 
 | 2432 | 				memset(kaddr + to, 0, block_end - to); | 
 | 2433 | 				dirtied_it = 1; | 
 | 2434 | 			} | 
 | 2435 | 			flush_dcache_page(page); | 
 | 2436 | 			kunmap_atomic(kaddr, KM_USER0); | 
 | 2437 | 			continue; | 
 | 2438 | 		} | 
 | 2439 | 		if (buffer_uptodate(&map_bh)) | 
 | 2440 | 			continue;	/* reiserfs does this */ | 
 | 2441 | 		if (block_start < from || block_end > to) { | 
 | 2442 | 			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS); | 
 | 2443 |  | 
 | 2444 | 			if (!bh) { | 
 | 2445 | 				ret = -ENOMEM; | 
 | 2446 | 				goto failed; | 
 | 2447 | 			} | 
 | 2448 | 			bh->b_state = map_bh.b_state; | 
 | 2449 | 			atomic_set(&bh->b_count, 0); | 
 | 2450 | 			bh->b_this_page = NULL; | 
 | 2451 | 			bh->b_page = page; | 
 | 2452 | 			bh->b_blocknr = map_bh.b_blocknr; | 
 | 2453 | 			bh->b_size = blocksize; | 
 | 2454 | 			bh->b_data = (char *)(long)block_start; | 
 | 2455 | 			bh->b_bdev = map_bh.b_bdev; | 
 | 2456 | 			bh->b_private = NULL; | 
 | 2457 | 			read_bh[nr_reads++] = bh; | 
 | 2458 | 		} | 
 | 2459 | 	} | 
 | 2460 |  | 
 | 2461 | 	if (nr_reads) { | 
 | 2462 | 		struct buffer_head *bh; | 
 | 2463 |  | 
 | 2464 | 		/* | 
 | 2465 | 		 * The page is locked, so these buffers are protected from | 
 | 2466 | 		 * any VM or truncate activity.  Hence we don't need to care | 
 | 2467 | 		 * for the buffer_head refcounts. | 
 | 2468 | 		 */ | 
 | 2469 | 		for (i = 0; i < nr_reads; i++) { | 
 | 2470 | 			bh = read_bh[i]; | 
 | 2471 | 			lock_buffer(bh); | 
 | 2472 | 			bh->b_end_io = end_buffer_read_nobh; | 
 | 2473 | 			submit_bh(READ, bh); | 
 | 2474 | 		} | 
 | 2475 | 		for (i = 0; i < nr_reads; i++) { | 
 | 2476 | 			bh = read_bh[i]; | 
 | 2477 | 			wait_on_buffer(bh); | 
 | 2478 | 			if (!buffer_uptodate(bh)) | 
 | 2479 | 				ret = -EIO; | 
 | 2480 | 			free_buffer_head(bh); | 
 | 2481 | 			read_bh[i] = NULL; | 
 | 2482 | 		} | 
 | 2483 | 		if (ret) | 
 | 2484 | 			goto failed; | 
 | 2485 | 	} | 
 | 2486 |  | 
 | 2487 | 	if (is_mapped_to_disk) | 
 | 2488 | 		SetPageMappedToDisk(page); | 
 | 2489 | 	SetPageUptodate(page); | 
 | 2490 |  | 
 | 2491 | 	/* | 
 | 2492 | 	 * Setting the page dirty here isn't necessary for the prepare_write | 
 | 2493 | 	 * function - commit_write will do that.  But if/when this function is | 
 | 2494 | 	 * used within the pagefault handler to ensure that all mmapped pages | 
 | 2495 | 	 * have backing space in the filesystem, we will need to dirty the page | 
 | 2496 | 	 * if its contents were altered. | 
 | 2497 | 	 */ | 
 | 2498 | 	if (dirtied_it) | 
 | 2499 | 		set_page_dirty(page); | 
 | 2500 |  | 
 | 2501 | 	return 0; | 
 | 2502 |  | 
 | 2503 | failed: | 
 | 2504 | 	for (i = 0; i < nr_reads; i++) { | 
 | 2505 | 		if (read_bh[i]) | 
 | 2506 | 			free_buffer_head(read_bh[i]); | 
 | 2507 | 	} | 
 | 2508 |  | 
 | 2509 | 	/* | 
 | 2510 | 	 * Error recovery is pretty slack.  Clear the page and mark it dirty | 
 | 2511 | 	 * so we'll later zero out any blocks which _were_ allocated. | 
 | 2512 | 	 */ | 
 | 2513 | 	kaddr = kmap_atomic(page, KM_USER0); | 
 | 2514 | 	memset(kaddr, 0, PAGE_CACHE_SIZE); | 
 | 2515 | 	kunmap_atomic(kaddr, KM_USER0); | 
 | 2516 | 	SetPageUptodate(page); | 
 | 2517 | 	set_page_dirty(page); | 
 | 2518 | 	return ret; | 
 | 2519 | } | 
 | 2520 | EXPORT_SYMBOL(nobh_prepare_write); | 
 | 2521 |  | 
 | 2522 | int nobh_commit_write(struct file *file, struct page *page, | 
 | 2523 | 		unsigned from, unsigned to) | 
 | 2524 | { | 
 | 2525 | 	struct inode *inode = page->mapping->host; | 
 | 2526 | 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; | 
 | 2527 |  | 
 | 2528 | 	set_page_dirty(page); | 
 | 2529 | 	if (pos > inode->i_size) { | 
 | 2530 | 		i_size_write(inode, pos); | 
 | 2531 | 		mark_inode_dirty(inode); | 
 | 2532 | 	} | 
 | 2533 | 	return 0; | 
 | 2534 | } | 
 | 2535 | EXPORT_SYMBOL(nobh_commit_write); | 
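Wiring the nobh variants into a filesystem has the same shape as the buffered helpers above, with the nobh functions substituted; again, the examplefs_* names are assumptions carried over from the earlier sketches.

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* examplefs_get_block() assumed as in the earlier sketches */

static int examplefs_nobh_prepare_write(struct file *file, struct page *page,
					unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, examplefs_get_block);
}

/*
 * nobh_commit_write() already matches the ->commit_write signature and can
 * be plugged into the address_space_operations directly.
 */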
 | 2536 |  | 
 | 2537 | /* | 
 | 2538 |  * nobh_writepage() - based on block_write_full_page() except | 
 | 2539 |  * that it tries to operate without attaching bufferheads to | 
 | 2540 |  * the page. | 
 | 2541 |  */ | 
 | 2542 | int nobh_writepage(struct page *page, get_block_t *get_block, | 
 | 2543 | 			struct writeback_control *wbc) | 
 | 2544 | { | 
 | 2545 | 	struct inode * const inode = page->mapping->host; | 
 | 2546 | 	loff_t i_size = i_size_read(inode); | 
 | 2547 | 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 
 | 2548 | 	unsigned offset; | 
 | 2549 | 	void *kaddr; | 
 | 2550 | 	int ret; | 
 | 2551 |  | 
 | 2552 | 	/* Is the page fully inside i_size? */ | 
 | 2553 | 	if (page->index < end_index) | 
 | 2554 | 		goto out; | 
 | 2555 |  | 
 | 2556 | 	/* Is the page fully outside i_size? (truncate in progress) */ | 
 | 2557 | 	offset = i_size & (PAGE_CACHE_SIZE-1); | 
 | 2558 | 	if (page->index >= end_index+1 || !offset) { | 
 | 2559 | 		/* | 
 | 2560 | 		 * The page may have dirty, unmapped buffers.  For example, | 
 | 2561 | 		 * they may have been added in ext3_writepage().  Make them | 
 | 2562 | 		 * freeable here, so the page does not leak. | 
 | 2563 | 		 */ | 
 | 2564 | #if 0 | 
 | 2565 | 		/* Not really sure about this - do we need this? */ | 
 | 2566 | 		if (page->mapping->a_ops->invalidatepage) | 
 | 2567 | 			page->mapping->a_ops->invalidatepage(page, offset); | 
 | 2568 | #endif | 
 | 2569 | 		unlock_page(page); | 
 | 2570 | 		return 0; /* don't care */ | 
 | 2571 | 	} | 
 | 2572 |  | 
 | 2573 | 	/* | 
 | 2574 | 	 * The page straddles i_size.  It must be zeroed out on each and every | 
 | 2575 | 	 * writepage invocation because it may be mmapped.  "A file is mapped | 
 | 2576 | 	 * in multiples of the page size.  For a file that is not a multiple of | 
 | 2577 | 	 * the  page size, the remaining memory is zeroed when mapped, and | 
 | 2578 | 	 * writes to that region are not written out to the file." | 
 | 2579 | 	 */ | 
 | 2580 | 	kaddr = kmap_atomic(page, KM_USER0); | 
 | 2581 | 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); | 
 | 2582 | 	flush_dcache_page(page); | 
 | 2583 | 	kunmap_atomic(kaddr, KM_USER0); | 
 | 2584 | out: | 
 | 2585 | 	ret = mpage_writepage(page, get_block, wbc); | 
 | 2586 | 	if (ret == -EAGAIN) | 
 | 2587 | 		ret = __block_write_full_page(inode, page, get_block, wbc); | 
 | 2588 | 	return ret; | 
 | 2589 | } | 
 | 2590 | EXPORT_SYMBOL(nobh_writepage); | 
 | 2591 |  | 
 | 2592 | /* | 
 | 2593 |  * This function assumes that ->prepare_write() uses nobh_prepare_write(). | 
 | 2594 |  */ | 
 | 2595 | int nobh_truncate_page(struct address_space *mapping, loff_t from) | 
 | 2596 | { | 
 | 2597 | 	struct inode *inode = mapping->host; | 
 | 2598 | 	unsigned blocksize = 1 << inode->i_blkbits; | 
 | 2599 | 	pgoff_t index = from >> PAGE_CACHE_SHIFT; | 
 | 2600 | 	unsigned offset = from & (PAGE_CACHE_SIZE-1); | 
 | 2601 | 	unsigned to; | 
 | 2602 | 	struct page *page; | 
 | 2603 | 	struct address_space_operations *a_ops = mapping->a_ops; | 
 | 2604 | 	char *kaddr; | 
 | 2605 | 	int ret = 0; | 
 | 2606 |  | 
 | 2607 | 	if ((offset & (blocksize - 1)) == 0) | 
 | 2608 | 		goto out; | 
 | 2609 |  | 
 | 2610 | 	ret = -ENOMEM; | 
 | 2611 | 	page = grab_cache_page(mapping, index); | 
 | 2612 | 	if (!page) | 
 | 2613 | 		goto out; | 
 | 2614 |  | 
 | 2615 | 	to = (offset + blocksize) & ~(blocksize - 1); | 
 | 2616 | 	ret = a_ops->prepare_write(NULL, page, offset, to); | 
 | 2617 | 	if (ret == 0) { | 
 | 2618 | 		kaddr = kmap_atomic(page, KM_USER0); | 
 | 2619 | 		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); | 
 | 2620 | 		flush_dcache_page(page); | 
 | 2621 | 		kunmap_atomic(kaddr, KM_USER0); | 
 | 2622 | 		set_page_dirty(page); | 
 | 2623 | 	} | 
 | 2624 | 	unlock_page(page); | 
 | 2625 | 	page_cache_release(page); | 
 | 2626 | out: | 
 | 2627 | 	return ret; | 
 | 2628 | } | 
 | 2629 | EXPORT_SYMBOL(nobh_truncate_page); | 
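As an illustration of how the nobh_* helpers above are meant to be consumed (a minimal sketch against the 2.6-era API, not part of buffer.c): a filesystem that avoids attaching buffer_heads to page-cache pages wires these helpers into its address_space_operations and supplies only its own block-mapping callback. The myfs_* names and myfs_get_block below are hypothetical.

	#include <linux/fs.h>
	#include <linux/buffer_head.h>
	#include <linux/writeback.h>

	/* Hypothetical block-mapping callback provided elsewhere by the filesystem. */
	extern int myfs_get_block(struct inode *inode, sector_t iblock,
				  struct buffer_head *bh_result, int create);

	static int myfs_nobh_prepare_write(struct file *file, struct page *page,
					   unsigned from, unsigned to)
	{
		return nobh_prepare_write(page, from, to, myfs_get_block);
	}

	static int myfs_nobh_writepage(struct page *page,
				       struct writeback_control *wbc)
	{
		return nobh_writepage(page, myfs_get_block, wbc);
	}

	static struct address_space_operations myfs_nobh_aops = {
		.prepare_write	= myfs_nobh_prepare_write,
		.commit_write	= nobh_commit_write,	/* generic helper used directly */
		.writepage	= myfs_nobh_writepage,
		.sync_page	= block_sync_page,
	};

	/* In the truncate path, zero the tail of the new last (partial) block. */
	static void myfs_truncate_zero_tail(struct inode *inode)
	{
		nobh_truncate_page(inode->i_mapping, inode->i_size);
	}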
 | 2630 |  | 
 | 2631 | int block_truncate_page(struct address_space *mapping, | 
 | 2632 | 			loff_t from, get_block_t *get_block) | 
 | 2633 | { | 
 | 2634 | 	pgoff_t index = from >> PAGE_CACHE_SHIFT; | 
 | 2635 | 	unsigned offset = from & (PAGE_CACHE_SIZE-1); | 
 | 2636 | 	unsigned blocksize; | 
| Andrew Morton | 54b21a7 | 2006-01-08 01:03:05 -0800 | [diff] [blame] | 2637 | 	sector_t iblock; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2638 | 	unsigned length, pos; | 
 | 2639 | 	struct inode *inode = mapping->host; | 
 | 2640 | 	struct page *page; | 
 | 2641 | 	struct buffer_head *bh; | 
 | 2642 | 	void *kaddr; | 
 | 2643 | 	int err; | 
 | 2644 |  | 
 | 2645 | 	blocksize = 1 << inode->i_blkbits; | 
 | 2646 | 	length = offset & (blocksize - 1); | 
 | 2647 |  | 
 | 2648 | 	/* Block boundary? Nothing to do */ | 
 | 2649 | 	if (!length) | 
 | 2650 | 		return 0; | 
 | 2651 |  | 
 | 2652 | 	length = blocksize - length; | 
| Andrew Morton | 54b21a7 | 2006-01-08 01:03:05 -0800 | [diff] [blame] | 2653 | 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2654 | 	 | 
 | 2655 | 	page = grab_cache_page(mapping, index); | 
 | 2656 | 	err = -ENOMEM; | 
 | 2657 | 	if (!page) | 
 | 2658 | 		goto out; | 
 | 2659 |  | 
 | 2660 | 	if (!page_has_buffers(page)) | 
 | 2661 | 		create_empty_buffers(page, blocksize, 0); | 
 | 2662 |  | 
 | 2663 | 	/* Find the buffer that contains "offset" */ | 
 | 2664 | 	bh = page_buffers(page); | 
 | 2665 | 	pos = blocksize; | 
 | 2666 | 	while (offset >= pos) { | 
 | 2667 | 		bh = bh->b_this_page; | 
 | 2668 | 		iblock++; | 
 | 2669 | 		pos += blocksize; | 
 | 2670 | 	} | 
 | 2671 |  | 
 | 2672 | 	err = 0; | 
 | 2673 | 	if (!buffer_mapped(bh)) { | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 2674 | 		WARN_ON(bh->b_size != blocksize); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2675 | 		err = get_block(inode, iblock, bh, 0); | 
 | 2676 | 		if (err) | 
 | 2677 | 			goto unlock; | 
 | 2678 | 		/* unmapped? It's a hole - nothing to do */ | 
 | 2679 | 		if (!buffer_mapped(bh)) | 
 | 2680 | 			goto unlock; | 
 | 2681 | 	} | 
 | 2682 |  | 
 | 2683 | 	/* Ok, it's mapped. Make sure it's up-to-date */ | 
 | 2684 | 	if (PageUptodate(page)) | 
 | 2685 | 		set_buffer_uptodate(bh); | 
 | 2686 |  | 
 | 2687 | 	if (!buffer_uptodate(bh) && !buffer_delay(bh)) { | 
 | 2688 | 		err = -EIO; | 
 | 2689 | 		ll_rw_block(READ, 1, &bh); | 
 | 2690 | 		wait_on_buffer(bh); | 
 | 2691 | 		/* Uhhuh. Read error. Complain and punt. */ | 
 | 2692 | 		if (!buffer_uptodate(bh)) | 
 | 2693 | 			goto unlock; | 
 | 2694 | 	} | 
 | 2695 |  | 
 | 2696 | 	kaddr = kmap_atomic(page, KM_USER0); | 
 | 2697 | 	memset(kaddr + offset, 0, length); | 
 | 2698 | 	flush_dcache_page(page); | 
 | 2699 | 	kunmap_atomic(kaddr, KM_USER0); | 
 | 2700 |  | 
 | 2701 | 	mark_buffer_dirty(bh); | 
 | 2702 | 	err = 0; | 
 | 2703 |  | 
 | 2704 | unlock: | 
 | 2705 | 	unlock_page(page); | 
 | 2706 | 	page_cache_release(page); | 
 | 2707 | out: | 
 | 2708 | 	return err; | 
 | 2709 | } | 
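For comparison with the nobh variant above, a conventional buffer-backed filesystem typically calls block_truncate_page() from its truncate path to zero the sub-block tail of the new last block. A sketch (not part of this file), reusing the hypothetical myfs_get_block callback from the earlier sketch:

	static void myfs_truncate(struct inode *inode)
	{
		/*
		 * Zero the partial block beyond the new i_size so that a later
		 * extension of the file does not expose stale on-disk data.
		 */
		block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);

		/* ... the filesystem then frees the data blocks past i_size ... */
	}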
 | 2710 |  | 
 | 2711 | /* | 
 | 2712 |  * The generic ->writepage function for buffer-backed address_spaces | 
 | 2713 |  */ | 
 | 2714 | int block_write_full_page(struct page *page, get_block_t *get_block, | 
 | 2715 | 			struct writeback_control *wbc) | 
 | 2716 | { | 
 | 2717 | 	struct inode * const inode = page->mapping->host; | 
 | 2718 | 	loff_t i_size = i_size_read(inode); | 
 | 2719 | 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 
 | 2720 | 	unsigned offset; | 
 | 2721 | 	void *kaddr; | 
 | 2722 |  | 
 | 2723 | 	/* Is the page fully inside i_size? */ | 
 | 2724 | 	if (page->index < end_index) | 
 | 2725 | 		return __block_write_full_page(inode, page, get_block, wbc); | 
 | 2726 |  | 
 | 2727 | 	/* Is the page fully outside i_size? (truncate in progress) */ | 
 | 2728 | 	offset = i_size & (PAGE_CACHE_SIZE-1); | 
 | 2729 | 	if (page->index >= end_index+1 || !offset) { | 
 | 2730 | 		/* | 
 | 2731 | 		 * The page may have dirty, unmapped buffers.  For example, | 
 | 2732 | 		 * they may have been added in ext3_writepage().  Make them | 
 | 2733 | 		 * freeable here, so the page does not leak. | 
 | 2734 | 		 */ | 
| Jan Kara | aaa4059 | 2005-10-30 15:00:16 -0800 | [diff] [blame] | 2735 | 		do_invalidatepage(page, 0); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2736 | 		unlock_page(page); | 
 | 2737 | 		return 0; /* don't care */ | 
 | 2738 | 	} | 
 | 2739 |  | 
 | 2740 | 	/* | 
 | 2741 | 	 * The page straddles i_size.  It must be zeroed out on each and every | 
 | 2742 | 	 * writepage invocation because it may be mmapped.  "A file is mapped | 
 | 2743 | 	 * in multiples of the page size.  For a file that is not a multiple of | 
 | 2744 | 	 * the  page size, the remaining memory is zeroed when mapped, and | 
 | 2745 | 	 * writes to that region are not written out to the file." | 
 | 2746 | 	 */ | 
 | 2747 | 	kaddr = kmap_atomic(page, KM_USER0); | 
 | 2748 | 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); | 
 | 2749 | 	flush_dcache_page(page); | 
 | 2750 | 	kunmap_atomic(kaddr, KM_USER0); | 
 | 2751 | 	return __block_write_full_page(inode, page, get_block, wbc); | 
 | 2752 | } | 
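A filesystem's ->writepage is usually just a thin wrapper around block_write_full_page(), passing its own block-mapping callback. A one-line sketch, again using the hypothetical myfs_get_block and the headers from the earlier sketch:

	static int myfs_writepage(struct page *page, struct writeback_control *wbc)
	{
		/* Map any still-unmapped dirty buffers and queue them for writeback. */
		return block_write_full_page(page, myfs_get_block, wbc);
	}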
 | 2753 |  | 
 | 2754 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, | 
 | 2755 | 			    get_block_t *get_block) | 
 | 2756 | { | 
 | 2757 | 	struct buffer_head tmp; | 
 | 2758 | 	struct inode *inode = mapping->host; | 
 | 2759 | 	tmp.b_state = 0; | 
 | 2760 | 	tmp.b_blocknr = 0; | 
| Badari Pulavarty | b0cf232 | 2006-03-26 01:38:00 -0800 | [diff] [blame] | 2761 | 	tmp.b_size = 1 << inode->i_blkbits; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2762 | 	get_block(inode, block, &tmp, 0); | 
 | 2763 | 	return tmp.b_blocknr; | 
 | 2764 | } | 
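Similarly, ->bmap can defer straight to generic_block_bmap(), which, as the code above shows, performs a single non-creating get_block() lookup on a temporary buffer_head. Sketch (myfs_get_block as before):

	static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
	{
		/* Returns 0 for a hole: b_blocknr starts at 0 and is left untouched. */
		return generic_block_bmap(mapping, block, myfs_get_block);
	}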
 | 2765 |  | 
 | 2766 | static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err) | 
 | 2767 | { | 
 | 2768 | 	struct buffer_head *bh = bio->bi_private; | 
 | 2769 |  | 
 | 2770 | 	if (bio->bi_size) | 
 | 2771 | 		return 1; | 
 | 2772 |  | 
 | 2773 | 	if (err == -EOPNOTSUPP) { | 
 | 2774 | 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 
 | 2775 | 		set_bit(BH_Eopnotsupp, &bh->b_state); | 
 | 2776 | 	} | 
 | 2777 |  | 
 | 2778 | 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); | 
 | 2779 | 	bio_put(bio); | 
 | 2780 | 	return 0; | 
 | 2781 | } | 
 | 2782 |  | 
 | 2783 | int submit_bh(int rw, struct buffer_head * bh) | 
 | 2784 | { | 
 | 2785 | 	struct bio *bio; | 
 | 2786 | 	int ret = 0; | 
 | 2787 |  | 
 | 2788 | 	BUG_ON(!buffer_locked(bh)); | 
 | 2789 | 	BUG_ON(!buffer_mapped(bh)); | 
 | 2790 | 	BUG_ON(!bh->b_end_io); | 
 | 2791 |  | 
 | 2792 | 	if (buffer_ordered(bh) && (rw == WRITE)) | 
 | 2793 | 		rw = WRITE_BARRIER; | 
 | 2794 |  | 
 | 2795 | 	/* | 
 | 2796 | 	 * Only clear out a write error when rewriting; should this | 
 | 2797 | 	 * include WRITE_SYNC as well? | 
 | 2798 | 	 */ | 
 | 2799 | 	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) | 
 | 2800 | 		clear_buffer_write_io_error(bh); | 
 | 2801 |  | 
 | 2802 | 	/* | 
 | 2803 | 	 * from here on down, it's all bio -- do the initial mapping, | 
 | 2804 | 	 * submit_bio -> generic_make_request may further map this bio around | 
 | 2805 | 	 */ | 
 | 2806 | 	bio = bio_alloc(GFP_NOIO, 1); | 
 | 2807 |  | 
 | 2808 | 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 
 | 2809 | 	bio->bi_bdev = bh->b_bdev; | 
 | 2810 | 	bio->bi_io_vec[0].bv_page = bh->b_page; | 
 | 2811 | 	bio->bi_io_vec[0].bv_len = bh->b_size; | 
 | 2812 | 	bio->bi_io_vec[0].bv_offset = bh_offset(bh); | 
 | 2813 |  | 
 | 2814 | 	bio->bi_vcnt = 1; | 
 | 2815 | 	bio->bi_idx = 0; | 
 | 2816 | 	bio->bi_size = bh->b_size; | 
 | 2817 |  | 
 | 2818 | 	bio->bi_end_io = end_bio_bh_io_sync; | 
 | 2819 | 	bio->bi_private = bh; | 
 | 2820 |  | 
 | 2821 | 	bio_get(bio); | 
 | 2822 | 	submit_bio(rw, bio); | 
 | 2823 |  | 
 | 2824 | 	if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 
 | 2825 | 		ret = -EOPNOTSUPP; | 
 | 2826 |  | 
 | 2827 | 	bio_put(bio); | 
 | 2828 | 	return ret; | 
 | 2829 | } | 
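The calling convention submit_bh() asserts (buffer locked, mapped, with b_end_io set) is clearest in a small synchronous read, the same pattern block_truncate_page() uses above via ll_rw_block(). A sketch, assuming the caller already holds a reference to a mapped buffer_head and the usual <linux/buffer_head.h> header:

	/* Synchronously read one mapped buffer; returns 0 on success, -EIO on error. */
	static int myfs_read_bh_sync(struct buffer_head *bh)
	{
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			unlock_buffer(bh);
			return 0;
		}
		get_bh(bh);			/* reference dropped by end_buffer_read_sync() */
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);		/* completion handler unlocks the buffer */
		wait_on_buffer(bh);
		return buffer_uptodate(bh) ? 0 : -EIO;
	}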
 | 2830 |  | 
 | 2831 | /** | 
 | 2832 |  * ll_rw_block: low-level access to block devices (DEPRECATED) | 
| Jan Kara | a766223 | 2005-09-06 15:19:10 -0700 | [diff] [blame] | 2833 |  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2834 |  * @nr: number of &struct buffer_heads in the array | 
 | 2835 |  * @bhs: array of pointers to &struct buffer_head | 
 | 2836 |  * | 
| Jan Kara | a766223 | 2005-09-06 15:19:10 -0700 | [diff] [blame] | 2837 |  * ll_rw_block() takes an array of pointers to &struct buffer_head, and | 
 | 2838 |  * requests an I/O operation on them, either a %READ or a %WRITE.  The third | 
 | 2839 |  * %SWRITE is like %WRITE, except that it ensures the *current* data in the | 
 | 2840 |  * buffers is sent to disk.  The fourth %READA option is described in the documentation | 
 | 2841 |  * for generic_make_request() which ll_rw_block() calls. | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2842 |  * | 
 | 2843 |  * This function drops any buffer that it cannot get a lock on (with the | 
| Jan Kara | a766223 | 2005-09-06 15:19:10 -0700 | [diff] [blame] | 2844 |  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be | 
 | 2845 |  * clean when doing a write request, and any buffer that appears to be | 
 | 2846 |  * up-to-date when doing a read request.  Further, it marks as clean any buffers that | 
 | 2847 |  * are processed for writing (the buffer cache won't assume that they are | 
 | 2848 |  * actually clean until the buffer gets unlocked). | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2849 |  * | 
 | 2850 |  * ll_rw_block sets b_end_io to a simple completion handler that marks | 
 | 2851 |  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes | 
 | 2852 |  * any waiters.  | 
 | 2853 |  * | 
 | 2854 |  * All of the buffers must be for the same device, and their size must be a | 
 | 2855 |  * multiple of the current approved size for the device. | 
 | 2856 |  */ | 
 | 2857 | void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) | 
 | 2858 | { | 
 | 2859 | 	int i; | 
 | 2860 |  | 
 | 2861 | 	for (i = 0; i < nr; i++) { | 
 | 2862 | 		struct buffer_head *bh = bhs[i]; | 
 | 2863 |  | 
| Jan Kara | a766223 | 2005-09-06 15:19:10 -0700 | [diff] [blame] | 2864 | 		if (rw == SWRITE) | 
 | 2865 | 			lock_buffer(bh); | 
 | 2866 | 		else if (test_set_buffer_locked(bh)) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2867 | 			continue; | 
 | 2868 |  | 
| Jan Kara | a766223 | 2005-09-06 15:19:10 -0700 | [diff] [blame] | 2869 | 		if (rw == WRITE || rw == SWRITE) { | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2870 | 			if (test_clear_buffer_dirty(bh)) { | 
| akpm@osdl.org | 76c3073 | 2005-04-16 15:24:07 -0700 | [diff] [blame] | 2871 | 				bh->b_end_io = end_buffer_write_sync; | 
| OGAWA Hirofumi | e60e5c5 | 2006-02-03 03:04:43 -0800 | [diff] [blame] | 2872 | 				get_bh(bh); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2873 | 				submit_bh(WRITE, bh); | 
 | 2874 | 				continue; | 
 | 2875 | 			} | 
 | 2876 | 		} else { | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2877 | 			if (!buffer_uptodate(bh)) { | 
| akpm@osdl.org | 76c3073 | 2005-04-16 15:24:07 -0700 | [diff] [blame] | 2878 | 				bh->b_end_io = end_buffer_read_sync; | 
| OGAWA Hirofumi | e60e5c5 | 2006-02-03 03:04:43 -0800 | [diff] [blame] | 2879 | 				get_bh(bh); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2880 | 				submit_bh(rw, bh); | 
 | 2881 | 				continue; | 
 | 2882 | 			} | 
 | 2883 | 		} | 
 | 2884 | 		unlock_buffer(bh); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2885 | 	} | 
 | 2886 | } | 
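A common ll_rw_block() idiom is opportunistic readahead: start %READA requests on a batch of buffers, drop the local references immediately (ll_rw_block() took its own before submitting, as shown above), and wait only on the block that is needed right now. A sketch with hypothetical block numbers, assuming <linux/buffer_head.h>:

	static struct buffer_head *myfs_read_with_readahead(struct super_block *sb,
							    sector_t block, int nr_ahead)
	{
		struct buffer_head *bh = sb_getblk(sb, block);
		int i;

		if (!bh)
			return NULL;

		/* Kick off readahead on the following blocks; failures are harmless. */
		for (i = 1; i <= nr_ahead; i++) {
			struct buffer_head *ra = sb_getblk(sb, block + i);
			if (ra) {
				ll_rw_block(READA, 1, &ra);
				brelse(ra);	/* ll_rw_block holds its own reference */
			}
		}

		/* Now read (or wait for) the block we actually need. */
		if (!buffer_uptodate(bh)) {
			ll_rw_block(READ, 1, &bh);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh)) {
				brelse(bh);
				return NULL;
			}
		}
		return bh;
	}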
 | 2887 |  | 
 | 2888 | /* | 
 | 2889 |  * For a data-integrity writeout, we need to wait upon any in-progress I/O | 
 | 2890 |  * and then start new I/O and then wait upon it.  The caller must have a ref on | 
 | 2891 |  * the buffer_head. | 
 | 2892 |  */ | 
 | 2893 | int sync_dirty_buffer(struct buffer_head *bh) | 
 | 2894 | { | 
 | 2895 | 	int ret = 0; | 
 | 2896 |  | 
 | 2897 | 	WARN_ON(atomic_read(&bh->b_count) < 1); | 
 | 2898 | 	lock_buffer(bh); | 
 | 2899 | 	if (test_clear_buffer_dirty(bh)) { | 
 | 2900 | 		get_bh(bh); | 
 | 2901 | 		bh->b_end_io = end_buffer_write_sync; | 
 | 2902 | 		ret = submit_bh(WRITE, bh); | 
 | 2903 | 		wait_on_buffer(bh); | 
 | 2904 | 		if (buffer_eopnotsupp(bh)) { | 
 | 2905 | 			clear_buffer_eopnotsupp(bh); | 
 | 2906 | 			ret = -EOPNOTSUPP; | 
 | 2907 | 		} | 
 | 2908 | 		if (!ret && !buffer_uptodate(bh)) | 
 | 2909 | 			ret = -EIO; | 
 | 2910 | 	} else { | 
 | 2911 | 		unlock_buffer(bh); | 
 | 2912 | 	} | 
 | 2913 | 	return ret; | 
 | 2914 | } | 
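sync_dirty_buffer() is the standard way to push a single dirty metadata block out with the data-integrity semantics described above, for example when a filesystem rewrites its on-disk superblock. A sketch with a hypothetical on-disk layout, assuming the usual <linux/buffer_head.h> and <linux/string.h> headers; sb_bread(), mark_buffer_dirty() and brelse() are the real helpers:

	/* Copy new contents into a metadata block and write it out synchronously. */
	static int myfs_write_meta_block(struct super_block *sb, sector_t blocknr,
					 const void *data, size_t len)
	{
		struct buffer_head *bh;
		int err;

		bh = sb_bread(sb, blocknr);	/* read it, or find it uptodate in cache */
		if (!bh)
			return -EIO;

		memcpy(bh->b_data, data, len);	/* caller ensures len <= block size */
		mark_buffer_dirty(bh);
		err = sync_dirty_buffer(bh);	/* submits and waits; -EIO on failure */
		brelse(bh);
		return err;
	}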
 | 2915 |  | 
 | 2916 | /* | 
 | 2917 |  * try_to_free_buffers() checks if all the buffers on this particular page | 
 | 2918 |  * are unused, and releases them if so. | 
 | 2919 |  * | 
 | 2920 |  * Exclusion against try_to_free_buffers may be obtained by either | 
 | 2921 |  * locking the page or by holding its mapping's private_lock. | 
 | 2922 |  * | 
 | 2923 |  * If the page is dirty but all the buffers are clean then we need to | 
 | 2924 |  * be sure to mark the page clean as well.  This is because the page | 
 | 2925 |  * may be against a block device, and a later reattachment of buffers | 
 | 2926 |  * to a dirty page will set *all* buffers dirty.  Which would corrupt | 
 | 2927 |  * filesystem data on the same device. | 
 | 2928 |  * | 
 | 2929 |  * The same applies to regular filesystem pages: if all the buffers are | 
 | 2930 |  * clean then we set the page clean and proceed.  To do that, we require | 
 | 2931 |  * total exclusion from __set_page_dirty_buffers().  That is obtained with | 
 | 2932 |  * private_lock. | 
 | 2933 |  * | 
 | 2934 |  * try_to_free_buffers() is non-blocking. | 
 | 2935 |  */ | 
 | 2936 | static inline int buffer_busy(struct buffer_head *bh) | 
 | 2937 | { | 
 | 2938 | 	return atomic_read(&bh->b_count) | | 
 | 2939 | 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); | 
 | 2940 | } | 
 | 2941 |  | 
 | 2942 | static int | 
 | 2943 | drop_buffers(struct page *page, struct buffer_head **buffers_to_free) | 
 | 2944 | { | 
 | 2945 | 	struct buffer_head *head = page_buffers(page); | 
 | 2946 | 	struct buffer_head *bh; | 
 | 2947 |  | 
 | 2948 | 	bh = head; | 
 | 2949 | 	do { | 
| akpm@osdl.org | de7d5a3 | 2005-05-01 08:58:39 -0700 | [diff] [blame] | 2950 | 		if (buffer_write_io_error(bh) && page->mapping) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2951 | 			set_bit(AS_EIO, &page->mapping->flags); | 
 | 2952 | 		if (buffer_busy(bh)) | 
 | 2953 | 			goto failed; | 
 | 2954 | 		bh = bh->b_this_page; | 
 | 2955 | 	} while (bh != head); | 
 | 2956 |  | 
 | 2957 | 	do { | 
 | 2958 | 		struct buffer_head *next = bh->b_this_page; | 
 | 2959 |  | 
 | 2960 | 		if (!list_empty(&bh->b_assoc_buffers)) | 
 | 2961 | 			__remove_assoc_queue(bh); | 
 | 2962 | 		bh = next; | 
 | 2963 | 	} while (bh != head); | 
 | 2964 | 	*buffers_to_free = head; | 
 | 2965 | 	__clear_page_buffers(page); | 
 | 2966 | 	return 1; | 
 | 2967 | failed: | 
 | 2968 | 	return 0; | 
 | 2969 | } | 
 | 2970 |  | 
 | 2971 | int try_to_free_buffers(struct page *page) | 
 | 2972 | { | 
 | 2973 | 	struct address_space * const mapping = page->mapping; | 
 | 2974 | 	struct buffer_head *buffers_to_free = NULL; | 
 | 2975 | 	int ret = 0; | 
 | 2976 |  | 
 | 2977 | 	BUG_ON(!PageLocked(page)); | 
 | 2978 | 	if (PageWriteback(page)) | 
 | 2979 | 		return 0; | 
 | 2980 |  | 
 | 2981 | 	if (mapping == NULL) {		/* can this still happen? */ | 
 | 2982 | 		ret = drop_buffers(page, &buffers_to_free); | 
 | 2983 | 		goto out; | 
 | 2984 | 	} | 
 | 2985 |  | 
 | 2986 | 	spin_lock(&mapping->private_lock); | 
 | 2987 | 	ret = drop_buffers(page, &buffers_to_free); | 
 | 2988 | 	if (ret) { | 
 | 2989 | 		/* | 
 | 2990 | 		 * If the filesystem writes its buffers by hand (eg ext3) | 
 | 2991 | 		 * then we can have clean buffers against a dirty page.  We | 
 | 2992 | 		 * clean the page here; otherwise later reattachment of buffers | 
 | 2993 | 		 * could encounter a non-uptodate page, which is unresolvable. | 
 | 2994 | 		 * This only applies in the rare case where try_to_free_buffers | 
 | 2995 | 		 * succeeds but the page is not freed. | 
 | 2996 | 		 */ | 
 | 2997 | 		clear_page_dirty(page); | 
 | 2998 | 	} | 
 | 2999 | 	spin_unlock(&mapping->private_lock); | 
 | 3000 | out: | 
 | 3001 | 	if (buffers_to_free) { | 
 | 3002 | 		struct buffer_head *bh = buffers_to_free; | 
 | 3003 |  | 
 | 3004 | 		do { | 
 | 3005 | 			struct buffer_head *next = bh->b_this_page; | 
 | 3006 | 			free_buffer_head(bh); | 
 | 3007 | 			bh = next; | 
 | 3008 | 		} while (bh != buffers_to_free); | 
 | 3009 | 	} | 
 | 3010 | 	return ret; | 
 | 3011 | } | 
 | 3012 | EXPORT_SYMBOL(try_to_free_buffers); | 
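Filesystems with no private buffer lifetime rules can point ->releasepage directly at try_to_free_buffers(); those that do have such rules (ext3's journalled buffers, for instance) wrap it with their own checks first. A minimal sketch with a hypothetical filesystem name:

	static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
	{
		/* Nothing filesystem-specific pins these buffers; just try to drop them. */
		return try_to_free_buffers(page);
	}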
 | 3013 |  | 
| NeilBrown | 3978d71 | 2006-03-26 01:37:17 -0800 | [diff] [blame] | 3014 | void block_sync_page(struct page *page) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3015 | { | 
 | 3016 | 	struct address_space *mapping; | 
 | 3017 |  | 
 | 3018 | 	smp_mb(); | 
 | 3019 | 	mapping = page_mapping(page); | 
 | 3020 | 	if (mapping) | 
 | 3021 | 		blk_run_backing_dev(mapping->backing_dev_info, page); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3022 | } | 
 | 3023 |  | 
 | 3024 | /* | 
 | 3025 |  * There are no bdflush tunables left.  But distributions are | 
 | 3026 |  * still running obsolete flush daemons, so we terminate them here. | 
 | 3027 |  * | 
 | 3028 |  * Use of bdflush() is deprecated and will be removed in a future kernel. | 
 | 3029 |  * The `pdflush' kernel threads fully replace bdflush daemons and this call. | 
 | 3030 |  */ | 
 | 3031 | asmlinkage long sys_bdflush(int func, long data) | 
 | 3032 | { | 
 | 3033 | 	static int msg_count; | 
 | 3034 |  | 
 | 3035 | 	if (!capable(CAP_SYS_ADMIN)) | 
 | 3036 | 		return -EPERM; | 
 | 3037 |  | 
 | 3038 | 	if (msg_count < 5) { | 
 | 3039 | 		msg_count++; | 
 | 3040 | 		printk(KERN_INFO | 
 | 3041 | 			"warning: process `%s' used the obsolete bdflush" | 
 | 3042 | 			" system call\n", current->comm); | 
 | 3043 | 		printk(KERN_INFO "Fix your initscripts?\n"); | 
 | 3044 | 	} | 
 | 3045 |  | 
 | 3046 | 	if (func == 1) | 
 | 3047 | 		do_exit(0); | 
 | 3048 | 	return 0; | 
 | 3049 | } | 
 | 3050 |  | 
 | 3051 | /* | 
 | 3052 |  * Buffer-head allocation | 
 | 3053 |  */ | 
 | 3054 | static kmem_cache_t *bh_cachep; | 
 | 3055 |  | 
 | 3056 | /* | 
 | 3057 |  * Once the number of bh's in the machine exceeds this level, we start | 
 | 3058 |  * stripping them in writeback. | 
 | 3059 |  */ | 
 | 3060 | static int max_buffer_heads; | 
 | 3061 |  | 
 | 3062 | int buffer_heads_over_limit; | 
 | 3063 |  | 
 | 3064 | struct bh_accounting { | 
 | 3065 | 	int nr;			/* Number of live bh's */ | 
 | 3066 | 	int ratelimit;		/* Limit cacheline bouncing */ | 
 | 3067 | }; | 
 | 3068 |  | 
 | 3069 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; | 
 | 3070 |  | 
 | 3071 | static void recalc_bh_state(void) | 
 | 3072 | { | 
 | 3073 | 	int i; | 
 | 3074 | 	int tot = 0; | 
 | 3075 |  | 
 | 3076 | 	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) | 
 | 3077 | 		return; | 
 | 3078 | 	__get_cpu_var(bh_accounting).ratelimit = 0; | 
| Eric Dumazet | 8a14342 | 2006-03-24 03:18:10 -0800 | [diff] [blame] | 3079 | 	for_each_online_cpu(i) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3080 | 		tot += per_cpu(bh_accounting, i).nr; | 
 | 3081 | 	buffer_heads_over_limit = (tot > max_buffer_heads); | 
 | 3082 | } | 
 | 3083 | 	 | 
| Al Viro | dd0fc66 | 2005-10-07 07:46:04 +0100 | [diff] [blame] | 3084 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3085 | { | 
 | 3086 | 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); | 
 | 3087 | 	if (ret) { | 
| Coywolf Qi Hunt | 736c7b8 | 2005-09-06 15:18:17 -0700 | [diff] [blame] | 3088 | 		get_cpu_var(bh_accounting).nr++; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3089 | 		recalc_bh_state(); | 
| Coywolf Qi Hunt | 736c7b8 | 2005-09-06 15:18:17 -0700 | [diff] [blame] | 3090 | 		put_cpu_var(bh_accounting); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3091 | 	} | 
 | 3092 | 	return ret; | 
 | 3093 | } | 
 | 3094 | EXPORT_SYMBOL(alloc_buffer_head); | 
 | 3095 |  | 
 | 3096 | void free_buffer_head(struct buffer_head *bh) | 
 | 3097 | { | 
 | 3098 | 	BUG_ON(!list_empty(&bh->b_assoc_buffers)); | 
 | 3099 | 	kmem_cache_free(bh_cachep, bh); | 
| Coywolf Qi Hunt | 736c7b8 | 2005-09-06 15:18:17 -0700 | [diff] [blame] | 3100 | 	get_cpu_var(bh_accounting).nr--; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3101 | 	recalc_bh_state(); | 
| Coywolf Qi Hunt | 736c7b8 | 2005-09-06 15:18:17 -0700 | [diff] [blame] | 3102 | 	put_cpu_var(bh_accounting); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3103 | } | 
 | 3104 | EXPORT_SYMBOL(free_buffer_head); | 
 | 3105 |  | 
 | 3106 | static void | 
 | 3107 | init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags) | 
 | 3108 | { | 
 | 3109 | 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 
 | 3110 | 			    SLAB_CTOR_CONSTRUCTOR) { | 
 | 3111 | 		struct buffer_head * bh = (struct buffer_head *)data; | 
 | 3112 |  | 
 | 3113 | 		memset(bh, 0, sizeof(*bh)); | 
 | 3114 | 		INIT_LIST_HEAD(&bh->b_assoc_buffers); | 
 | 3115 | 	} | 
 | 3116 | } | 
 | 3117 |  | 
 | 3118 | #ifdef CONFIG_HOTPLUG_CPU | 
 | 3119 | static void buffer_exit_cpu(int cpu) | 
 | 3120 | { | 
 | 3121 | 	int i; | 
 | 3122 | 	struct bh_lru *b = &per_cpu(bh_lrus, cpu); | 
 | 3123 |  | 
 | 3124 | 	for (i = 0; i < BH_LRU_SIZE; i++) { | 
 | 3125 | 		brelse(b->bhs[i]); | 
 | 3126 | 		b->bhs[i] = NULL; | 
 | 3127 | 	} | 
| Eric Dumazet | 8a14342 | 2006-03-24 03:18:10 -0800 | [diff] [blame] | 3128 | 	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; | 
 | 3129 | 	per_cpu(bh_accounting, cpu).nr = 0; | 
 | 3130 | 	put_cpu_var(bh_accounting); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3131 | } | 
 | 3132 |  | 
 | 3133 | static int buffer_cpu_notify(struct notifier_block *self, | 
 | 3134 | 			      unsigned long action, void *hcpu) | 
 | 3135 | { | 
 | 3136 | 	if (action == CPU_DEAD) | 
 | 3137 | 		buffer_exit_cpu((unsigned long)hcpu); | 
 | 3138 | 	return NOTIFY_OK; | 
 | 3139 | } | 
 | 3140 | #endif /* CONFIG_HOTPLUG_CPU */ | 
 | 3141 |  | 
 | 3142 | void __init buffer_init(void) | 
 | 3143 | { | 
 | 3144 | 	int nrpages; | 
 | 3145 |  | 
 | 3146 | 	bh_cachep = kmem_cache_create("buffer_head", | 
| Paul Jackson | b019600 | 2006-03-24 03:16:09 -0800 | [diff] [blame] | 3147 | 					sizeof(struct buffer_head), 0, | 
 | 3148 | 					(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| | 
 | 3149 | 					SLAB_MEM_SPREAD), | 
 | 3150 | 					init_buffer_head, | 
 | 3151 | 					NULL); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3152 |  | 
 | 3153 | 	/* | 
 | 3154 | 	 * Limit the bh occupancy to 10% of ZONE_NORMAL | 
 | 3155 | 	 */ | 
 | 3156 | 	nrpages = (nr_free_buffer_pages() * 10) / 100; | 
 | 3157 | 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); | 
 | 3158 | 	hotcpu_notifier(buffer_cpu_notify, 0); | 
 | 3159 | } | 
 | 3160 |  | 
 | 3161 | EXPORT_SYMBOL(__bforget); | 
 | 3162 | EXPORT_SYMBOL(__brelse); | 
 | 3163 | EXPORT_SYMBOL(__wait_on_buffer); | 
 | 3164 | EXPORT_SYMBOL(block_commit_write); | 
 | 3165 | EXPORT_SYMBOL(block_prepare_write); | 
 | 3166 | EXPORT_SYMBOL(block_read_full_page); | 
 | 3167 | EXPORT_SYMBOL(block_sync_page); | 
 | 3168 | EXPORT_SYMBOL(block_truncate_page); | 
 | 3169 | EXPORT_SYMBOL(block_write_full_page); | 
 | 3170 | EXPORT_SYMBOL(cont_prepare_write); | 
 | 3171 | EXPORT_SYMBOL(end_buffer_async_write); | 
 | 3172 | EXPORT_SYMBOL(end_buffer_read_sync); | 
 | 3173 | EXPORT_SYMBOL(end_buffer_write_sync); | 
 | 3174 | EXPORT_SYMBOL(file_fsync); | 
 | 3175 | EXPORT_SYMBOL(fsync_bdev); | 
 | 3176 | EXPORT_SYMBOL(generic_block_bmap); | 
 | 3177 | EXPORT_SYMBOL(generic_commit_write); | 
 | 3178 | EXPORT_SYMBOL(generic_cont_expand); | 
| OGAWA Hirofumi | 05eb0b5 | 2006-01-08 01:02:13 -0800 | [diff] [blame] | 3179 | EXPORT_SYMBOL(generic_cont_expand_simple); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 3180 | EXPORT_SYMBOL(init_buffer); | 
 | 3181 | EXPORT_SYMBOL(invalidate_bdev); | 
 | 3182 | EXPORT_SYMBOL(ll_rw_block); | 
 | 3183 | EXPORT_SYMBOL(mark_buffer_dirty); | 
 | 3184 | EXPORT_SYMBOL(submit_bh); | 
 | 3185 | EXPORT_SYMBOL(sync_dirty_buffer); | 
 | 3186 | EXPORT_SYMBOL(unlock_buffer); |