/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA  94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

/*
 *	The xfs_buf.c code provides an abstract buffer cache model on top
 *	of the Linux page cache.  Cached metadata blocks for a file system
 *	are hashed to the inode for the block device.  xfs_buf.c assembles
 *	buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
 *
 *	Written by Steve Lord, Jim Mostek, Russell Cattelan
 *	and Rajagopal Ananthanarayanan ("ananth") at SGI.
 *
 */

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>

#include "xfs_linux.h"

/*
 * File wide globals
 */

STATIC kmem_cache_t *pagebuf_cache;
STATIC kmem_shaker_t pagebuf_shake;
STATIC int pagebuf_daemon_wakeup(int, unsigned int);
STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
STATIC struct workqueue_struct *pagebuf_logio_workqueue;
STATIC struct workqueue_struct *pagebuf_dataio_workqueue;

/*
 * Pagebuf debugging
 */

#ifdef PAGEBUF_TRACE
void
pagebuf_trace(
	xfs_buf_t	*pb,
	char		*id,
	void		*data,
	void		*ra)
{
	ktrace_enter(pagebuf_trace_buf,
		pb, id,
		(void *)(unsigned long)pb->pb_flags,
		(void *)(unsigned long)pb->pb_hold.counter,
		(void *)(unsigned long)pb->pb_sema.count.counter,
		(void *)current,
		data, ra,
		(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
		(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
		(void *)(unsigned long)pb->pb_buffer_length,
		NULL, NULL, NULL, NULL, NULL);
}
ktrace_t *pagebuf_trace_buf;
#define PAGEBUF_TRACE_SIZE	4096
#define PB_TRACE(pb, id, data)	\
	pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
#else
#define PB_TRACE(pb, id, data)	do { } while (0)
#endif

#ifdef PAGEBUF_LOCK_TRACKING
# define PB_SET_OWNER(pb)	((pb)->pb_last_holder = current->pid)
# define PB_CLEAR_OWNER(pb)	((pb)->pb_last_holder = -1)
# define PB_GET_OWNER(pb)	((pb)->pb_last_holder)
#else
# define PB_SET_OWNER(pb)	do { } while (0)
# define PB_CLEAR_OWNER(pb)	do { } while (0)
# define PB_GET_OWNER(pb)	do { } while (0)
#endif

/*
 * Pagebuf allocation / freeing.
 */

#define pb_to_gfp(flags) \
	((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define pb_to_km(flags) \
	 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)


#define pagebuf_allocate(flags) \
	kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
#define pagebuf_deallocate(pb) \
	kmem_zone_free(pagebuf_cache, (pb));

/*
 * Page Region interfaces.
 *
 * For pages in filesystems where the blocksize is smaller than the
 * pagesize, we use the page->private field (long) to hold a bitmap
 * of uptodate regions within the page.
 *
 * Each such region is "bytes per page / bits per long" bytes long.
 *
 * NBPPR == number-of-bytes-per-page-region
 * BTOPR == bytes-to-page-region (rounded up)
 * BTOPRT == bytes-to-page-region-truncated (rounded down)
 */
#if (BITS_PER_LONG == 32)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
#elif (BITS_PER_LONG == 64)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
#else
#error BITS_PER_LONG must be 32 or 64
#endif
#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))
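
/*
 * Worked example (editorial illustration, not part of the original
 * file): with 4096-byte pages and 64-bit longs, PRSHIFT = 12 - 6 = 6
 * and NBPPR = 4096 / 64 = 64, so each bit of page->private covers a
 * 64-byte region.  For offset 512 and length 1024, BTOPR(512) = 8 and
 * BTOPRT(512 + 1024 - 1) = 1535 >> 6 = 23, so page_region_mask() below
 * returns a mask with bits 8..23 set, covering bytes 512..1535.
 */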

STATIC unsigned long
page_region_mask(
	size_t		offset,
	size_t		length)
{
	unsigned long	mask;
	int		first, final;

	first = BTOPR(offset);
	final = BTOPRT(offset + length - 1);
	first = min(first, final);

	mask = ~0UL;
	/* keep bits first..final inclusive set */
	mask <<= BITS_PER_LONG - (final - first + 1);
	mask >>= BITS_PER_LONG - (final + 1);

	ASSERT(offset + length <= PAGE_CACHE_SIZE);
	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);

	return mask;
}

STATIC inline void
set_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	page->private |= page_region_mask(offset, length);
	if (page->private == ~0UL)
		SetPageUptodate(page);
}

STATIC inline int
test_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	unsigned long	mask = page_region_mask(offset, length);

	return (mask && (page->private & mask) == mask);
}
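
/*
 * Illustrative sketch (an assumption about typical use, not part of the
 * original file): a caller that has brought one 512-byte block of a 4K
 * page uptodate records that sub-page range and can later re-check it:
 *
 *	set_page_region(page, 512, 512);
 *	if (test_page_region(page, 512, 512))
 *		... the block can be used without re-reading the page ...
 *
 * Once every region bit is set, set_page_region() marks the whole page
 * uptodate via SetPageUptodate().
 */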

/*
 * Mapping of multi-page buffers into contiguous virtual space
 */

typedef struct a_list {
	void		*vm_addr;
	struct a_list	*next;
} a_list_t;

STATIC a_list_t		*as_free_head;
STATIC int		as_list_len;
STATIC DEFINE_SPINLOCK(as_lock);

/*
 * Try to batch vunmaps because they are costly.
 */
STATIC void
free_address(
	void		*addr)
{
	a_list_t	*aentry;

	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
	if (likely(aentry)) {
		spin_lock(&as_lock);
		aentry->next = as_free_head;
		aentry->vm_addr = addr;
		as_free_head = aentry;
		as_list_len++;
		spin_unlock(&as_lock);
	} else {
		vunmap(addr);
	}
}

STATIC void
purge_addresses(void)
{
	a_list_t	*aentry, *old;

	if (as_free_head == NULL)
		return;

	spin_lock(&as_lock);
	aentry = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock(&as_lock);

	while ((old = aentry) != NULL) {
		vunmap(aentry->vm_addr);
		aentry = aentry->next;
		kfree(old);
	}
}
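
/*
 * Usage note (editorial, not part of the original file): callers defer
 * each vunmap() through free_address(), and the list is drained in one
 * batch by purge_addresses(); _pagebuf_map_pages() below triggers the
 * purge once as_list_len exceeds 64, amortizing the cost of tearing
 * down kernel mappings across many buffer frees.
 */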

/*
 *	Internal pagebuf object manipulation
 */

STATIC void
_pagebuf_initialize(
	xfs_buf_t		*pb,
	xfs_buftarg_t		*target,
	loff_t			range_base,
	size_t			range_length,
	page_buf_flags_t	flags)
{
	/*
	 * We don't want certain flags to appear in pb->pb_flags.
	 */
	flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);

	memset(pb, 0, sizeof(xfs_buf_t));
	atomic_set(&pb->pb_hold, 1);
	init_MUTEX_LOCKED(&pb->pb_iodonesema);
	INIT_LIST_HEAD(&pb->pb_list);
	INIT_LIST_HEAD(&pb->pb_hash_list);
	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
	PB_SET_OWNER(pb);
	pb->pb_target = target;
	pb->pb_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	pb->pb_buffer_length = pb->pb_count_desired = range_length;
	pb->pb_flags = flags | PBF_NONE;
	pb->pb_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&pb->pb_pin_count, 0);
	init_waitqueue_head(&pb->pb_waiters);

	XFS_STATS_INC(pb_create);
	PB_TRACE(pb, "initialize", target);
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_pagebuf_get_pages(
	xfs_buf_t		*pb,
	int			page_count,
	page_buf_flags_t	flags)
{
	/* Make sure that we have a page list */
	if (pb->pb_pages == NULL) {
		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
		pb->pb_page_count = page_count;
		if (page_count <= PB_PAGES) {
			pb->pb_pages = pb->pb_page_array;
		} else {
			pb->pb_pages = kmem_alloc(sizeof(struct page *) *
					page_count, pb_to_km(flags));
			if (pb->pb_pages == NULL)
				return -ENOMEM;
		}
		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 *	Frees pb_pages if it was malloced.
 */
STATIC void
_pagebuf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->pb_pages != bp->pb_page_array) {
		kmem_free(bp->pb_pages,
			  bp->pb_page_count * sizeof(struct page *));
	}
}
/*
 *	Releases the specified buffer.
 *
 *	The modification state of any associated pages is left unchanged.
 *	The buffer must not be on any hash - use pagebuf_rele instead for
 *	hashed and refcounted buffers.
 */
void
pagebuf_free(
	xfs_buf_t		*bp)
{
	PB_TRACE(bp, "free", 0);

	ASSERT(list_empty(&bp->pb_hash_list));

	if (bp->pb_flags & _PBF_PAGE_CACHE) {
		uint		i;

		if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
			free_address(bp->pb_addr - bp->pb_offset);

		for (i = 0; i < bp->pb_page_count; i++)
			page_cache_release(bp->pb_pages[i]);
		_pagebuf_free_pages(bp);
	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
		/*
		 * XXX(hch): bp->pb_count_desired might be incorrect (see
		 * pagebuf_associate_memory for details), but fortunately
		 * the Linux version of kmem_free ignores the len argument..
		 */
		kmem_free(bp->pb_addr, bp->pb_count_desired);
		_pagebuf_free_pages(bp);
	}

	pagebuf_deallocate(bp);
}

/*
 *	Finds all pages for the buffer in question and builds its page list.
 */
STATIC int
_pagebuf_lookup_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	struct address_space	*mapping = bp->pb_target->pbr_mapping;
	size_t			blocksize = bp->pb_target->pbr_bsize;
	size_t			size = bp->pb_count_desired;
	size_t			nbytes, offset;
	int			gfp_mask = pb_to_gfp(flags);
	unsigned short		page_count, i;
	pgoff_t			first;
	loff_t			end;
	int			error;

	end = bp->pb_file_offset + bp->pb_buffer_length;
	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);

	error = _pagebuf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;
	bp->pb_flags |= _PBF_PAGE_CACHE;

	offset = bp->pb_offset;
	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;

	for (i = 0; i < bp->pb_page_count; i++) {
		struct page	*page;
		uint		retries = 0;

retry:
		page = find_or_create_page(mapping, first + i, gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & PBF_READ_AHEAD) {
				bp->pb_page_count = i;
				for (i = 0; i < bp->pb_page_count; i++)
					unlock_page(bp->pb_pages[i]);
				return -ENOMEM;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				printk(KERN_ERR
					"XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, gfp_mask);

			XFS_STATS_INC(pb_page_retries);
			pagebuf_daemon_wakeup(0, gfp_mask);
			blk_congestion_wait(WRITE, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(pb_page_found);

		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
		size -= nbytes;

		if (!PageUptodate(page)) {
			page_count--;
			if (blocksize >= PAGE_CACHE_SIZE) {
				if (flags & PBF_READ)
					bp->pb_locked = 1;
			} else if (!PagePrivate(page)) {
				if (test_page_region(page, offset, nbytes))
					page_count++;
			}
		}

		bp->pb_pages[i] = page;
		offset = 0;
	}

	if (!bp->pb_locked) {
		for (i = 0; i < bp->pb_page_count; i++)
			unlock_page(bp->pb_pages[i]);
	}

	if (page_count) {
		/* if we have any uptodate pages, mark that in the buffer */
		bp->pb_flags &= ~PBF_NONE;

		/* if some pages aren't uptodate, mark that in the buffer */
		if (page_count != bp->pb_page_count)
			bp->pb_flags |= PBF_PARTIAL;
	}

	PB_TRACE(bp, "lookup_pages", (long)page_count);
	return error;
}

/*
 *	Map buffer into kernel address-space if necessary.
 */
STATIC int
_pagebuf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	/* A single page buffer is always mappable */
	if (bp->pb_page_count == 1) {
		bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	} else if (flags & PBF_MAPPED) {
		if (as_list_len > 64)
			purge_addresses();
		bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				VM_MAP, PAGE_KERNEL);
		if (unlikely(bp->pb_addr == NULL))
			return -ENOMEM;
		bp->pb_addr += bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
 *	_pagebuf_find
 *
 *	Looks up, and creates if absent, a lockable buffer for
 *	a given range of an inode.  The buffer is returned
 *	locked.  If other overlapping buffers exist, they are
 *	released before the new buffer is created and locked,
 *	which may imply that this call will block until those buffers
 *	are unlocked.  No I/O is implied by this call.
 */
xfs_buf_t *
_pagebuf_find(
	xfs_buftarg_t		*btp,	/* block device target		*/
	loff_t			ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
	xfs_buf_t		*new_pb)/* newly allocated buffer	*/
{
	loff_t			range_base;
	size_t			range_length;
	xfs_bufhash_t		*hash;
	xfs_buf_t		*pb, *n;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
	ASSERT(!(range_base & (loff_t)btp->pbr_smask));

	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];

	spin_lock(&hash->bh_lock);

	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
		ASSERT(btp == pb->pb_target);
		if (pb->pb_file_offset == range_base &&
		    pb->pb_buffer_length == range_length) {
			/*
			 * If we look at something, bring it to the
			 * front of the list for next time.
			 */
			atomic_inc(&pb->pb_hold);
			list_move(&pb->pb_hash_list, &hash->bh_list);
			goto found;
		}
	}

	/* No match found */
	if (new_pb) {
		_pagebuf_initialize(new_pb, btp, range_base,
				range_length, flags);
		new_pb->pb_hash = hash;
		list_add(&new_pb->pb_hash_list, &hash->bh_list);
	} else {
		XFS_STATS_INC(pb_miss_locked);
	}

	spin_unlock(&hash->bh_lock);
	return new_pb;

found:
	spin_unlock(&hash->bh_lock);

	/* Attempt to get the semaphore without sleeping,
	 * if this does not work then we need to drop the
	 * spinlock and do a hard attempt on the semaphore.
	 */
	if (down_trylock(&pb->pb_sema)) {
		if (!(flags & PBF_TRYLOCK)) {
			/* wait for buffer ownership */
			PB_TRACE(pb, "get_lock", 0);
			pagebuf_lock(pb);
			XFS_STATS_INC(pb_get_locked_waited);
		} else {
			/* We asked for a trylock and failed, no need
			 * to look at file offset and length here, we
			 * know that this pagebuf at least overlaps our
			 * pagebuf and is locked, therefore our buffer
			 * either does not exist, or is this buffer.
			 */

			pagebuf_rele(pb);
			XFS_STATS_INC(pb_busy_locked);
			return (NULL);
		}
	} else {
		/* trylock worked */
		PB_SET_OWNER(pb);
	}

	if (pb->pb_flags & PBF_STALE)
		pb->pb_flags &= PBF_MAPPED;
	PB_TRACE(pb, "got_lock", 0);
	XFS_STATS_INC(pb_get_locked);
	return (pb);
}

/*
 *	xfs_buf_get_flags assembles a buffer covering the specified range.
 *
 *	Storage in memory for all portions of the buffer will be allocated,
 *	although backing storage may not be.
 */
xfs_buf_t *
xfs_buf_get_flags(			/* allocate a buffer		*/
	xfs_buftarg_t		*target,/* target for buffer		*/
	loff_t			ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
{
	xfs_buf_t		*pb, *new_pb;
	int			error = 0, i;

	new_pb = pagebuf_allocate(flags);
	if (unlikely(!new_pb))
		return NULL;

	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
	if (pb == new_pb) {
		error = _pagebuf_lookup_pages(pb, flags);
		if (error)
			goto no_buffer;
	} else {
		pagebuf_deallocate(new_pb);
		if (unlikely(pb == NULL))
			return NULL;
	}

	for (i = 0; i < pb->pb_page_count; i++)
		mark_page_accessed(pb->pb_pages[i]);

	if (!(pb->pb_flags & PBF_MAPPED)) {
		error = _pagebuf_map_pages(pb, flags);
		if (unlikely(error)) {
			printk(KERN_WARNING "%s: failed to map pages\n",
					__FUNCTION__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(pb_get);

	/*
	 * Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */
	pb->pb_bn = ioff;
	pb->pb_count_desired = pb->pb_buffer_length;

	PB_TRACE(pb, "get", (unsigned long)flags);
	return pb;

no_buffer:
	if (flags & (PBF_LOCK | PBF_TRYLOCK))
		pagebuf_unlock(pb);
	pagebuf_rele(pb);
	return NULL;
}

xfs_buf_t *
xfs_buf_read_flags(
	xfs_buftarg_t		*target,
	loff_t			ioff,
	size_t			isize,
	page_buf_flags_t	flags)
{
	xfs_buf_t		*pb;

	flags |= PBF_READ;

	pb = xfs_buf_get_flags(target, ioff, isize, flags);
	if (pb) {
		if (PBF_NOT_DONE(pb)) {
			PB_TRACE(pb, "read", (unsigned long)flags);
			XFS_STATS_INC(pb_get_read);
			pagebuf_iostart(pb, flags);
		} else if (flags & PBF_ASYNC) {
			PB_TRACE(pb, "read_async", (unsigned long)flags);
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			goto no_buffer;
		} else {
			PB_TRACE(pb, "read_done", (unsigned long)flags);
			/* We do not want read in the flags */
			pb->pb_flags &= ~PBF_READ;
		}
	}

	return pb;

no_buffer:
	if (flags & (PBF_LOCK | PBF_TRYLOCK))
		pagebuf_unlock(pb);
	pagebuf_rele(pb);
	return NULL;
}
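
/*
 * Illustrative caller sketch (an assumption about typical use, not part
 * of the original file): a synchronous metadata read followed by
 * release, with "blkno" and "len" standing in for a disk address and a
 * length in 512-byte basic blocks, as consumed by _pagebuf_find():
 *
 *	xfs_buf_t	*bp;
 *
 *	bp = xfs_buf_read_flags(target, blkno, len, PBF_LOCK);
 *	if (bp && !bp->pb_error) {
 *		... inspect bp->pb_addr ...
 *		xfs_buf_relse(bp);
 *	}
 *
 * where xfs_buf_relse() unlocks the buffer and drops the hold.
 */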

/*
 * Create a skeletal pagebuf (no pages associated with it).
 */
xfs_buf_t *
pagebuf_lookup(
	xfs_buftarg_t		*target,
	loff_t			ioff,
	size_t			isize,
	page_buf_flags_t	flags)
{
	xfs_buf_t		*pb;

	pb = pagebuf_allocate(flags);
	if (pb) {
		_pagebuf_initialize(pb, target, ioff, isize, flags);
	}
	return pb;
}

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
pagebuf_readahead(
	xfs_buftarg_t		*target,
	loff_t			ioff,
	size_t			isize,
	page_buf_flags_t	flags)
{
	struct backing_dev_info *bdi;

	bdi = target->pbr_mapping->backing_dev_info;
	if (bdi_read_congested(bdi))
		return;

	flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
	xfs_buf_read_flags(target, ioff, isize, flags);
}

xfs_buf_t *
pagebuf_get_empty(
	size_t			len,
	xfs_buftarg_t		*target)
{
	xfs_buf_t		*pb;

	pb = pagebuf_allocate(0);
	if (pb)
		_pagebuf_initialize(pb, target, 0, len, 0);
	return pb;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if (((unsigned long)addr < VMALLOC_START) ||
	    ((unsigned long)addr >= VMALLOC_END)) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
pagebuf_associate_memory(
	xfs_buf_t		*pb,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	size_t			ptr;
	size_t			end, end_cur;
	off_t			offset;
	int			page_count;

	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
	if (offset && (len > PAGE_CACHE_SIZE))
		page_count++;

	/* Free any previous set of page pointers */
	if (pb->pb_pages)
		_pagebuf_free_pages(pb);

	pb->pb_pages = NULL;
	pb->pb_addr = mem;

	rval = _pagebuf_get_pages(pb, page_count, 0);
	if (rval)
		return rval;

	pb->pb_offset = offset;
	ptr = (size_t) mem & PAGE_CACHE_MASK;
	end = PAGE_CACHE_ALIGN((size_t) mem + len);
	end_cur = end;
	/* set up first page */
	pb->pb_pages[0] = mem_to_page(mem);

	ptr += PAGE_CACHE_SIZE;
	pb->pb_page_count = ++i;
	while (ptr < end) {
		pb->pb_pages[i] = mem_to_page((void *)ptr);
		pb->pb_page_count = ++i;
		ptr += PAGE_CACHE_SIZE;
	}
	pb->pb_locked = 0;

	pb->pb_count_desired = pb->pb_buffer_length = len;
	pb->pb_flags |= PBF_MAPPED;

	return 0;
}

xfs_buf_t *
pagebuf_get_no_daddr(
	size_t			len,
	xfs_buftarg_t		*target)
{
	size_t			malloc_len = len;
	xfs_buf_t		*bp;
	void			*data;
	int			error;

	bp = pagebuf_allocate(0);
	if (unlikely(bp == NULL))
		goto fail;
	_pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);

try_again:
	data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
	if (unlikely(data == NULL))
		goto fail_free_buf;

	/* check whether alignment matches.. */
	if ((__psunsigned_t)data !=
	    ((__psunsigned_t)data & ~target->pbr_smask)) {
		/* .. else double the size and try again */
		kmem_free(data, malloc_len);
		malloc_len <<= 1;
		goto try_again;
	}

	error = pagebuf_associate_memory(bp, data, len);
	if (error)
		goto fail_free_mem;
	bp->pb_flags |= _PBF_KMEM_ALLOC;

	pagebuf_unlock(bp);

	PB_TRACE(bp, "no_daddr", data);
	return bp;
fail_free_mem:
	kmem_free(data, malloc_len);
fail_free_buf:
	pagebuf_free(bp);
fail:
	return NULL;
}

/*
 *	pagebuf_hold
 *
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *
 *	Must hold the buffer already to call this function.
 */
void
pagebuf_hold(
	xfs_buf_t		*pb)
{
	atomic_inc(&pb->pb_hold);
	PB_TRACE(pb, "hold", 0);
}

/*
 *	pagebuf_rele
 *
 *	pagebuf_rele releases a hold on the specified buffer.  If the
 *	hold count is 1, pagebuf_rele calls pagebuf_free.
 */
void
pagebuf_rele(
	xfs_buf_t		*pb)
{
	xfs_bufhash_t		*hash = pb->pb_hash;

	PB_TRACE(pb, "rele", pb->pb_relse);

	/*
	 * pagebuf_lookup buffers are not hashed, not delayed write,
	 * and don't have their own release routines.  Special case.
	 */
	if (unlikely(!hash)) {
		ASSERT(!pb->pb_relse);
		if (atomic_dec_and_test(&pb->pb_hold))
			xfs_buf_free(pb);
		return;
	}

	if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
		int		do_free = 1;

		if (pb->pb_relse) {
			atomic_inc(&pb->pb_hold);
			spin_unlock(&hash->bh_lock);
			(*(pb->pb_relse)) (pb);
			spin_lock(&hash->bh_lock);
			do_free = 0;
		}

		if (pb->pb_flags & PBF_DELWRI) {
			pb->pb_flags |= PBF_ASYNC;
			atomic_inc(&pb->pb_hold);
			pagebuf_delwri_queue(pb, 0);
			do_free = 0;
		} else if (pb->pb_flags & PBF_FS_MANAGED) {
			do_free = 0;
		}

		if (do_free) {
			list_del_init(&pb->pb_hash_list);
			spin_unlock(&hash->bh_lock);
			pagebuf_free(pb);
		} else {
			spin_unlock(&hash->bh_lock);
		}
	}
}


/*
 *	Mutual exclusion on buffers.  Locking model:
 *
 *	Buffers associated with inodes for which buffer locking
 *	is not enabled are not protected by semaphores, and are
 *	assumed to be exclusively owned by the caller.  There is a
 *	spinlock in the buffer, used by the caller when concurrent
 *	access is possible.
 */

/*
 *	pagebuf_cond_lock
 *
 *	pagebuf_cond_lock locks a buffer object, if it is not already locked.
 *	Note that this in no way
 *	locks the underlying pages, so it is only useful for synchronizing
 *	concurrent use of page buffer objects, not for synchronizing independent
 *	access to the underlying pages.
 */
int
pagebuf_cond_lock(			/* lock buffer, if not locked	*/
					/* returns -EBUSY if locked)	*/
	xfs_buf_t		*pb)
{
	int			locked;

	locked = down_trylock(&pb->pb_sema) == 0;
	if (locked) {
		PB_SET_OWNER(pb);
	}
	PB_TRACE(pb, "cond_lock", (long)locked);
	return(locked ? 0 : -EBUSY);
}

#if defined(DEBUG) || defined(XFS_BLI_TRACE)
/*
 *	pagebuf_lock_value
 *
 *	Return lock value for a pagebuf
 */
int
pagebuf_lock_value(
	xfs_buf_t		*pb)
{
	return(atomic_read(&pb->pb_sema.count));
}
#endif

/*
 *	pagebuf_lock
 *
 *	pagebuf_lock locks a buffer object.  Note that this in no way
 *	locks the underlying pages, so it is only useful for synchronizing
 *	concurrent use of page buffer objects, not for synchronizing independent
 *	access to the underlying pages.
 */
int
pagebuf_lock(
	xfs_buf_t		*pb)
{
	PB_TRACE(pb, "lock", 0);
	if (atomic_read(&pb->pb_io_remaining))
		blk_run_address_space(pb->pb_target->pbr_mapping);
	down(&pb->pb_sema);
	PB_SET_OWNER(pb);
	PB_TRACE(pb, "locked", 0);
	return 0;
}

/*
 *	pagebuf_unlock
 *
 *	pagebuf_unlock releases the lock on the buffer object created by
 *	pagebuf_lock or pagebuf_cond_lock (not any
 *	pinning of underlying pages created by pagebuf_pin).
 */
void
pagebuf_unlock(				/* unlock buffer		*/
	xfs_buf_t		*pb)	/* buffer to unlock		*/
{
	PB_CLEAR_OWNER(pb);
	up(&pb->pb_sema);
	PB_TRACE(pb, "unlock", 0);
}
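
/*
 * Illustrative locking sketch (an assumption about typical use, not
 * part of the original file): a caller first tries the non-blocking
 * lock and falls back to the blocking variant:
 *
 *	if (pagebuf_cond_lock(pb) == -EBUSY)
 *		pagebuf_lock(pb);
 *	... modify the buffer ...
 *	pagebuf_unlock(pb);
 *
 * pagebuf_lock() sleeps until the current holder calls pagebuf_unlock().
 */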


/*
 *	Pinning Buffer Storage in Memory
 */

/*
 *	pagebuf_pin
 *
 *	pagebuf_pin locks all of the memory represented by a buffer in
 *	memory.  Multiple calls to pagebuf_pin and pagebuf_unpin, for
 *	the same or different buffers affecting a given page, will
 *	properly count the number of outstanding "pin" requests.  The
 *	buffer may be released after the pagebuf_pin and a different
 *	buffer used when calling pagebuf_unpin, if desired.
 *	pagebuf_pin should be used by the file system when it wants to be
 *	assured that no attempt will be made to force the affected
 *	memory to disk.  It does not assure that a given logical page
 *	will not be moved to a different physical page.
 */
void
pagebuf_pin(
	xfs_buf_t		*pb)
{
	atomic_inc(&pb->pb_pin_count);
	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
}

/*
 *	pagebuf_unpin
 *
 *	pagebuf_unpin reverses the locking of memory performed by
 *	pagebuf_pin.  Note that both functions affect the logical
 *	pages associated with the buffer, not the buffer itself.
 */
void
pagebuf_unpin(
	xfs_buf_t		*pb)
{
	if (atomic_dec_and_test(&pb->pb_pin_count)) {
		wake_up_all(&pb->pb_waiters);
	}
	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
}

int
pagebuf_ispin(
	xfs_buf_t		*pb)
{
	return atomic_read(&pb->pb_pin_count);
}
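
/*
 * Illustrative pin sketch (an assumption about typical use, not part of
 * the original file): transaction code pins a buffer while its changes
 * exist only in the in-core log, and unpins it once they reach stable
 * storage:
 *
 *	pagebuf_pin(pb);
 *	... pagebuf_iorequest() will wait in _pagebuf_wait_unpin()
 *	    before writing the buffer back ...
 *	pagebuf_unpin(pb);
 *
 * The final pagebuf_unpin() wakes any waiters on pb->pb_waiters.
 */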

/*
 *	pagebuf_wait_unpin
 *
 *	pagebuf_wait_unpin waits until all of the memory associated
 *	with the buffer is no longer locked in memory.  It returns
 *	immediately if none of the affected pages are locked.
 */
static inline void
_pagebuf_wait_unpin(
	xfs_buf_t		*pb)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&pb->pb_pin_count) == 0)
		return;

	add_wait_queue(&pb->pb_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&pb->pb_pin_count) == 0)
			break;
		if (atomic_read(&pb->pb_io_remaining))
			blk_run_address_space(pb->pb_target->pbr_mapping);
		schedule();
	}
	remove_wait_queue(&pb->pb_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

/*
 *	pagebuf_iodone
 *
 *	pagebuf_iodone marks a buffer for which I/O is in progress
 *	done with respect to that I/O.  The pb_iodone routine, if
 *	present, will be called as a side-effect.
 */
STATIC void
pagebuf_iodone_work(
	void			*v)
{
	xfs_buf_t		*bp = (xfs_buf_t *)v;

	if (bp->pb_iodone)
		(*(bp->pb_iodone))(bp);
	else if (bp->pb_flags & PBF_ASYNC)
		xfs_buf_relse(bp);
}

void
pagebuf_iodone(
	xfs_buf_t		*pb,
	int			dataio,
	int			schedule)
{
	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
	if (pb->pb_error == 0) {
		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
	}

	PB_TRACE(pb, "iodone", pb->pb_iodone);

	if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
			queue_work(dataio ? pagebuf_dataio_workqueue :
				pagebuf_logio_workqueue, &pb->pb_iodone_work);
		} else {
			pagebuf_iodone_work(pb);
		}
	} else {
		up(&pb->pb_iodonesema);
	}
}

/*
 *	pagebuf_ioerror
 *
 *	pagebuf_ioerror sets the error code for a buffer.
 */
void
pagebuf_ioerror(			/* mark/clear buffer error flag */
	xfs_buf_t		*pb,	/* buffer to mark		*/
	int			error)	/* error to store (0 if none)	*/
{
	ASSERT(error >= 0 && error <= 0xffff);
	pb->pb_error = (unsigned short)error;
	PB_TRACE(pb, "ioerror", (unsigned long)error);
}

/*
 *	pagebuf_iostart
 *
 *	pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
 *	If necessary, it will arrange for any disk space allocation required,
 *	and it will break up the request if the block mappings require it.
 *	The pb_iodone routine in the buffer supplied will only be called
 *	when all of the subsidiary I/O requests, if any, have been completed.
 *	pagebuf_iostart calls the pagebuf_ioinitiate routine or
 *	pagebuf_iorequest, if the former routine is not defined, to start
 *	the I/O on a given low-level request.
 */
int
pagebuf_iostart(			/* start I/O on a buffer	  */
	xfs_buf_t		*pb,	/* buffer to start		  */
	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
					/* PBF_WRITE, PBF_DELWRI,	  */
					/* PBF_DONT_BLOCK		  */
{
	int			status = 0;

	PB_TRACE(pb, "iostart", (unsigned long)flags);

	if (flags & PBF_DELWRI) {
		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
		pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
		pagebuf_delwri_queue(pb, 1);
		return status;
	}

	pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
			PBF_READ_AHEAD | _PBF_RUN_QUEUES);
	pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
			PBF_READ_AHEAD | _PBF_RUN_QUEUES);

	BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);

	/* For writes allow an alternate strategy routine to precede
	 * the actual I/O request (which may not be issued at all in
	 * a shutdown situation, for example).
	 */
	status = (flags & PBF_WRITE) ?
		pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);

	/* Wait for I/O if we are not an async request.
	 * Note: async I/O request completion will release the buffer,
	 * and that can already be done by this point.  So using the
	 * buffer pointer from here on, after async I/O, is invalid.
	 */
	if (!status && !(flags & PBF_ASYNC))
		status = pagebuf_iowait(pb);

	return status;
}
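
/*
 * Illustrative caller sketch (an assumption about typical use, not part
 * of the original file): a synchronous write of a held, locked buffer,
 * where the return value carries any error reported by pagebuf_iowait():
 *
 *	int	error;
 *
 *	error = pagebuf_iostart(pb, PBF_WRITE);
 *	if (error)
 *		... handle the write failure ...
 *
 * With PBF_ASYNC set instead, the buffer may already have been released
 * by completion when this call returns, so it must not be touched again.
 */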

/*
 * Helper routine for pagebuf_iorequest
 */

STATIC __inline__ int
_pagebuf_iolocked(
	xfs_buf_t		*pb)
{
	ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
	if (pb->pb_flags & PBF_READ)
		return pb->pb_locked;
	return 0;
}

STATIC __inline__ void
_pagebuf_iodone(
	xfs_buf_t		*pb,
	int			schedule)
{
	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
		pb->pb_locked = 0;
		pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
	}
}

STATIC int
bio_end_io_pagebuf(
	struct bio		*bio,
	unsigned int		bytes_done,
	int			error)
{
	xfs_buf_t		*pb = (xfs_buf_t *)bio->bi_private;
	unsigned int		i, blocksize = pb->pb_target->pbr_bsize;
	struct bio_vec		*bvec = bio->bi_io_vec;

	if (bio->bi_size)
		return 1;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		pb->pb_error = EIO;

	for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
		struct page	*page = bvec->bv_page;

		if (pb->pb_error) {
			SetPageError(page);
		} else if (blocksize == PAGE_CACHE_SIZE) {
			SetPageUptodate(page);
		} else if (!PagePrivate(page) &&
				(pb->pb_flags & _PBF_PAGE_CACHE)) {
			set_page_region(page, bvec->bv_offset, bvec->bv_len);
		}

		if (_pagebuf_iolocked(pb)) {
			unlock_page(page);
		}
	}

	_pagebuf_iodone(pb, 1);
	bio_put(bio);
	return 0;
}

STATIC void
_pagebuf_ioapply(
	xfs_buf_t		*pb)
{
	int			i, rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = pb->pb_offset;
	int			size = pb->pb_count_desired;
	sector_t		sector = pb->pb_bn;
	unsigned int		blocksize = pb->pb_target->pbr_bsize;
	int			locking = _pagebuf_iolocked(pb);

	total_nr_pages = pb->pb_page_count;
	map_i = 0;

	if (pb->pb_flags & _PBF_RUN_QUEUES) {
		pb->pb_flags &= ~_PBF_RUN_QUEUES;
		rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
	} else {
		rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
	}

	/* Special code path for reading a sub page size pagebuf in --
	 * we populate up the whole page, and hence the other metadata
	 * in the same page.  This optimization is only valid when the
	 * filesystem block size and the page size are equal.
	 */
	if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
	    (pb->pb_flags & PBF_READ) && locking &&
	    (blocksize == PAGE_CACHE_SIZE)) {
		bio = bio_alloc(GFP_NOIO, 1);

		bio->bi_bdev = pb->pb_target->pbr_bdev;
		bio->bi_sector = sector - (offset >> BBSHIFT);
		bio->bi_end_io = bio_end_io_pagebuf;
		bio->bi_private = pb;

		bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
		size = 0;

		atomic_inc(&pb->pb_io_remaining);

		goto submit_io;
	}

	/* Lock down the pages which we need to for the request */
	if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
		for (i = 0; size; i++) {
			int		nbytes = PAGE_CACHE_SIZE - offset;
			struct page	*page = pb->pb_pages[i];

			if (nbytes > size)
				nbytes = size;

			lock_page(page);

			size -= nbytes;
			offset = 0;
		}
		offset = pb->pb_offset;
		size = pb->pb_count_desired;
	}

next_chunk:
	atomic_inc(&pb->pb_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = pb->pb_target->pbr_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = bio_end_io_pagebuf;
	bio->bi_private = pb;

	for (; size && nr_pages; nr_pages--, map_i++) {
		int	nbytes = PAGE_CACHE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		if (bio_add_page(bio, pb->pb_pages[map_i],
					nbytes, offset) < nbytes)
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

submit_io:
	if (likely(bio->bi_size)) {
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		bio_put(bio);
		pagebuf_ioerror(pb, EIO);
	}
}

/*
 *	pagebuf_iorequest -- the core I/O request routine.
 */
int
pagebuf_iorequest(			/* start real I/O		*/
	xfs_buf_t		*pb)	/* buffer to convey to device	*/
{
	PB_TRACE(pb, "iorequest", 0);

	if (pb->pb_flags & PBF_DELWRI) {
		pagebuf_delwri_queue(pb, 1);
		return 0;
	}

	if (pb->pb_flags & PBF_WRITE) {
		_pagebuf_wait_unpin(pb);
	}

	pagebuf_hold(pb);

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling pagebuf_iodone too early.
	 */
	atomic_set(&pb->pb_io_remaining, 1);
	_pagebuf_ioapply(pb);
	_pagebuf_iodone(pb, 0);

	pagebuf_rele(pb);
	return 0;
}

/*
 *	pagebuf_iowait
 *
 *	pagebuf_iowait waits for I/O to complete on the buffer supplied.
 *	It returns immediately if no I/O is pending.  In any case, it returns
 *	the error code, if any, or 0 if there is no error.
 */
int
pagebuf_iowait(
	xfs_buf_t		*pb)
{
	PB_TRACE(pb, "iowait", 0);
	if (atomic_read(&pb->pb_io_remaining))
		blk_run_address_space(pb->pb_target->pbr_mapping);
	down(&pb->pb_iodonesema);
	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
	return pb->pb_error;
}

caddr_t
pagebuf_offset(
	xfs_buf_t		*pb,
	size_t			offset)
{
	struct page		*page;

	offset += pb->pb_offset;

	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
}

/*
 *	pagebuf_iomove
 *
 *	Move data into or out of a buffer.
 */
void
pagebuf_iomove(
	xfs_buf_t		*pb,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	caddr_t			data,	/* data address			*/
	page_buf_rw_t		mode)	/* read/write flag		*/
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
		page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
		cpoff = page_buf_poff(boff + pb->pb_offset);
		csize = min_t(size_t,
			      PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);

		ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));

		switch (mode) {
		case PBRW_ZERO:
			memset(page_address(page) + cpoff, 0, csize);
			break;
		case PBRW_READ:
			memcpy(data, page_address(page) + cpoff, csize);
			break;
		case PBRW_WRITE:
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}
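
/*
 * Illustrative sketch (an assumption about typical use, not part of the
 * original file): zeroing the tail of a buffer page by page, where
 * "valid_len" is a hypothetical count of bytes already holding data:
 *
 *	pagebuf_iomove(pb, valid_len, pb->pb_count_desired - valid_len,
 *		       NULL, PBRW_ZERO);
 *
 * PBRW_READ copies out of the buffer into "data"; PBRW_WRITE copies in.
 * The loop above resolves each page through pb_pages[], so callers need
 * not care whether the buffer is virtually contiguous.
 */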
|  | 1488 |  | 
|  | 1489 | /* | 
|  | 1490 | *	Handling of buftargs. | 
|  | 1491 | */ | 
|  | 1492 |  | 
|  | 1493 | /* | 
|  | 1494 | * Wait for any bufs with callbacks that have been submitted but | 
|  | 1495 | * have not yet returned... walk the hash list for the target. | 
|  | 1496 | */ | 
|  | 1497 | void | 
|  | 1498 | xfs_wait_buftarg( | 
|  | 1499 | xfs_buftarg_t	*btp) | 
|  | 1500 | { | 
|  | 1501 | xfs_buf_t	*bp, *n; | 
|  | 1502 | xfs_bufhash_t	*hash; | 
|  | 1503 | uint		i; | 
|  | 1504 |  | 
|  | 1505 | for (i = 0; i < (1 << btp->bt_hashshift); i++) { | 
|  | 1506 | hash = &btp->bt_hash[i]; | 
|  | 1507 | again: | 
|  | 1508 | spin_lock(&hash->bh_lock); | 
|  | 1509 | list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) { | 
|  | 1510 | ASSERT(btp == bp->pb_target); | 
|  | 1511 | if (!(bp->pb_flags & PBF_FS_MANAGED)) { | 
|  | 1512 | spin_unlock(&hash->bh_lock); | 
|  | 1513 | delay(100); | 
|  | 1514 | goto again; | 
|  | 1515 | } | 
|  | 1516 | } | 
|  | 1517 | spin_unlock(&hash->bh_lock); | 
|  | 1518 | } | 
|  | 1519 | } | 
|  | 1520 |  | 
|  | 1521 | /* | 
|  | 1522 | * Allocate buffer hash table for a given target. | 
|  | 1523 | * For devices containing metadata (i.e. not the log/realtime devices) | 
|  | 1524 | * we need to allocate a much larger hash table. | 
|  | 1525 | */ | 
|  | 1526 | STATIC void | 
|  | 1527 | xfs_alloc_bufhash( | 
|  | 1528 | xfs_buftarg_t		*btp, | 
|  | 1529 | int			external) | 
|  | 1530 | { | 
|  | 1531 | unsigned int		i; | 
|  | 1532 |  | 
|  | 1533 | btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */ | 
|  | 1534 | btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; | 
|  | 1535 | btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * | 
|  | 1536 | sizeof(xfs_bufhash_t), KM_SLEEP); | 
|  | 1537 | for (i = 0; i < (1 << btp->bt_hashshift); i++) { | 
|  | 1538 | spin_lock_init(&btp->bt_hash[i].bh_lock); | 
|  | 1539 | INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); | 
|  | 1540 | } | 
|  | 1541 | } | 

STATIC void
xfs_free_bufhash(
	xfs_buftarg_t		*btp)
{
	kmem_free(btp->bt_hash,
			(1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
	btp->bt_hash = NULL;
}

void
xfs_free_buftarg(
	xfs_buftarg_t		*btp,
	int			external)
{
	xfs_flush_buftarg(btp, 1);
	if (external)
		xfs_blkdev_put(btp->pbr_bdev);
	xfs_free_bufhash(btp);
	iput(btp->pbr_mapping->host);
	kmem_free(btp, sizeof(*btp));
}

/*
 * Toss all in-core pages for this target; the delwri_only and wait
 * arguments are part of the interface but are not currently used.
 */
void
xfs_incore_relse(
	xfs_buftarg_t		*btp,
	int			delwri_only,
	int			wait)
{
	invalidate_bdev(btp->pbr_bdev, 1);
	truncate_inode_pages(btp->pbr_mapping, 0LL);
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->pbr_bsize = blocksize;
	btp->pbr_sshift = ffs(sectorsize) - 1;	/* log2 of the sector size */
	btp->pbr_smask = sectorsize - 1;

	if (set_blocksize(btp->pbr_bdev, sectorsize)) {
		printk(KERN_WARNING
			"XFS: Cannot set_blocksize to %u on device %s\n",
			sectorsize, XFS_BUFTARG_NAME(btp));
		return EINVAL;
	}

	if (verbose &&
	    (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
		printk(KERN_WARNING
			"XFS: %u byte sectors in use on device %s.  "
			"This is suboptimal; %u or greater is ideal.\n",
			sectorsize, XFS_BUFTARG_NAME(btp),
			(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
	}

	return 0;
}
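
/*
 * Worked example (illustrative, assuming 4k pages): with 512-byte
 * sectors, ffs(512) - 1 == 9, so pbr_sshift is 9 and pbr_smask is
 * 0x1ff.  The "suboptimal" warning threshold is
 * PAGE_CACHE_SIZE / BITS_PER_LONG, i.e. 4096 / 32 == 128 bytes on
 * 32-bit or 4096 / 64 == 64 bytes on 64-bit, so ordinary 512-byte
 * sectors never trigger it.
 */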

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so we don't know what size sectors
 * are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

/*
 * Create a private inode and address_space so that this target's
 * cached buffer pages are kept apart from the block device's own
 * page cache.
 */
STATIC int
xfs_mapping_buftarg(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	struct backing_dev_info	*bdi;
	struct inode		*inode;
	struct address_space	*mapping;
	static struct address_space_operations mapping_aops = {
		.sync_page = block_sync_page,
	};

	inode = new_inode(bdev->bd_inode->i_sb);
	if (!inode) {
		printk(KERN_WARNING
			"XFS: Cannot allocate mapping inode for device %s\n",
			XFS_BUFTARG_NAME(btp));
		return ENOMEM;
	}
	inode->i_mode = S_IFBLK;
	inode->i_bdev = bdev;
	inode->i_rdev = bdev->bd_dev;
	bdi = blk_get_backing_dev_info(bdev);
	if (!bdi)
		bdi = &default_backing_dev_info;
	mapping = &inode->i_data;
	mapping->a_ops = &mapping_aops;
	mapping->backing_dev_info = bdi;
	mapping_set_gfp_mask(mapping, GFP_NOFS);
	btp->pbr_mapping = mapping;
	return 0;
}
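
/*
 * Page attachment sketch (illustrative): the buffer page lookup
 * elsewhere in this file conceptually does
 *
 *	page = find_or_create_page(btp->pbr_mapping, index, gfp_mask);
 *
 * against this private mapping, so a buffer's pages never alias the
 * block device inode's own page cache pages.
 */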

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct block_device	*bdev,
	int			external)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->pbr_dev = bdev->bd_dev;
	btp->pbr_bdev = bdev;
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	if (xfs_mapping_buftarg(btp, bdev))
		goto error;
	xfs_alloc_bufhash(btp, external);
	return btp;

error:
	kmem_free(btp, sizeof(*btp));
	return NULL;
}
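
/*
 * Usage sketch (illustrative, error handling elided): mount-time code
 * pairs allocation and teardown roughly as
 *
 *	btp = xfs_alloc_buftarg(bdev, 1);
 *	if (!btp)
 *		return ENOMEM;
 *	...
 *	error = xfs_setsize_buftarg(btp, blocksize, sectorsize);
 *	...
 *	xfs_free_buftarg(btp, 1);
 *
 * where "blocksize" and "sectorsize" stand in for values read from
 * the superblock once it is available.
 */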

/*
 *	Pagebuf delayed write buffer handling
 */

STATIC LIST_HEAD(pbd_delwrite_queue);
STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);

STATIC void
pagebuf_delwri_queue(
	xfs_buf_t		*pb,
	int			unlock)
{
	PB_TRACE(pb, "delwri_q", (long)unlock);
	ASSERT(pb->pb_flags & PBF_DELWRI);

	spin_lock(&pbd_delwrite_lock);
	/* If already in the queue, dequeue and place at tail */
	if (!list_empty(&pb->pb_list)) {
		if (unlock) {
			/* queue already holds a reference; drop the extra */
			atomic_dec(&pb->pb_hold);
		}
		list_del(&pb->pb_list);
	}

	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
	pb->pb_queuetime = jiffies;	/* restart the aging clock */
	spin_unlock(&pbd_delwrite_lock);

	if (unlock)
		pagebuf_unlock(pb);
}
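
/*
 * Aging note (illustrative): requeueing moves the buffer to the tail
 * and resets pb_queuetime, so a buffer that keeps getting redirtied
 * slides back in the queue and is only written out once it sits
 * untouched for xfs_buf_age_centisecs, or when force_flush overrides
 * the age check in pagebuf_daemon() below.
 */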

void
pagebuf_delwri_dequeue(
	xfs_buf_t		*pb)
{
	int			dequeued = 0;

	spin_lock(&pbd_delwrite_lock);
	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
		list_del_init(&pb->pb_list);
		dequeued = 1;
	}
	pb->pb_flags &= ~PBF_DELWRI;
	spin_unlock(&pbd_delwrite_lock);

	if (dequeued)
		pagebuf_rele(pb);	/* drop the reference the queue held */

	PB_TRACE(pb, "delwri_dq", (long)dequeued);
}

STATIC void
pagebuf_runall_queues(
	struct workqueue_struct	*queue)
{
	flush_workqueue(queue);
}

/* Defines for pagebuf daemon */
STATIC DECLARE_COMPLETION(pagebuf_daemon_done);
STATIC struct task_struct *pagebuf_daemon_task;
STATIC int pagebuf_daemon_active;
STATIC int force_flush;

STATIC int
pagebuf_daemon_wakeup(
	int			priority,
	unsigned int		mask)
{
	/* memory shaker callback: force a flush and kick the daemon */
	force_flush = 1;
	barrier();
	wake_up_process(pagebuf_daemon_task);
	return 0;
}

STATIC int
pagebuf_daemon(
	void			*data)
{
	struct list_head	tmp;
	unsigned long		age;
	xfs_buftarg_t		*target;
	xfs_buf_t		*pb, *n;

	/* Set up the thread */
	daemonize("xfsbufd");
	current->flags |= PF_MEMALLOC;

	pagebuf_daemon_task = current;
	pagebuf_daemon_active = 1;
	barrier();

	INIT_LIST_HEAD(&tmp);
	do {
		try_to_freeze(PF_FREEZE);

		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);

		age = (xfs_buf_age_centisecs * HZ) / 100;
		spin_lock(&pbd_delwrite_lock);
		list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
			PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
			ASSERT(pb->pb_flags & PBF_DELWRI);

			if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
				if (!force_flush &&
				    time_before(jiffies,
						pb->pb_queuetime + age)) {
					pagebuf_unlock(pb);
					/*
					 * The queue is age-ordered, so
					 * everything after this buffer
					 * is younger still.
					 */
					break;
				}

				pb->pb_flags &= ~PBF_DELWRI;
				pb->pb_flags |= PBF_WRITE;
				list_move(&pb->pb_list, &tmp);
			}
		}
		spin_unlock(&pbd_delwrite_lock);

		while (!list_empty(&tmp)) {
			pb = list_entry(tmp.next, xfs_buf_t, pb_list);
			target = pb->pb_target;

			list_del_init(&pb->pb_list);
			pagebuf_iostrategy(pb);

			blk_run_address_space(target->pbr_mapping);
		}

		if (as_list_len > 0)
			purge_addresses();

		force_flush = 0;
	} while (pagebuf_daemon_active);

	complete_and_exit(&pagebuf_daemon_done, 0);
}
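
/*
 * Timing example (illustrative, assuming the usual defaults of
 * xfs_buf_timer_centisecs == 100 and xfs_buf_age_centisecs == 1500):
 * with HZ == 1000 the daemon sleeps (100 * 1000) / 100 == 1000
 * jiffies, i.e. one second per pass, and a delwri buffer must sit
 * queued for (1500 * 1000) / 100 == 15000 jiffies (15 seconds)
 * before it ages out, unless force_flush is set.
 */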

/*
 * Flush out all the delayed write buffers belonging to the given
 * target, optionally waiting for the I/O to complete; pinned buffers
 * are skipped and counted.  This is used at unmount time and in
 * filesystem error handling to preserve the consistency of metadata.
 */
int
xfs_flush_buftarg(
	xfs_buftarg_t		*target,
	int			wait)
{
	struct list_head	tmp;
	xfs_buf_t		*pb, *n;
	int			pincount = 0;

	pagebuf_runall_queues(pagebuf_dataio_workqueue);
	pagebuf_runall_queues(pagebuf_logio_workqueue);

	INIT_LIST_HEAD(&tmp);
	spin_lock(&pbd_delwrite_lock);
	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {

		if (pb->pb_target != target)
			continue;

		ASSERT(pb->pb_flags & PBF_DELWRI);
		PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
		if (pagebuf_ispin(pb)) {
			pincount++;
			continue;
		}

		pb->pb_flags &= ~PBF_DELWRI;
		pb->pb_flags |= PBF_WRITE;
		list_move(&pb->pb_list, &tmp);
	}
	spin_unlock(&pbd_delwrite_lock);

	/*
	 * Dropped the delayed write list lock, now walk the temporary list
	 */
	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
		if (wait)
			pb->pb_flags &= ~PBF_ASYNC;
		else
			list_del_init(&pb->pb_list);

		pagebuf_lock(pb);
		pagebuf_iostrategy(pb);
	}

	/*
	 * Remaining list items are the synchronous buffers; wait for
	 * their I/O to complete before returning
	 */
	while (!list_empty(&tmp)) {
		pb = list_entry(tmp.next, xfs_buf_t, pb_list);

		list_del_init(&pb->pb_list);
		xfs_iowait(pb);
		xfs_buf_relse(pb);
	}

	if (wait)
		blk_run_address_space(target->pbr_mapping);

	return pincount;
}
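
/*
 * Retry sketch (illustrative): a caller that needs every delwri
 * buffer for a target on disk could loop until no pinned buffers
 * remain, e.g.
 *
 *	while (xfs_flush_buftarg(btp, 1) != 0)
 *		delay(100);
 *
 * since the return value counts the pinned buffers that were skipped.
 */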

STATIC int
pagebuf_daemon_start(void)
{
	int		rval;

	pagebuf_logio_workqueue = create_workqueue("xfslogd");
	if (!pagebuf_logio_workqueue)
		return -ENOMEM;

	pagebuf_dataio_workqueue = create_workqueue("xfsdatad");
	if (!pagebuf_dataio_workqueue) {
		destroy_workqueue(pagebuf_logio_workqueue);
		return -ENOMEM;
	}

	rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES);
	if (rval < 0) {
		destroy_workqueue(pagebuf_logio_workqueue);
		destroy_workqueue(pagebuf_dataio_workqueue);
		return rval;
	}

	/* kernel_thread returns a pid on success; report plain success */
	return 0;
}

/*
 * pagebuf_daemon_stop
 *
 * Note: do not mark as __exit, it is called from pagebuf_terminate.
 */
STATIC void
pagebuf_daemon_stop(void)
{
	pagebuf_daemon_active = 0;
	barrier();
	wait_for_completion(&pagebuf_daemon_done);

	destroy_workqueue(pagebuf_logio_workqueue);
	destroy_workqueue(pagebuf_dataio_workqueue);
}

/*
 *	Initialization and Termination
 */

int __init
pagebuf_init(void)
{
	int		error;

	pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (pagebuf_cache == NULL) {
		printk(KERN_ERR "XFS: couldn't init xfs_buf_t cache\n");
		return -ENOMEM;	/* nothing else is set up yet */
	}

#ifdef PAGEBUF_TRACE
	pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
#endif

	error = pagebuf_daemon_start();
	if (error) {
		kmem_zone_destroy(pagebuf_cache);
		return error;
	}

	pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup);
	if (pagebuf_shake == NULL) {
		pagebuf_terminate();
		return -ENOMEM;
	}

	return 0;
}
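
/*
 * Pairing sketch (illustrative): the XFS module init path uses these
 * roughly as
 *
 *	if ((error = pagebuf_init()))
 *		return error;
 *	...
 *	pagebuf_terminate();
 *
 * with pagebuf_terminate() also run when a later init step fails,
 * which is why it must not be marked __exit (see below).
 */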

/*
 *	pagebuf_terminate
 *
 *	Note: do not mark as __exit, this is also called from the __init code.
 */
void
pagebuf_terminate(void)
{
	pagebuf_daemon_stop();

#ifdef PAGEBUF_TRACE
	ktrace_free(pagebuf_trace_buf);
#endif

	kmem_zone_destroy(pagebuf_cache);
	kmem_shake_deregister(pagebuf_shake);
}