| /* | 
 |  * dm-snapshot.c | 
 |  * | 
 |  * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | 
 |  * | 
 |  * This file is released under the GPL. | 
 |  */ | 
 |  | 
 | #include "dm.h" | 
 | #include "dm-snap.h" | 
 | #include "dm-io.h" | 
 | #include "kcopyd.h" | 
 |  | 
 | #include <linux/mm.h> | 
 | #include <linux/pagemap.h> | 
 | #include <linux/vmalloc.h> | 
 | #include <linux/slab.h> | 
 |  | 
 | /*----------------------------------------------------------------- | 
 |  * Persistent snapshots, by persistent we mean that the snapshot | 
 |  * will survive a reboot. | 
 |  *---------------------------------------------------------------*/ | 
 |  | 
 | /* | 
 |  * We need to store a record of which parts of the origin have | 
 |  * been copied to the snapshot device.  The snapshot code | 
 |  * requires that we copy exception chunks to chunk aligned areas | 
 |  * of the COW store.  It makes sense therefore, to store the | 
 |  * metadata in chunk size blocks. | 
 |  * | 
 |  * There is no backward or forward compatibility implemented, | 
 |  * snapshots with different disk versions than the kernel will | 
 |  * not be usable.  It is expected that "lvcreate" will blank out | 
 |  * the start of a fresh COW device before calling the snapshot | 
 |  * constructor. | 
 |  * | 
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception (data) chunks as that metadata
 * chunk has room to describe.  This pattern of one metadata
 * chunk plus its data chunks then repeats.
 |  * | 
 |  * All on disk structures are in little-endian format.  The end | 
 |  * of the exceptions info is indicated by an exception with a | 
 |  * new_chunk of 0, which is invalid since it would point to the | 
 |  * header chunk. | 
 |  */ | 
 |  | 
 | /* | 
 |  * Magic for persistent snapshots: "SnAp" - Feeble isn't it. | 
 |  */ | 
 | #define SNAP_MAGIC 0x70416e53 | 
 |  | 
 | /* | 
 |  * The on-disk version of the metadata. | 
 |  */ | 
 | #define SNAPSHOT_DISK_VERSION 1 | 
 |  | 
 | struct disk_header { | 
 | 	uint32_t magic; | 
 |  | 
 | 	/* | 
 | 	 * Is this snapshot valid.  There is no way of recovering | 
 | 	 * an invalid snapshot. | 
 | 	 */ | 
 | 	uint32_t valid; | 
 |  | 
 | 	/* | 
 | 	 * Simple, incrementing version. no backward | 
 | 	 * compatibility. | 
 | 	 */ | 
 | 	uint32_t version; | 
 |  | 
 | 	/* In sectors */ | 
 | 	uint32_t chunk_size; | 
 | }; | 
 |  | 
 | struct disk_exception { | 
 | 	uint64_t old_chunk; | 
 | 	uint64_t new_chunk; | 
 | }; | 
 |  | 
 | struct commit_callback { | 
 | 	void (*callback)(void *, int success); | 
 | 	void *context; | 
 | }; | 
 |  | 
 | /* | 
 |  * The top level structure for a persistent exception store. | 
 |  */ | 
 | struct pstore { | 
 | 	struct dm_snapshot *snap;	/* up pointer to my snapshot */ | 
 | 	int version; | 
 | 	int valid; | 
 | 	uint32_t chunk_size; | 
 | 	uint32_t exceptions_per_area; | 
 |  | 
 | 	/* | 
 | 	 * Now that we have an asynchronous kcopyd there is no | 
 | 	 * need for large chunk sizes, so it wont hurt to have a | 
 | 	 * whole chunks worth of metadata in memory at once. | 
 | 	 */ | 
 | 	void *area; | 
 |  | 
 | 	/* | 
 | 	 * Used to keep track of which metadata area the data in | 
 | 	 * 'chunk' refers to. | 
 | 	 */ | 
 | 	uint32_t current_area; | 
 |  | 
 | 	/* | 
 | 	 * The next free chunk for an exception. | 
 | 	 */ | 
 | 	uint32_t next_free; | 
 |  | 
 | 	/* | 
 | 	 * The index of next free exception in the current | 
 | 	 * metadata area. | 
 | 	 */ | 
 | 	uint32_t current_committed; | 
 |  | 
 | 	atomic_t pending_count; | 
 | 	uint32_t callback_count; | 
 | 	struct commit_callback *callbacks; | 
 | }; | 
 |  | 
 | static inline unsigned int sectors_to_pages(unsigned int sectors) | 
 | { | 
 | 	return sectors / (PAGE_SIZE >> 9); | 
 | } | 
 |  | 
 | static int alloc_area(struct pstore *ps) | 
 | { | 
 | 	int r = -ENOMEM; | 
 | 	size_t len; | 
 |  | 
 | 	len = ps->chunk_size << SECTOR_SHIFT; | 
 |  | 
 | 	/* | 
 | 	 * Allocate the chunk_size block of memory that will hold | 
 | 	 * a single metadata area. | 
 | 	 */ | 
 | 	ps->area = vmalloc(len); | 
 | 	if (!ps->area) | 
 | 		return r; | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static void free_area(struct pstore *ps) | 
 | { | 
 | 	vfree(ps->area); | 
 | } | 
 |  | 
 | /* | 
 |  * Read or write a chunk aligned and sized block of data from a device. | 
 |  */ | 
 | static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) | 
 | { | 
 | 	struct io_region where; | 
 | 	unsigned long bits; | 
 |  | 
 | 	where.bdev = ps->snap->cow->bdev; | 
 | 	where.sector = ps->chunk_size * chunk; | 
 | 	where.count = ps->chunk_size; | 
 |  | 
 | 	return dm_io_sync_vm(1, &where, rw, ps->area, &bits); | 
 | } | 
 |  | 
 | /* | 
 |  * Read or write a metadata area.  Remembering to skip the first | 
 |  * chunk which holds the header. | 
 |  */ | 
 | static int area_io(struct pstore *ps, uint32_t area, int rw) | 
 | { | 
 | 	int r; | 
 | 	uint32_t chunk; | 
 |  | 
 | 	/* convert a metadata area index to a chunk index */ | 
 | 	chunk = 1 + ((ps->exceptions_per_area + 1) * area); | 
 |  | 
 | 	r = chunk_io(ps, chunk, rw); | 
 | 	if (r) | 
 | 		return r; | 
 |  | 
 | 	ps->current_area = area; | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int zero_area(struct pstore *ps, uint32_t area) | 
 | { | 
 | 	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | 
 | 	return area_io(ps, area, WRITE); | 
 | } | 
 |  | 
 | static int read_header(struct pstore *ps, int *new_snapshot) | 
 | { | 
 | 	int r; | 
 | 	struct disk_header *dh; | 
 |  | 
 | 	r = chunk_io(ps, 0, READ); | 
 | 	if (r) | 
 | 		return r; | 
 |  | 
 | 	dh = (struct disk_header *) ps->area; | 
 |  | 
 | 	if (le32_to_cpu(dh->magic) == 0) { | 
 | 		*new_snapshot = 1; | 
 |  | 
 | 	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { | 
 | 		*new_snapshot = 0; | 
 | 		ps->valid = le32_to_cpu(dh->valid); | 
 | 		ps->version = le32_to_cpu(dh->version); | 
 | 		ps->chunk_size = le32_to_cpu(dh->chunk_size); | 
 |  | 
 | 	} else { | 
 | 		DMWARN("Invalid/corrupt snapshot"); | 
 | 		r = -ENXIO; | 
 | 	} | 
 |  | 
 | 	return r; | 
 | } | 
 |  | 
 | static int write_header(struct pstore *ps) | 
 | { | 
 | 	struct disk_header *dh; | 
 |  | 
 | 	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | 
 |  | 
 | 	dh = (struct disk_header *) ps->area; | 
 | 	dh->magic = cpu_to_le32(SNAP_MAGIC); | 
 | 	dh->valid = cpu_to_le32(ps->valid); | 
 | 	dh->version = cpu_to_le32(ps->version); | 
 | 	dh->chunk_size = cpu_to_le32(ps->chunk_size); | 
 |  | 
 | 	return chunk_io(ps, 0, WRITE); | 
 | } | 
 |  | 
 | /* | 
 |  * Access functions for the disk exceptions, these do the endian conversions. | 
 |  */ | 
 | static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) | 
 | { | 
 | 	if (index >= ps->exceptions_per_area) | 
 | 		return NULL; | 
 |  | 
 | 	return ((struct disk_exception *) ps->area) + index; | 
 | } | 
 |  | 
 | static int read_exception(struct pstore *ps, | 
 | 			  uint32_t index, struct disk_exception *result) | 
 | { | 
 | 	struct disk_exception *e; | 
 |  | 
 | 	e = get_exception(ps, index); | 
 | 	if (!e) | 
 | 		return -EINVAL; | 
 |  | 
 | 	/* copy it */ | 
 | 	result->old_chunk = le64_to_cpu(e->old_chunk); | 
 | 	result->new_chunk = le64_to_cpu(e->new_chunk); | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int write_exception(struct pstore *ps, | 
 | 			   uint32_t index, struct disk_exception *de) | 
 | { | 
 | 	struct disk_exception *e; | 
 |  | 
 | 	e = get_exception(ps, index); | 
 | 	if (!e) | 
 | 		return -EINVAL; | 
 |  | 
 | 	/* copy it */ | 
 | 	e->old_chunk = cpu_to_le64(de->old_chunk); | 
 | 	e->new_chunk = cpu_to_le64(de->new_chunk); | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | /* | 
 |  * Registers the exceptions that are present in the current area. | 
 |  * 'full' is filled in to indicate if the area has been | 
 |  * filled. | 
 |  */ | 
 | static int insert_exceptions(struct pstore *ps, int *full) | 
 | { | 
 | 	int r; | 
 | 	unsigned int i; | 
 | 	struct disk_exception de; | 
 |  | 
 | 	/* presume the area is full */ | 
 | 	*full = 1; | 
 |  | 
 | 	for (i = 0; i < ps->exceptions_per_area; i++) { | 
 | 		r = read_exception(ps, i, &de); | 
 |  | 
 | 		if (r) | 
 | 			return r; | 
 |  | 
 | 		/* | 
 | 		 * If the new_chunk is pointing at the start of | 
 | 		 * the COW device, where the first metadata area | 
 | 		 * is we know that we've hit the end of the | 
 | 		 * exceptions.  Therefore the area is not full. | 
 | 		 */ | 
 | 		if (de.new_chunk == 0LL) { | 
 | 			ps->current_committed = i; | 
 | 			*full = 0; | 
 | 			break; | 
 | 		} | 
 |  | 
 | 		/* | 
 | 		 * Keep track of the start of the free chunks. | 
 | 		 */ | 
 | 		if (ps->next_free <= de.new_chunk) | 
 | 			ps->next_free = de.new_chunk + 1; | 
 |  | 
 | 		/* | 
 | 		 * Otherwise we add the exception to the snapshot. | 
 | 		 */ | 
 | 		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); | 
 | 		if (r) | 
 | 			return r; | 
 | 	} | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int read_exceptions(struct pstore *ps) | 
 | { | 
 | 	uint32_t area; | 
 | 	int r, full = 1; | 
 |  | 
 | 	/* | 
 | 	 * Keeping reading chunks and inserting exceptions until | 
 | 	 * we find a partially full area. | 
 | 	 */ | 
 | 	for (area = 0; full; area++) { | 
 | 		r = area_io(ps, area, READ); | 
 | 		if (r) | 
 | 			return r; | 
 |  | 
 | 		r = insert_exceptions(ps, &full); | 
 | 		if (r) | 
 | 			return r; | 
 | 	} | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static inline struct pstore *get_info(struct exception_store *store) | 
 | { | 
 | 	return (struct pstore *) store->context; | 
 | } | 
 |  | 
 | static void persistent_fraction_full(struct exception_store *store, | 
 | 				     sector_t *numerator, sector_t *denominator) | 
 | { | 
 | 	*numerator = get_info(store)->next_free * store->snap->chunk_size; | 
 | 	*denominator = get_dev_size(store->snap->cow->bdev); | 
 | } | 
 |  | 
 | static void persistent_destroy(struct exception_store *store) | 
 | { | 
 | 	struct pstore *ps = get_info(store); | 
 |  | 
 | 	dm_io_put(sectors_to_pages(ps->chunk_size)); | 
 | 	vfree(ps->callbacks); | 
 | 	free_area(ps); | 
 | 	kfree(ps); | 
 | } | 
 |  | 
 | static int persistent_read_metadata(struct exception_store *store) | 
 | { | 
 | 	int r, new_snapshot; | 
 | 	struct pstore *ps = get_info(store); | 
 |  | 
 | 	/* | 
 | 	 * Read the snapshot header. | 
 | 	 */ | 
 | 	r = read_header(ps, &new_snapshot); | 
 | 	if (r) | 
 | 		return r; | 
 |  | 
 | 	/* | 
 | 	 * Do we need to setup a new snapshot ? | 
 | 	 */ | 
 | 	if (new_snapshot) { | 
 | 		r = write_header(ps); | 
 | 		if (r) { | 
 | 			DMWARN("write_header failed"); | 
 | 			return r; | 
 | 		} | 
 |  | 
 | 		r = zero_area(ps, 0); | 
 | 		if (r) { | 
 | 			DMWARN("zero_area(0) failed"); | 
 | 			return r; | 
 | 		} | 
 |  | 
 | 	} else { | 
 | 		/* | 
 | 		 * Sanity checks. | 
 | 		 */ | 
 | 		if (!ps->valid) { | 
 | 			DMWARN("snapshot is marked invalid"); | 
 | 			return -EINVAL; | 
 | 		} | 
 |  | 
 | 		if (ps->version != SNAPSHOT_DISK_VERSION) { | 
 | 			DMWARN("unable to handle snapshot disk version %d", | 
 | 			       ps->version); | 
 | 			return -EINVAL; | 
 | 		} | 
 |  | 
 | 		/* | 
 | 		 * Read the metadata. | 
 | 		 */ | 
 | 		r = read_exceptions(ps); | 
 | 		if (r) | 
 | 			return r; | 
 | 	} | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int persistent_prepare(struct exception_store *store, | 
 | 			      struct exception *e) | 
 | { | 
 | 	struct pstore *ps = get_info(store); | 
 | 	uint32_t stride; | 
 | 	sector_t size = get_dev_size(store->snap->cow->bdev); | 
 |  | 
 | 	/* Is there enough room ? */ | 
 | 	if (size < ((ps->next_free + 1) * store->snap->chunk_size)) | 
 | 		return -ENOSPC; | 
 |  | 
 | 	e->new_chunk = ps->next_free; | 
 |  | 
 | 	/* | 
 | 	 * Move onto the next free pending, making sure to take | 
 | 	 * into account the location of the metadata chunks. | 
 | 	 */ | 
 | 	stride = (ps->exceptions_per_area + 1); | 
 | 	if ((++ps->next_free % stride) == 1) | 
 | 		ps->next_free++; | 
 |  | 
 | 	atomic_inc(&ps->pending_count); | 
 | 	return 0; | 
 | } | 
 |  | 
 | static void persistent_commit(struct exception_store *store, | 
 | 			      struct exception *e, | 
 | 			      void (*callback) (void *, int success), | 
 | 			      void *callback_context) | 
 | { | 
 | 	int r; | 
 | 	unsigned int i; | 
 | 	struct pstore *ps = get_info(store); | 
 | 	struct disk_exception de; | 
 | 	struct commit_callback *cb; | 
 |  | 
 | 	de.old_chunk = e->old_chunk; | 
 | 	de.new_chunk = e->new_chunk; | 
 | 	write_exception(ps, ps->current_committed++, &de); | 
 |  | 
 | 	/* | 
 | 	 * Add the callback to the back of the array.  This code | 
 | 	 * is the only place where the callback array is | 
 | 	 * manipulated, and we know that it will never be called | 
 | 	 * multiple times concurrently. | 
 | 	 */ | 
 | 	cb = ps->callbacks + ps->callback_count++; | 
 | 	cb->callback = callback; | 
 | 	cb->context = callback_context; | 
 |  | 
 | 	/* | 
 | 	 * If there are no more exceptions in flight, or we have | 
 | 	 * filled this metadata area we commit the exceptions to | 
 | 	 * disk. | 
 | 	 */ | 
 | 	if (atomic_dec_and_test(&ps->pending_count) || | 
 | 	    (ps->current_committed == ps->exceptions_per_area)) { | 
 | 		r = area_io(ps, ps->current_area, WRITE); | 
 | 		if (r) | 
 | 			ps->valid = 0; | 
 |  | 
 | 		for (i = 0; i < ps->callback_count; i++) { | 
 | 			cb = ps->callbacks + i; | 
 | 			cb->callback(cb->context, r == 0 ? 1 : 0); | 
 | 		} | 
 |  | 
 | 		ps->callback_count = 0; | 
 | 	} | 
 |  | 
 | 	/* | 
 | 	 * Have we completely filled the current area ? | 
 | 	 */ | 
 | 	if (ps->current_committed == ps->exceptions_per_area) { | 
 | 		ps->current_committed = 0; | 
 | 		r = zero_area(ps, ps->current_area + 1); | 
 | 		if (r) | 
 | 			ps->valid = 0; | 
 | 	} | 
 | } | 
 |  | 
 | static void persistent_drop(struct exception_store *store) | 
 | { | 
 | 	struct pstore *ps = get_info(store); | 
 |  | 
 | 	ps->valid = 0; | 
 | 	if (write_header(ps)) | 
 | 		DMWARN("write header failed"); | 
 | } | 
 |  | 
 | int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) | 
 | { | 
 | 	int r; | 
 | 	struct pstore *ps; | 
 |  | 
 | 	r = dm_io_get(sectors_to_pages(chunk_size)); | 
 | 	if (r) | 
 | 		return r; | 
 |  | 
 | 	/* allocate the pstore */ | 
 | 	ps = kmalloc(sizeof(*ps), GFP_KERNEL); | 
 | 	if (!ps) { | 
 | 		r = -ENOMEM; | 
 | 		goto bad; | 
 | 	} | 
 |  | 
 | 	ps->snap = store->snap; | 
 | 	ps->valid = 1; | 
 | 	ps->version = SNAPSHOT_DISK_VERSION; | 
 | 	ps->chunk_size = chunk_size; | 
 | 	ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / | 
 | 	    sizeof(struct disk_exception); | 
 | 	ps->next_free = 2;	/* skipping the header and first area */ | 
 | 	ps->current_committed = 0; | 
 |  | 
 | 	r = alloc_area(ps); | 
 | 	if (r) | 
 | 		goto bad; | 
 |  | 
 | 	/* | 
 | 	 * Allocate space for all the callbacks. | 
 | 	 */ | 
 | 	ps->callback_count = 0; | 
 | 	atomic_set(&ps->pending_count, 0); | 
 | 	ps->callbacks = dm_vcalloc(ps->exceptions_per_area, | 
 | 				   sizeof(*ps->callbacks)); | 
 |  | 
 | 	if (!ps->callbacks) { | 
 | 		r = -ENOMEM; | 
 | 		goto bad; | 
 | 	} | 
 |  | 
 | 	store->destroy = persistent_destroy; | 
 | 	store->read_metadata = persistent_read_metadata; | 
 | 	store->prepare_exception = persistent_prepare; | 
 | 	store->commit_exception = persistent_commit; | 
 | 	store->drop_snapshot = persistent_drop; | 
 | 	store->fraction_full = persistent_fraction_full; | 
 | 	store->context = ps; | 
 |  | 
 | 	return 0; | 
 |  | 
 |       bad: | 
 | 	dm_io_put(sectors_to_pages(chunk_size)); | 
 | 	if (ps) { | 
 | 		if (ps->area) | 
 | 			free_area(ps); | 
 |  | 
 | 		kfree(ps); | 
 | 	} | 
 | 	return r; | 
 | } | 
 |  | 
 | /*----------------------------------------------------------------- | 
 |  * Implementation of the store for non-persistent snapshots. | 
 |  *---------------------------------------------------------------*/ | 
 | struct transient_c { | 
 | 	sector_t next_free; | 
 | }; | 
 |  | 
 | static void transient_destroy(struct exception_store *store) | 
 | { | 
 | 	kfree(store->context); | 
 | } | 
 |  | 
 | static int transient_read_metadata(struct exception_store *store) | 
 | { | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int transient_prepare(struct exception_store *store, struct exception *e) | 
 | { | 
 | 	struct transient_c *tc = (struct transient_c *) store->context; | 
 | 	sector_t size = get_dev_size(store->snap->cow->bdev); | 
 |  | 
 | 	if (size < (tc->next_free + store->snap->chunk_size)) | 
 | 		return -1; | 
 |  | 
 | 	e->new_chunk = sector_to_chunk(store->snap, tc->next_free); | 
 | 	tc->next_free += store->snap->chunk_size; | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static void transient_commit(struct exception_store *store, | 
 | 		      struct exception *e, | 
 | 		      void (*callback) (void *, int success), | 
 | 		      void *callback_context) | 
 | { | 
 | 	/* Just succeed */ | 
 | 	callback(callback_context, 1); | 
 | } | 
 |  | 
 | static void transient_fraction_full(struct exception_store *store, | 
 | 				    sector_t *numerator, sector_t *denominator) | 
 | { | 
 | 	*numerator = ((struct transient_c *) store->context)->next_free; | 
 | 	*denominator = get_dev_size(store->snap->cow->bdev); | 
 | } | 
 |  | 
 | int dm_create_transient(struct exception_store *store, | 
 | 			struct dm_snapshot *s, int blocksize) | 
 | { | 
 | 	struct transient_c *tc; | 
 |  | 
 | 	memset(store, 0, sizeof(*store)); | 
 | 	store->destroy = transient_destroy; | 
 | 	store->read_metadata = transient_read_metadata; | 
 | 	store->prepare_exception = transient_prepare; | 
 | 	store->commit_exception = transient_commit; | 
 | 	store->fraction_full = transient_fraction_full; | 
 | 	store->snap = s; | 
 |  | 
 | 	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); | 
 | 	if (!tc) | 
 | 		return -ENOMEM; | 
 |  | 
 | 	tc->next_free = 0; | 
 | 	store->context = tc; | 
 |  | 
 | 	return 0; | 
 | } |