dm: add persistent data library

The persistent-data library offers a re-usable framework for the storage
and management of on-disk metadata in device-mapper targets.

It's used by the thin-provisioning target in the next patch and in an
upcoming hierarchical storage target.

For further information, please read
Documentation/device-mapper/persistent-data.txt

Signed-off-by: Joe Thornber <thornber@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
new file mode 100644
index 0000000..0317ecd
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#include "dm-block-manager.h"
+#include "dm-persistent-data-internal.h"
+#include "../dm-bufio.h"
+
+#include <linux/crc32c.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/device-mapper.h>
+#include <linux/stacktrace.h>
+
+#define DM_MSG_PREFIX "block manager"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This is a read/write semaphore with a couple of differences.
+ *
+ * i) There is a restriction on the number of concurrent read locks that
+ * may be held at once.  This is just an implementation detail.
+ *
+ * ii) Recursive locking attempts are detected and return EINVAL.  A stack
+ * trace is also emitted for the previous lock aquisition.
+ *
+ * iii) Priority is given to write locks.
+ */
+#define MAX_HOLDERS 4
+#define MAX_STACK 10
+
+typedef unsigned long stack_entries[MAX_STACK];
+
+struct block_lock {
+	spinlock_t lock;
+	__s32 count;
+	struct list_head waiters;
+	struct task_struct *holders[MAX_HOLDERS];
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	struct stack_trace traces[MAX_HOLDERS];
+	stack_entries entries[MAX_HOLDERS];
+#endif
+};
+
+struct waiter {
+	struct list_head list;
+	struct task_struct *task;
+	int wants_write;
+};
+
+static unsigned __find_holder(struct block_lock *lock,
+			      struct task_struct *task)
+{
+	unsigned i;
+
+	for (i = 0; i < MAX_HOLDERS; i++)
+		if (lock->holders[i] == task)
+			break;
+
+	BUG_ON(i == MAX_HOLDERS);
+	return i;
+}
+
+/* call this *after* you increment lock->count */
+static void __add_holder(struct block_lock *lock, struct task_struct *task)
+{
+	unsigned h = __find_holder(lock, NULL);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	struct stack_trace *t;
+#endif
+
+	get_task_struct(task);
+	lock->holders[h] = task;
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	t = lock->traces + h;
+	t->nr_entries = 0;
+	t->max_entries = MAX_STACK;
+	t->entries = lock->entries[h];
+	t->skip = 2;
+	save_stack_trace(t);
+#endif
+}
+
+/* call this *before* you decrement lock->count */
+static void __del_holder(struct block_lock *lock, struct task_struct *task)
+{
+	unsigned h = __find_holder(lock, task);
+	lock->holders[h] = NULL;
+	put_task_struct(task);
+}
+
+static int __check_holder(struct block_lock *lock)
+{
+	unsigned i;
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	static struct stack_trace t;
+	static stack_entries entries;
+#endif
+
+	for (i = 0; i < MAX_HOLDERS; i++) {
+		if (lock->holders[i] == current) {
+			DMERR("recursive lock detected in pool metadata");
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+			DMERR("previously held here:");
+			print_stack_trace(lock->traces + i, 4);
+
+			DMERR("subsequent aquisition attempted here:");
+			t.nr_entries = 0;
+			t.max_entries = MAX_STACK;
+			t.entries = entries;
+			t.skip = 3;
+			save_stack_trace(&t);
+			print_stack_trace(&t, 4);
+#endif
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static void __wait(struct waiter *w)
+{
+	for (;;) {
+		set_task_state(current, TASK_UNINTERRUPTIBLE);
+
+		if (!w->task)
+			break;
+
+		schedule();
+	}
+
+	set_task_state(current, TASK_RUNNING);
+}
+
+static void __wake_waiter(struct waiter *w)
+{
+	struct task_struct *task;
+
+	list_del(&w->list);
+	task = w->task;
+	smp_mb();
+	w->task = NULL;
+	wake_up_process(task);
+}
+
+/*
+ * We either wake a few readers or a single writer.
+ */
+static void __wake_many(struct block_lock *lock)
+{
+	struct waiter *w, *tmp;
+
+	BUG_ON(lock->count < 0);
+	list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
+		if (lock->count >= MAX_HOLDERS)
+			return;
+
+		if (w->wants_write) {
+			if (lock->count > 0)
+				return; /* still read locked */
+
+			lock->count = -1;
+			__add_holder(lock, w->task);
+			__wake_waiter(w);
+			return;
+		}
+
+		lock->count++;
+		__add_holder(lock, w->task);
+		__wake_waiter(w);
+	}
+}
+
+static void bl_init(struct block_lock *lock)
+{
+	int i;
+
+	spin_lock_init(&lock->lock);
+	lock->count = 0;
+	INIT_LIST_HEAD(&lock->waiters);
+	for (i = 0; i < MAX_HOLDERS; i++)
+		lock->holders[i] = NULL;
+}
+
+static int __available_for_read(struct block_lock *lock)
+{
+	return lock->count >= 0 &&
+		lock->count < MAX_HOLDERS &&
+		list_empty(&lock->waiters);
+}
+
+static int bl_down_read(struct block_lock *lock)
+{
+	int r;
+	struct waiter w;
+
+	spin_lock(&lock->lock);
+	r = __check_holder(lock);
+	if (r) {
+		spin_unlock(&lock->lock);
+		return r;
+	}
+
+	if (__available_for_read(lock)) {
+		lock->count++;
+		__add_holder(lock, current);
+		spin_unlock(&lock->lock);
+		return 0;
+	}
+
+	get_task_struct(current);
+
+	w.task = current;
+	w.wants_write = 0;
+	list_add_tail(&w.list, &lock->waiters);
+	spin_unlock(&lock->lock);
+
+	__wait(&w);
+	put_task_struct(current);
+	return 0;
+}
+
+static int bl_down_read_nonblock(struct block_lock *lock)
+{
+	int r;
+
+	spin_lock(&lock->lock);
+	r = __check_holder(lock);
+	if (r)
+		goto out;
+
+	if (__available_for_read(lock)) {
+		lock->count++;
+		__add_holder(lock, current);
+		r = 0;
+	} else
+		r = -EWOULDBLOCK;
+
+out:
+	spin_unlock(&lock->lock);
+	return r;
+}
+
+static void bl_up_read(struct block_lock *lock)
+{
+	spin_lock(&lock->lock);
+	BUG_ON(lock->count <= 0);
+	__del_holder(lock, current);
+	--lock->count;
+	if (!list_empty(&lock->waiters))
+		__wake_many(lock);
+	spin_unlock(&lock->lock);
+}
+
+static int bl_down_write(struct block_lock *lock)
+{
+	int r;
+	struct waiter w;
+
+	spin_lock(&lock->lock);
+	r = __check_holder(lock);
+	if (r) {
+		spin_unlock(&lock->lock);
+		return r;
+	}
+
+	if (lock->count == 0 && list_empty(&lock->waiters)) {
+		lock->count = -1;
+		__add_holder(lock, current);
+		spin_unlock(&lock->lock);
+		return 0;
+	}
+
+	get_task_struct(current);
+	w.task = current;
+	w.wants_write = 1;
+
+	/*
+	 * Writers given priority. We know there's only one mutator in the
+	 * system, so ignoring the ordering reversal.
+	 */
+	list_add(&w.list, &lock->waiters);
+	spin_unlock(&lock->lock);
+
+	__wait(&w);
+	put_task_struct(current);
+
+	return 0;
+}
+
+static void bl_up_write(struct block_lock *lock)
+{
+	spin_lock(&lock->lock);
+	__del_holder(lock, current);
+	lock->count = 0;
+	if (!list_empty(&lock->waiters))
+		__wake_many(lock);
+	spin_unlock(&lock->lock);
+}
+
+static void report_recursive_bug(dm_block_t b, int r)
+{
+	if (r == -EINVAL)
+		DMERR("recursive acquisition of block %llu requested.",
+		      (unsigned long long) b);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Block manager is currently implemented using dm-bufio.  struct
+ * dm_block_manager and struct dm_block map directly onto a couple of
+ * structs in the bufio interface.  I want to retain the freedom to move
+ * away from bufio in the future.  So these structs are just cast within
+ * this .c file, rather than making it through to the public interface.
+ */
+static struct dm_buffer *to_buffer(struct dm_block *b)
+{
+	return (struct dm_buffer *) b;
+}
+
+static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
+{
+	return (struct dm_bufio_client *) bm;
+}
+
+dm_block_t dm_block_location(struct dm_block *b)
+{
+	return dm_bufio_get_block_number(to_buffer(b));
+}
+EXPORT_SYMBOL_GPL(dm_block_location);
+
+void *dm_block_data(struct dm_block *b)
+{
+	return dm_bufio_get_block_data(to_buffer(b));
+}
+EXPORT_SYMBOL_GPL(dm_block_data);
+
+struct buffer_aux {
+	struct dm_block_validator *validator;
+	struct block_lock lock;
+	int write_locked;
+};
+
+static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
+{
+	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+	aux->validator = NULL;
+	bl_init(&aux->lock);
+}
+
+static void dm_block_manager_write_callback(struct dm_buffer *buf)
+{
+	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+	if (aux->validator) {
+		aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
+			 dm_bufio_get_block_size(dm_bufio_get_client(buf)));
+	}
+}
+
+/*----------------------------------------------------------------
+ * Public interface
+ *--------------------------------------------------------------*/
+struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
+						 unsigned block_size,
+						 unsigned cache_size,
+						 unsigned max_held_per_thread)
+{
+	return (struct dm_block_manager *)
+		dm_bufio_client_create(bdev, block_size, max_held_per_thread,
+				       sizeof(struct buffer_aux),
+				       dm_block_manager_alloc_callback,
+				       dm_block_manager_write_callback);
+}
+EXPORT_SYMBOL_GPL(dm_block_manager_create);
+
+void dm_block_manager_destroy(struct dm_block_manager *bm)
+{
+	return dm_bufio_client_destroy(to_bufio(bm));
+}
+EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
+
+unsigned dm_bm_block_size(struct dm_block_manager *bm)
+{
+	return dm_bufio_get_block_size(to_bufio(bm));
+}
+EXPORT_SYMBOL_GPL(dm_bm_block_size);
+
+dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
+{
+	return dm_bufio_get_device_size(to_bufio(bm));
+}
+
+static int dm_bm_validate_buffer(struct dm_block_manager *bm,
+				 struct dm_buffer *buf,
+				 struct buffer_aux *aux,
+				 struct dm_block_validator *v)
+{
+	if (unlikely(!aux->validator)) {
+		int r;
+		if (!v)
+			return 0;
+		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm)));
+		if (unlikely(r))
+			return r;
+		aux->validator = v;
+	} else {
+		if (unlikely(aux->validator != v)) {
+			DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
+				aux->validator->name, v ? v->name : "NULL",
+				(unsigned long long)
+					dm_bufio_get_block_number(buf));
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
+		    struct dm_block_validator *v,
+		    struct dm_block **result)
+{
+	struct buffer_aux *aux;
+	void *p;
+	int r;
+
+	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_read(&aux->lock);
+	if (unlikely(r)) {
+		dm_bufio_release(to_buffer(*result));
+		report_recursive_bug(b, r);
+		return r;
+	}
+
+	aux->write_locked = 0;
+
+	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+	if (unlikely(r)) {
+		bl_up_read(&aux->lock);
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_read_lock);
+
+int dm_bm_write_lock(struct dm_block_manager *bm,
+		     dm_block_t b, struct dm_block_validator *v,
+		     struct dm_block **result)
+{
+	struct buffer_aux *aux;
+	void *p;
+	int r;
+
+	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_write(&aux->lock);
+	if (r) {
+		dm_bufio_release(to_buffer(*result));
+		report_recursive_bug(b, r);
+		return r;
+	}
+
+	aux->write_locked = 1;
+
+	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+	if (unlikely(r)) {
+		bl_up_write(&aux->lock);
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_write_lock);
+
+int dm_bm_read_try_lock(struct dm_block_manager *bm,
+			dm_block_t b, struct dm_block_validator *v,
+			struct dm_block **result)
+{
+	struct buffer_aux *aux;
+	void *p;
+	int r;
+
+	p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+	if (unlikely(!p))
+		return -EWOULDBLOCK;
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_read_nonblock(&aux->lock);
+	if (r < 0) {
+		dm_bufio_release(to_buffer(*result));
+		report_recursive_bug(b, r);
+		return r;
+	}
+	aux->write_locked = 0;
+
+	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+	if (unlikely(r)) {
+		bl_up_read(&aux->lock);
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
+
+	return 0;
+}
+
+int dm_bm_write_lock_zero(struct dm_block_manager *bm,
+			  dm_block_t b, struct dm_block_validator *v,
+			  struct dm_block **result)
+{
+	int r;
+	struct buffer_aux *aux;
+	void *p;
+
+	p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+
+	memset(p, 0, dm_bm_block_size(bm));
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_write(&aux->lock);
+	if (r) {
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
+
+	aux->write_locked = 1;
+	aux->validator = v;
+
+	return 0;
+}
+
+int dm_bm_unlock(struct dm_block *b)
+{
+	struct buffer_aux *aux;
+	aux = dm_bufio_get_aux_data(to_buffer(b));
+
+	if (aux->write_locked) {
+		dm_bufio_mark_buffer_dirty(to_buffer(b));
+		bl_up_write(&aux->lock);
+	} else
+		bl_up_read(&aux->lock);
+
+	dm_bufio_release(to_buffer(b));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_unlock);
+
+int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
+{
+	struct buffer_aux *aux;
+
+	aux = dm_bufio_get_aux_data(to_buffer(b));
+
+	if (aux->write_locked) {
+		dm_bufio_mark_buffer_dirty(to_buffer(b));
+		bl_up_write(&aux->lock);
+	} else
+		bl_up_read(&aux->lock);
+
+	dm_bufio_release_move(to_buffer(b), n);
+	return 0;
+}
+
+int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
+			   struct dm_block *superblock)
+{
+	int r;
+
+	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+	r = dm_bufio_issue_flush(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+
+	dm_bm_unlock(superblock);
+
+	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+	r = dm_bufio_issue_flush(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+
+	return 0;
+}
+
+u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
+{
+	return crc32c(~(u32) 0, data, len) ^ init_xor;
+}
+EXPORT_SYMBOL_GPL(dm_bm_checksum);
+
+/*----------------------------------------------------------------*/
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_DESCRIPTION("Immutable metadata library for dm");
+
+/*----------------------------------------------------------------*/