blob: d97611e2b4ee319c1431f8e57d41c007f3537e6b [file] [log] [blame]
/*
 * rbd.c -- Export ceph rados objects as a Linux block device
 *
 *
 * based on drivers/block/osdblk.c:
 *
 * Copyright 2009 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *
 *
 * For usage instructions, please refer to:
 *
 *	Documentation/ABI/testing/sysfs-bus-rbd
 *
 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder9e15b772012-10-30 19:40:33 -050073/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050075#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050076
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050078
Alex Elderd8891402012-10-09 13:50:17 -070079/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
Alex Elder81a89792012-02-02 08:13:30 -060087/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070093#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060094#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095
Alex Eldercc0538b2012-08-10 13:12:07 -070096#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070097
Yehuda Sadeh602adf42010-08-12 16:11:25 -070098/*
99 * block device image metadata (in-memory version)
100 */
101struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -0500102 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500103 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -0500104 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700105 __u8 obj_order;
106 __u8 crypt_type;
107 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700108
Alex Elderf84344f2012-08-31 17:29:51 -0500109 /* The remaining fields need to be updated occasionally */
110 u64 image_size;
111 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700112 char *snap_names;
113 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700114
115 u64 obj_version;
116};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
143struct rbd_spec {
144 u64 pool_id;
145 char *pool_name;
146
147 char *image_id;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500148 char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500149
150 u64 snap_id;
151 char *snap_name;
152
153 struct kref kref;
154};
155
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700156struct rbd_options {
Alex Eldercc0538b2012-08-10 13:12:07 -0700157 bool read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700158};
159
160/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600161 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700162 */
163struct rbd_client {
164 struct ceph_client *client;
165 struct kref kref;
166 struct list_head node;
167};
168
169/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600170 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700171 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700172struct rbd_req_status {
173 int done;
174 int rc;
175 u64 bytes;
176};
177
178/*
179 * a collection of requests
180 */
181struct rbd_req_coll {
182 int total;
183 int num_done;
184 struct kref kref;
185 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700186};
187
Alex Elderf0f8cef2012-01-29 13:57:44 -0600188/*
189 * a single io request
190 */
191struct rbd_request {
192 struct request *rq; /* blk layer request */
193 struct bio *bio; /* cloned bio */
194 struct page **pages; /* list of used pages */
195 u64 len;
196 int coll_index;
197 struct rbd_req_coll *coll;
198};
199
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800200struct rbd_snap {
201 struct device dev;
202 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800203 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800204 struct list_head node;
205 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500206 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800207};
208
Alex Elderf84344f2012-08-31 17:29:51 -0500209struct rbd_mapping {
Alex Elder99c1f082012-08-30 14:42:15 -0500210 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500211 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500212 bool read_only;
213};
214
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700215/*
216 * a single device
217 */
218struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500219 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700220
221 int major; /* blkdev assigned major */
222 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700223
Alex Eldera30b71b2012-07-10 20:30:11 -0500224 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700225 struct rbd_client *rbd_client;
226
227 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
228
229 spinlock_t lock; /* queue lock */
230
231 struct rbd_image_header header;
Alex Elderdaba5fd2012-10-26 17:25:23 -0500232 bool exists;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500233 struct rbd_spec *spec;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700234
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500235 char *header_name;
Alex Elder971f8392012-10-25 23:34:41 -0500236
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700237 struct ceph_osd_event *watch_event;
238 struct ceph_osd_request *watch_request;
239
Alex Elder86b00e02012-10-25 23:34:42 -0500240 struct rbd_spec *parent_spec;
241 u64 parent_overlap;
242
Josh Durginc6666012011-11-21 17:11:12 -0800243 /* protects updating the header */
244 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500245
246 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700247
248 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800249
250 /* list of snapshots */
251 struct list_head snaps;
252
253 /* sysfs related */
254 struct device dev;
Alex Elder42382b72012-11-16 09:29:16 -0600255 unsigned long open_count;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800256};
257
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700258static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600259
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700260static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600261static DEFINE_SPINLOCK(rbd_dev_list_lock);
262
Alex Elder432b8582012-01-29 13:57:44 -0600263static LIST_HEAD(rbd_client_list); /* clients */
264static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700265
Alex Elder304f6802012-08-31 17:29:52 -0500266static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
267static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
268
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800269static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500270static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271
Alex Elderf0f8cef2012-01-29 13:57:44 -0600272static ssize_t rbd_add(struct bus_type *bus, const char *buf,
273 size_t count);
274static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
275 size_t count);
276
277static struct bus_attribute rbd_bus_attrs[] = {
278 __ATTR(add, S_IWUSR, NULL, rbd_add),
279 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
280 __ATTR_NULL
281};
282
283static struct bus_type rbd_bus_type = {
284 .name = "rbd",
285 .bus_attrs = rbd_bus_attrs,
286};
287
/* Intentionally empty: rbd_root_dev is statically allocated, nothing to free. */
static void rbd_root_dev_release(struct device *dev)
{
}
291
292static struct device rbd_root_dev = {
293 .init_name = "rbd",
294 .release = rbd_root_dev_release,
295};
296
#ifdef RBD_DEBUG
/*
 * Assertion that BUG()s on failure.  Wrapped in do { } while (0) so it
 * expands to a single statement and is safe in unbraced if/else bodies
 * (the bare-if form has a dangling-else hazard).
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800309
Alex Elder117973f2012-08-31 17:29:55 -0500310static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
311static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700312
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700313static int rbd_open(struct block_device *bdev, fmode_t mode)
314{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600315 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316
Alex Elderf84344f2012-08-31 17:29:51 -0500317 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318 return -EROFS;
319
Alex Elder42382b72012-11-16 09:29:16 -0600320 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600321 (void) get_device(&rbd_dev->dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500322 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder42382b72012-11-16 09:29:16 -0600323 rbd_dev->open_count++;
324 mutex_unlock(&ctl_mutex);
Alex Elder340c7a22012-08-10 13:12:07 -0700325
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700326 return 0;
327}
328
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800329static int rbd_release(struct gendisk *disk, fmode_t mode)
330{
331 struct rbd_device *rbd_dev = disk->private_data;
332
Alex Elder42382b72012-11-16 09:29:16 -0600333 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
334 rbd_assert(rbd_dev->open_count > 0);
335 rbd_dev->open_count--;
Alex Elderc3e946c2012-11-16 09:29:16 -0600336 put_device(&rbd_dev->dev);
Alex Elder42382b72012-11-16 09:29:16 -0600337 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800338
339 return 0;
340}
341
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700342static const struct block_device_operations rbd_bd_ops = {
343 .owner = THIS_MODULE,
344 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800345 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700346};
347
348/*
349 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500350 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351 */
Alex Elderf8c38922012-08-10 13:12:07 -0700352static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700353{
354 struct rbd_client *rbdc;
355 int ret = -ENOMEM;
356
357 dout("rbd_client_create\n");
358 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
359 if (!rbdc)
360 goto out_opt;
361
362 kref_init(&rbdc->kref);
363 INIT_LIST_HEAD(&rbdc->node);
364
Alex Elderbc534d862012-01-29 13:57:44 -0600365 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
366
Alex Elder43ae4702012-07-03 16:01:18 -0500367 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700368 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600369 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500370 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700371
372 ret = ceph_open_session(rbdc->client);
373 if (ret < 0)
374 goto out_err;
375
Alex Elder432b8582012-01-29 13:57:44 -0600376 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700377 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600378 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700379
Alex Elderbc534d862012-01-29 13:57:44 -0600380 mutex_unlock(&ctl_mutex);
381
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700382 dout("rbd_client_create created %p\n", rbdc);
383 return rbdc;
384
385out_err:
386 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600387out_mutex:
388 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700389 kfree(rbdc);
390out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500391 if (ceph_opts)
392 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400393 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700394}
395
396/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700397 * Find a ceph client with specific addr and configuration. If
398 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700399 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700400static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700401{
402 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700403 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700404
Alex Elder43ae4702012-07-03 16:01:18 -0500405 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700406 return NULL;
407
Alex Elder1f7ba332012-08-10 13:12:07 -0700408 spin_lock(&rbd_client_list_lock);
409 list_for_each_entry(client_node, &rbd_client_list, node) {
410 if (!ceph_compare_options(ceph_opts, client_node->client)) {
411 kref_get(&client_node->kref);
412 found = true;
413 break;
414 }
415 }
416 spin_unlock(&rbd_client_list_lock);
417
418 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700419}
420
421/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700422 * mount options
423 */
/*
 * Mount option token values.  The Opt_last_* sentinels partition the
 * token space so parse_rbd_opts_token() can tell int, string, and
 * Boolean arguments apart by range.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
434
Alex Elder43ae4702012-07-03 16:01:18 -0500435static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700436 /* int args above */
437 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500438 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700439 {Opt_read_only, "ro"}, /* Alternate spelling */
440 {Opt_read_write, "read_write"},
441 {Opt_read_write, "rw"}, /* Alternate spelling */
442 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700443 {-1, NULL}
444};
445
446static int parse_rbd_opts_token(char *c, void *private)
447{
Alex Elder43ae4702012-07-03 16:01:18 -0500448 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700449 substring_t argstr[MAX_OPT_ARGS];
450 int token, intval, ret;
451
Alex Elder43ae4702012-07-03 16:01:18 -0500452 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700453 if (token < 0)
454 return -EINVAL;
455
456 if (token < Opt_last_int) {
457 ret = match_int(&argstr[0], &intval);
458 if (ret < 0) {
459 pr_err("bad mount option arg (not int) "
460 "at '%s'\n", c);
461 return ret;
462 }
463 dout("got int token %d val %d\n", token, intval);
464 } else if (token > Opt_last_int && token < Opt_last_string) {
465 dout("got string token %d val %s\n", token,
466 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700467 } else if (token > Opt_last_string && token < Opt_last_bool) {
468 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700469 } else {
470 dout("got token %d\n", token);
471 }
472
473 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700474 case Opt_read_only:
475 rbd_opts->read_only = true;
476 break;
477 case Opt_read_write:
478 rbd_opts->read_only = false;
479 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700480 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500481 rbd_assert(false);
482 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700483 }
484 return 0;
485}
486
487/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700488 * Get a ceph client with specific addr and configuration, if one does
489 * not exist create it.
490 */
/*
 * Get a ceph client with specific addr and configuration; create one
 * if no shareable match exists.  Consumes ceph_opts either way: a
 * reused client destroys them, a new client takes ownership.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
503
/*
 * Destroy ceph client (kref release callback).
 *
 * Acquires rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT already hold it.
 */
509static void rbd_client_release(struct kref *kref)
510{
511 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
512
513 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500514 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700515 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500516 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700517
518 ceph_destroy_client(rbdc->client);
519 kfree(rbdc);
520}
521
522/*
523 * Drop reference to ceph client node. If it's not referenced anymore, release
524 * it.
525 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500526static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700527{
Alex Elderc53d5892012-10-25 23:34:42 -0500528 if (rbdc)
529 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700530}
531
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700532/*
533 * Destroy requests collection
534 */
535static void rbd_coll_release(struct kref *kref)
536{
537 struct rbd_req_coll *coll =
538 container_of(kref, struct rbd_req_coll, kref);
539
540 dout("rbd_coll_release %p\n", coll);
541 kfree(coll);
542}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700543
Alex Eldera30b71b2012-07-10 20:30:11 -0500544static bool rbd_image_format_valid(u32 image_format)
545{
546 return image_format == 1 || image_format == 2;
547}
548
Alex Elder8e94af82012-07-25 09:32:40 -0500549static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
550{
Alex Elder103a1502012-08-02 11:29:45 -0500551 size_t size;
552 u32 snap_count;
553
554 /* The header has to start with the magic rbd header text */
555 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
556 return false;
557
Alex Elderdb2388b2012-10-20 22:17:27 -0500558 /* The bio layer requires at least sector-sized I/O */
559
560 if (ondisk->options.order < SECTOR_SHIFT)
561 return false;
562
563 /* If we use u64 in a few spots we may be able to loosen this */
564
565 if (ondisk->options.order > 8 * sizeof (int) - 1)
566 return false;
567
Alex Elder103a1502012-08-02 11:29:45 -0500568 /*
569 * The size of a snapshot header has to fit in a size_t, and
570 * that limits the number of snapshots.
571 */
572 snap_count = le32_to_cpu(ondisk->snap_count);
573 size = SIZE_MAX - sizeof (struct ceph_snap_context);
574 if (snap_count > size / sizeof (__le64))
575 return false;
576
577 /*
578 * Not only that, but the size of the entire the snapshot
579 * header must also be representable in a size_t.
580 */
581 size -= snap_count * sizeof (__le64);
582 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
583 return false;
584
585 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500586}
587
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700588/*
589 * Create a new header structure, translate header format from the on-disk
590 * header.
591 */
592static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500593 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700594{
Alex Elderccece232012-07-10 20:30:10 -0500595 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500596 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500597 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500598 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700599
Alex Elder6a523252012-07-19 17:12:59 -0500600 memset(header, 0, sizeof (*header));
601
Alex Elder103a1502012-08-02 11:29:45 -0500602 snap_count = le32_to_cpu(ondisk->snap_count);
603
Alex Elder58c17b02012-08-23 23:22:06 -0500604 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
605 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500606 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700607 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500608 memcpy(header->object_prefix, ondisk->object_prefix, len);
609 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600610
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700611 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500612 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
613
Alex Elder621901d2012-08-23 23:22:06 -0500614 /* Save a copy of the snapshot names */
615
Alex Elderf785cc12012-08-23 23:22:06 -0500616 if (snap_names_len > (u64) SIZE_MAX)
617 return -EIO;
618 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500620 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500621 /*
622 * Note that rbd_dev_v1_header_read() guarantees
623 * the ondisk buffer we're working with has
624 * snap_names_len bytes beyond the end of the
625 * snapshot id array, this memcpy() is safe.
626 */
627 memcpy(header->snap_names, &ondisk->snaps[snap_count],
628 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500629
Alex Elder621901d2012-08-23 23:22:06 -0500630 /* Record each snapshot's size */
631
Alex Elderd2bb24e2012-07-26 23:37:14 -0500632 size = snap_count * sizeof (*header->snap_sizes);
633 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700634 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500635 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500636 for (i = 0; i < snap_count; i++)
637 header->snap_sizes[i] =
638 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639 } else {
Alex Elderccece232012-07-10 20:30:10 -0500640 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700641 header->snap_names = NULL;
642 header->snap_sizes = NULL;
643 }
Alex Elder849b4262012-07-09 21:04:24 -0500644
Alex Elder34b13182012-07-13 20:35:12 -0500645 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646 header->obj_order = ondisk->options.order;
647 header->crypt_type = ondisk->options.crypt_type;
648 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500649
Alex Elder621901d2012-08-23 23:22:06 -0500650 /* Allocate and fill in the snapshot context */
651
Alex Elderf84344f2012-08-31 17:29:51 -0500652 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500653 size = sizeof (struct ceph_snap_context);
654 size += snap_count * sizeof (header->snapc->snaps[0]);
655 header->snapc = kzalloc(size, GFP_KERNEL);
656 if (!header->snapc)
657 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658
659 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500660 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700661 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500662 for (i = 0; i < snap_count; i++)
663 header->snapc->snaps[i] =
664 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665
666 return 0;
667
Alex Elder6a523252012-07-19 17:12:59 -0500668out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500669 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500670 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700671 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500672 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500673 kfree(header->object_prefix);
674 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500675
Alex Elder00f1f362012-02-07 12:03:36 -0600676 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677}
678
Alex Elder9e15b772012-10-30 19:40:33 -0500679static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
680{
681 struct rbd_snap *snap;
682
683 if (snap_id == CEPH_NOSNAP)
684 return RBD_SNAP_HEAD_NAME;
685
686 list_for_each_entry(snap, &rbd_dev->snaps, node)
687 if (snap_id == snap->id)
688 return snap->name;
689
690 return NULL;
691}
692
Alex Elder8836b992012-08-30 14:42:15 -0500693static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700694{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700695
Alex Eldere86924a2012-07-10 20:30:11 -0500696 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600697
Alex Eldere86924a2012-07-10 20:30:11 -0500698 list_for_each_entry(snap, &rbd_dev->snaps, node) {
699 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500700 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500701 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500702 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600703
Alex Eldere86924a2012-07-10 20:30:11 -0500704 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600705 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700706 }
Alex Eldere86924a2012-07-10 20:30:11 -0500707
Alex Elder00f1f362012-02-07 12:03:36 -0600708 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709}
710
Alex Elder819d52b2012-10-25 23:34:41 -0500711static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700712{
Alex Elder78dc4472012-07-19 08:49:18 -0500713 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700714
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500715 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800716 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500717 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500718 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500719 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500720 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700721 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500722 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723 if (ret < 0)
724 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500725 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726 }
Alex Elderdaba5fd2012-10-26 17:25:23 -0500727 rbd_dev->exists = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700728done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729 return ret;
730}
731
732static void rbd_header_free(struct rbd_image_header *header)
733{
Alex Elder849b4262012-07-09 21:04:24 -0500734 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500735 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700736 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500737 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500738 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500739 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800740 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500741 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700742}
743
Alex Elder65ccfe22012-08-09 10:33:26 -0700744static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700745{
Alex Elder65ccfe22012-08-09 10:33:26 -0700746 char *name;
747 u64 segment;
748 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700749
Alex Elder2fd82b92012-11-09 15:05:54 -0600750 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700751 if (!name)
752 return NULL;
753 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600754 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700755 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600756 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700757 pr_err("error formatting segment name for #%llu (%d)\n",
758 segment, ret);
759 kfree(name);
760 name = NULL;
761 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700762
Alex Elder65ccfe22012-08-09 10:33:26 -0700763 return name;
764}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700765
Alex Elder65ccfe22012-08-09 10:33:26 -0700766static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
767{
768 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700769
Alex Elder65ccfe22012-08-09 10:33:26 -0700770 return offset & (segment_size - 1);
771}
772
773static u64 rbd_segment_length(struct rbd_device *rbd_dev,
774 u64 offset, u64 length)
775{
776 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
777
778 offset &= segment_size - 1;
779
Alex Elderaafb2302012-09-06 16:00:54 -0500780 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700781 if (offset + length > segment_size)
782 length = segment_size - offset;
783
784 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700785}
786
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700787static int rbd_get_num_segments(struct rbd_image_header *header,
788 u64 ofs, u64 len)
789{
Alex Elderdf111be2012-08-09 10:33:26 -0700790 u64 start_seg;
791 u64 end_seg;
792
793 if (!len)
794 return 0;
795 if (len - 1 > U64_MAX - ofs)
796 return -ERANGE;
797
798 start_seg = ofs >> header->obj_order;
799 end_seg = (ofs + len - 1) >> header->obj_order;
800
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700801 return end_seg - start_seg + 1;
802}
803
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700804/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700805 * returns the size of an object in the image
806 */
807static u64 rbd_obj_bytes(struct rbd_image_header *header)
808{
809 return 1 << header->obj_order;
810}
811
812/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700813 * bio helpers
814 */
815
816static void bio_chain_put(struct bio *chain)
817{
818 struct bio *tmp;
819
820 while (chain) {
821 tmp = chain;
822 chain = chain->bi_next;
823 bio_put(tmp);
824 }
825}
826
827/*
828 * zeros a bio chain, starting at specific offset
829 */
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every data byte at or beyond start_ofs (counted from the start of
 * the chain) is cleared to zero; bytes before it are untouched.
 * Used to blank the tail of a short read.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* bytes of the chain walked so far */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			/* does this segment extend past start_ofs? */
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * Zero from start_ofs within this
				 * segment, or from its beginning when
				 * start_ofs fell in an earlier one.
				 */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
853
854/*
Alex Elderf7760da2012-10-20 22:17:27 -0500855 * Clone a portion of a bio, starting at the given byte offset
856 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700857 */
/*
 * Clone [offset, offset + len) of a single bio into a new bio.
 * Returns the clone, or NULL on bad arguments or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* byte offset into first cloned bvec */
	unsigned short end_idx;
	unsigned short vcnt;	/* number of bvecs in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Reject an empty or out-of-range clone request */
	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the used length of the last cloned bvec */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* single bvec: clone covers exactly len of it */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700934
Alex Elderf7760da2012-10-20 22:17:27 -0500935/*
936 * Clone a portion of a bio chain, starting at the given byte offset
937 * into the first bio in the source chain and continuing for the
938 * number of bytes indicated. The result is another bio chain of
939 * exactly the given length, or a null pointer on error.
940 *
941 * The bio_src and offset parameters are both in-out. On entry they
942 * refer to the first source bio and the offset into that bio where
943 * the start of data to be cloned is located.
944 *
945 * On return, bio_src is updated to refer to the bio in the source
946 * chain that contains first un-cloned byte, and *offset will
947 * contain the offset of that byte within that bio.
948 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;	/* current source bio */
	unsigned int off = *offset;	/* offset within that bio */
	struct bio *chain = NULL;	/* head of the cloned chain */
	struct bio **end;		/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* clone no more than what remains of this source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		/* consumed this source bio entirely; step to the next */
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* report back where the first un-cloned byte lives */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* unwind any clones already made */
	bio_chain_put(chain);

	return NULL;
}
995
996/*
997 * helpers for osd request op vectors.
998 */
Alex Elder57cfc102012-06-26 12:57:03 -0700999static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
1000 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001001{
Alex Elder57cfc102012-06-26 12:57:03 -07001002 struct ceph_osd_req_op *ops;
1003
1004 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
1005 if (!ops)
1006 return NULL;
1007
1008 ops[0].op = opcode;
1009
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001010 /*
1011 * op extent offset and length will be set later on
1012 * in calc_raw_layout()
1013 */
Alex Elder57cfc102012-06-26 12:57:03 -07001014 ops[0].payload_len = payload_len;
1015
1016 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001017}
1018
/* Free an op vector from rbd_create_rw_ops(); NULL is a no-op */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1023
/*
 * Record completion of one sub-request of a collection and finish
 * the block-layer request for every leading run of completed
 * sub-requests, in order.  With no collection, the whole request
 * is completed at once.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	/* uncollected request: complete it in one shot */
	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock protects coll state and __blk_end_request() */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* find how far the leading run of done sub-requests extends */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	/* complete them in order; each drops a collection reference */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1061
/* Complete one rbd_request using its recorded collection slot */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1067
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001068/*
1069 * Send ceph osd request
1070 */
/*
 * Send ceph osd request
 *
 * Builds and submits one OSD request for the named object.  When
 * rbd_cb is NULL the call is synchronous: it waits for completion
 * and drops the request before returning.  When rbd_cb is supplied
 * the request completes asynchronously and the callback owns the
 * cleanup.  On the error paths here, req_data (and its bio chain)
 * is released and the collection slot is completed with the error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still report the failure to the collection */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	/* req_data rides along so the callback can finish the I/O */
	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per "stripe": unit and object size coincide */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		/* keep the request registered across osd reconnects */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and release the request */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1181
1182/*
1183 * Ceph osd op callback
1184 */
/*
 * Ceph osd op callback
 *
 * Completion callback for asynchronous requests submitted by
 * rbd_do_request().  Decodes the reply, zero-fills bios for absent
 * objects or short reads, completes the collection slot, and frees
 * the per-request state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	/* first op record follows the reply head directly */
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: reads return all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail beyond what was returned */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1221
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1226
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001227/*
1228 * Do a synchronous ceph osd operation
1229 */
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to carry the data, runs the request to
 * completion via rbd_do_request(), and for reads copies the result
 * into the caller's "inbound" buffer.  Returns bytes transferred
 * (the rbd_do_request() result) or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* on a successful read, ret is the number of bytes returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1270
1271/*
1272 * Do an asynchronous ceph osd operation
1273 */
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues the read or write for one block-layer request fragment
 * against the single object segment it maps to.  Completion is
 * reported through rbd_req_cb() and the request collection.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		/* writes always go to the head, never a snapshot */
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* reads need no snap context, only the mapped snap id */
		snapc = NULL;
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1335
1336/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001337 * Request sync osd read
1338 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001339static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001340 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001341 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001342 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001343 char *buf,
1344 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001345{
Alex Elder913d2fd2012-06-26 12:57:03 -07001346 struct ceph_osd_req_op *ops;
1347 int ret;
1348
1349 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1350 if (!ops)
1351 return -ENOMEM;
1352
1353 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001354 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001355 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001356 ops, object_name, ofs, len, buf, NULL, ver);
1357 rbd_destroy_ops(ops);
1358
1359 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001360}
1361
/*
 * Acknowledge an osd notification (notify ack)
 */
/*
 * Tell the osd we have handled the notification identified by
 * notify_id on this device's header object, so it can stop
 * waiting for us.  Completion is fire-and-forget via
 * rbd_simple_req_cb().
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): ver is byte-swapped but notify_id is stored
	 * raw — presumably notify_id arrived already little-endian
	 * from the watch callback path; confirm against the osd
	 * client's notify decoding.
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1391
/*
 * Watch event callback: the header object changed (e.g. a snapshot
 * was created), so refresh our view of the image and acknowledge
 * the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so the osd stops waiting */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1411
1412/*
1413 * Request sync osd watch
1414 */
/*
 * Register a watch on this device's header object so rbd_watch_cb()
 * is invoked when the header changes.  On success the lingering
 * watch request and its event are stored in rbd_dev; on failure
 * both are torn down.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	/* event delivers header-change notifications to rbd_watch_cb */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = register the watch */

	/* lingers (via watch_request) so it survives osd reconnects */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1455
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001456/*
1457 * Request sync osd unwatch
1458 */
/*
 * Unregister the watch on this device's header object and release
 * the associated event, undoing rbd_req_sync_watch().
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	/* same cookie as registration identifies the watch to drop */
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	/* the event is released regardless of the unwatch result */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1485
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001486/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001487 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001488 */
/*
 * Synchronous osd object method call
 *
 * Invokes class_name.method_name on the named object, sending
 * outbound/outbound_size as the method's input and (for reads)
 * receiving up to inbound_size bytes into inbound.  Returns the
 * rbd_req_sync_op() result: bytes returned or a negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1538
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001539static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1540{
1541 struct rbd_req_coll *coll =
1542 kzalloc(sizeof(struct rbd_req_coll) +
1543 sizeof(struct rbd_req_status) * num_reqs,
1544 GFP_ATOMIC);
1545
1546 if (!coll)
1547 return NULL;
1548 coll->total = num_reqs;
1549 kref_init(&coll->kref);
1550 return coll;
1551}
1552
/*
 * block device queue callback
 *
 * Called by the block layer with q->queue_lock held.  The lock is
 * dropped while each request is split into per-object segments and
 * submitted, then reacquired before fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* header_rwsem guards the exists flag and snap context */
		down_read(&rbd_dev->header_rwsem);

		if (!rbd_dev->exists) {
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* take a reference so snapc outlives the rwsem region */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
			do_write ? "write" : "read",
			size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		/* one completion-status slot per object segment */
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			/* each in-flight segment holds a coll reference */
			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* drop the initial reference taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1662
1663/*
1664 * a queue callback. Makes sure that we don't create a bio that spans across
1665 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001666 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001667 */
1668static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1669 struct bio_vec *bvec)
1670{
1671 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed2012-10-20 22:17:27 -05001672 sector_t sector_offset;
1673 sector_t sectors_per_obj;
1674 sector_t obj_sector_offset;
1675 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001676
Alex Eldere5cfeed2012-10-20 22:17:27 -05001677 /*
1678 * Find how far into its rbd object the partition-relative
1679 * bio start sector is to offset relative to the enclosing
1680 * device.
1681 */
1682 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1683 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1684 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001685
Alex Eldere5cfeed2012-10-20 22:17:27 -05001686 /*
1687 * Compute the number of bytes from that offset to the end
1688 * of the object. Account for what's already used by the bio.
1689 */
1690 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1691 if (ret > bmd->bi_size)
1692 ret -= bmd->bi_size;
1693 else
1694 ret = 0;
1695
1696 /*
1697 * Don't send back more than was asked for. And if the bio
1698 * was empty, let the whole thing through because: "Note
1699 * that a block device *must* allow a single page to be
1700 * added to an empty bio."
1701 */
1702 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1703 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1704 ret = (int) bvec->bv_len;
1705
1706 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001707}
1708
/*
 * Tear down the gendisk and request queue created by rbd_init_disk().
 * Safe to call when the disk was never allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Unregister from the system first if it was ever added */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1722
1723/*
Alex Elder4156d992012-08-02 11:29:46 -05001724 * Read the complete header for the given rbd device.
1725 *
1726 * Returns a pointer to a dynamically-allocated buffer containing
1727 * the complete and validated header. Caller can pass the address
1728 * of a variable that will be filled in with the version of the
1729 * header object at the time it was read.
1730 *
1731 * Returns a pointer-coded errno if a failure occurs.
1732 */
1733static struct rbd_image_header_ondisk *
1734rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1735{
1736 struct rbd_image_header_ondisk *ondisk = NULL;
1737 u32 snap_count = 0;
1738 u64 names_size = 0;
1739 u32 want_count;
1740 int ret;
1741
1742 /*
1743 * The complete header will include an array of its 64-bit
1744 * snapshot ids, followed by the names of those snapshots as
1745 * a contiguous block of NUL-terminated strings. Note that
1746 * the number of snapshots could change by the time we read
1747 * it in, in which case we re-read it.
1748 */
1749 do {
1750 size_t size;
1751
1752 kfree(ondisk);
1753
1754 size = sizeof (*ondisk);
1755 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1756 size += names_size;
1757 ondisk = kmalloc(size, GFP_KERNEL);
1758 if (!ondisk)
1759 return ERR_PTR(-ENOMEM);
1760
1761 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1762 rbd_dev->header_name,
1763 0, size,
1764 (char *) ondisk, version);
1765
1766 if (ret < 0)
1767 goto out_err;
1768 if (WARN_ON((size_t) ret < size)) {
1769 ret = -ENXIO;
1770 pr_warning("short header read for image %s"
1771 " (want %zd got %d)\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001772 rbd_dev->spec->image_name, size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05001773 goto out_err;
1774 }
1775 if (!rbd_dev_ondisk_valid(ondisk)) {
1776 ret = -ENXIO;
1777 pr_warning("invalid header for image %s\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001778 rbd_dev->spec->image_name);
Alex Elder4156d992012-08-02 11:29:46 -05001779 goto out_err;
1780 }
1781
1782 names_size = le64_to_cpu(ondisk->snap_names_len);
1783 want_count = snap_count;
1784 snap_count = le32_to_cpu(ondisk->snap_count);
1785 } while (snap_count != want_count);
1786
1787 return ondisk;
1788
1789out_err:
1790 kfree(ondisk);
1791
1792 return ERR_PTR(ret);
1793}
1794
1795/*
1796 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001797 */
1798static int rbd_read_header(struct rbd_device *rbd_dev,
1799 struct rbd_image_header *header)
1800{
Alex Elder4156d992012-08-02 11:29:46 -05001801 struct rbd_image_header_ondisk *ondisk;
1802 u64 ver = 0;
1803 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001804
Alex Elder4156d992012-08-02 11:29:46 -05001805 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1806 if (IS_ERR(ondisk))
1807 return PTR_ERR(ondisk);
1808 ret = rbd_header_from_disk(header, ondisk);
1809 if (ret >= 0)
1810 header->obj_version = ver;
1811 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001812
Alex Elder4156d992012-08-02 11:29:46 -05001813 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001814}
1815
Alex Elder41f38c22012-10-25 23:34:40 -05001816static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001817{
1818 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001819 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001820
Alex Eldera0593292012-07-19 09:09:27 -05001821 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001822 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001823}
1824
Alex Elder94785542012-10-09 13:50:17 -07001825static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1826{
1827 sector_t size;
1828
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001829 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001830 return;
1831
1832 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1833 dout("setting size to %llu sectors", (unsigned long long) size);
1834 rbd_dev->mapping.size = (u64) size;
1835 set_capacity(rbd_dev->disk, size);
1836}
1837
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001838/*
1839 * only read the first part of the ondisk header, without the snaps info
1840 */
Alex Elder117973f2012-08-31 17:29:55 -05001841static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001842{
1843 int ret;
1844 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001845
1846 ret = rbd_read_header(rbd_dev, &h);
1847 if (ret < 0)
1848 return ret;
1849
Josh Durgina51aa0c2011-12-05 10:35:04 -08001850 down_write(&rbd_dev->header_rwsem);
1851
Alex Elder94785542012-10-09 13:50:17 -07001852 /* Update image size, and check for resize of mapped image */
1853 rbd_dev->header.image_size = h.image_size;
1854 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001855
Alex Elder849b4262012-07-09 21:04:24 -05001856 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001857 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001858 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001859 /* osd requests may still refer to snapc */
1860 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001861
Alex Elderb8136232012-07-25 09:32:41 -05001862 if (hver)
1863 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001864 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001865 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001866 rbd_dev->header.snapc = h.snapc;
1867 rbd_dev->header.snap_names = h.snap_names;
1868 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001869 /* Free the extra copy of the object prefix */
1870 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1871 kfree(h.object_prefix);
1872
Alex Elder304f6802012-08-31 17:29:52 -05001873 ret = rbd_dev_snaps_update(rbd_dev);
1874 if (!ret)
1875 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001876
Josh Durginc6666012011-11-21 17:11:12 -08001877 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001878
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001879 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880}
1881
Alex Elder117973f2012-08-31 17:29:55 -05001882static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001883{
1884 int ret;
1885
Alex Elder117973f2012-08-31 17:29:55 -05001886 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001887 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001888 if (rbd_dev->image_format == 1)
1889 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1890 else
1891 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001892 mutex_unlock(&ctl_mutex);
1893
1894 return ret;
1895}
1896
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001897static int rbd_init_disk(struct rbd_device *rbd_dev)
1898{
1899 struct gendisk *disk;
1900 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001901 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001902
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001903 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001904 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1905 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001906 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001907
Alex Elderf0f8cef2012-01-29 13:57:44 -06001908 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001909 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001910 disk->major = rbd_dev->major;
1911 disk->first_minor = 0;
1912 disk->fops = &rbd_bd_ops;
1913 disk->private_data = rbd_dev;
1914
1915 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001916 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1917 if (!q)
1918 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001919
Alex Elder593a9e72012-02-07 12:03:37 -06001920 /* We use the default size, but let's be explicit about it. */
1921 blk_queue_physical_block_size(q, SECTOR_SIZE);
1922
Josh Durgin029bcbd2011-07-22 11:35:23 -07001923 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001924 segment_size = rbd_obj_bytes(&rbd_dev->header);
1925 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1926 blk_queue_max_segment_size(q, segment_size);
1927 blk_queue_io_min(q, segment_size);
1928 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001929
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001930 blk_queue_merge_bvec(q, rbd_merge_bvec);
1931 disk->queue = q;
1932
1933 q->queuedata = rbd_dev;
1934
1935 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001936
Alex Elder12f02942012-08-29 17:11:07 -05001937 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1938
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001939 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001940out_disk:
1941 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001942
1943 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001944}
1945
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001946/*
1947 sysfs
1948*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001949
/* Convert a sysfs device pointer back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1954
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001955static ssize_t rbd_size_show(struct device *dev,
1956 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001957{
Alex Elder593a9e72012-02-07 12:03:37 -06001958 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001959 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001960
Josh Durgina51aa0c2011-12-05 10:35:04 -08001961 down_read(&rbd_dev->header_rwsem);
1962 size = get_capacity(rbd_dev->disk);
1963 up_read(&rbd_dev->header_rwsem);
1964
1965 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001966}
1967
Alex Elder34b13182012-07-13 20:35:12 -05001968/*
1969 * Note this shows the features for whatever's mapped, which is not
1970 * necessarily the base image.
1971 */
1972static ssize_t rbd_features_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
1974{
1975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1976
1977 return sprintf(buf, "0x%016llx\n",
1978 (unsigned long long) rbd_dev->mapping.features);
1979}
1980
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001981static ssize_t rbd_major_show(struct device *dev,
1982 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001983{
Alex Elder593a9e72012-02-07 12:03:37 -06001984 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001985
1986 return sprintf(buf, "%d\n", rbd_dev->major);
1987}
1988
1989static ssize_t rbd_client_id_show(struct device *dev,
1990 struct device_attribute *attr, char *buf)
1991{
Alex Elder593a9e72012-02-07 12:03:37 -06001992 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001993
Alex Elder1dbb4392012-01-24 10:08:37 -06001994 return sprintf(buf, "client%lld\n",
1995 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001996}
1997
1998static ssize_t rbd_pool_show(struct device *dev,
1999 struct device_attribute *attr, char *buf)
2000{
Alex Elder593a9e72012-02-07 12:03:37 -06002001 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002002
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002003 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002004}
2005
Alex Elder9bb2f332012-07-12 10:46:35 -05002006static ssize_t rbd_pool_id_show(struct device *dev,
2007 struct device_attribute *attr, char *buf)
2008{
2009 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2010
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002011 return sprintf(buf, "%llu\n",
2012 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002013}
2014
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002015static ssize_t rbd_name_show(struct device *dev,
2016 struct device_attribute *attr, char *buf)
2017{
Alex Elder593a9e72012-02-07 12:03:37 -06002018 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002019
Alex Eldera92ffdf2012-10-30 19:40:33 -05002020 if (rbd_dev->spec->image_name)
2021 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2022
2023 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002024}
2025
Alex Elder589d30e2012-07-10 20:30:11 -05002026static ssize_t rbd_image_id_show(struct device *dev,
2027 struct device_attribute *attr, char *buf)
2028{
2029 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2030
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002031 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002032}
2033
Alex Elder34b13182012-07-13 20:35:12 -05002034/*
2035 * Shows the name of the currently-mapped snapshot (or
2036 * RBD_SNAP_HEAD_NAME for the base image).
2037 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002038static ssize_t rbd_snap_show(struct device *dev,
2039 struct device_attribute *attr,
2040 char *buf)
2041{
Alex Elder593a9e72012-02-07 12:03:37 -06002042 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002043
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002044 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002045}
2046
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	/* Emit one "key value" pair per line, advancing bufp as we go */
	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	/* The parent's image name may not have been resolved */
	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* Total number of bytes written into buf */
	return (ssize_t) (bufp - buf);
}
2089
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002090static ssize_t rbd_image_refresh(struct device *dev,
2091 struct device_attribute *attr,
2092 const char *buf,
2093 size_t size)
2094{
Alex Elder593a9e72012-02-07 12:03:37 -06002095 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002096 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002097
Alex Elder117973f2012-08-31 17:29:55 -05002098 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002099
2100 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002101}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002102
/*
 * Device attributes exposed in sysfs for each mapped rbd device.
 * All are read-only except "refresh", which is write-only and
 * triggers a header re-read.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* No-op release; rbd_device memory is freed by rbd_dev_destroy() */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
2148
2149
2150/*
2151 sysfs - snapshots
2152*/
2153
2154static ssize_t rbd_snap_size_show(struct device *dev,
2155 struct device_attribute *attr,
2156 char *buf)
2157{
2158 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2159
Josh Durgin35915382011-12-05 18:25:13 -08002160 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002161}
2162
2163static ssize_t rbd_snap_id_show(struct device *dev,
2164 struct device_attribute *attr,
2165 char *buf)
2166{
2167 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2168
Josh Durgin35915382011-12-05 18:25:13 -08002169 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002170}
2171
Alex Elder34b13182012-07-13 20:35:12 -05002172static ssize_t rbd_snap_features_show(struct device *dev,
2173 struct device_attribute *attr,
2174 char *buf)
2175{
2176 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2177
2178 return sprintf(buf, "0x%016llx\n",
2179 (unsigned long long) snap->features);
2180}
2181
/* Read-only attributes exposed in sysfs for each snapshot device. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Sysfs release callback: frees the rbd_snap and its name */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2213
Alex Elder8b8fb992012-10-26 17:25:24 -05002214static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2215{
2216 kref_get(&spec->kref);
2217
2218 return spec;
2219}
2220
2221static void rbd_spec_free(struct kref *kref);
2222static void rbd_spec_put(struct rbd_spec *spec)
2223{
2224 if (spec)
2225 kref_put(&spec->kref, rbd_spec_free);
2226}
2227
/*
 * Allocate a zeroed, reference-counted rbd image spec.
 * Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* Paired get/put is a deliberate no-op exercise of the refcount */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2241
2242static void rbd_spec_free(struct kref *kref)
2243{
2244 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2245
2246 kfree(spec->pool_name);
2247 kfree(spec->image_id);
2248 kfree(spec->image_name);
2249 kfree(spec->snap_name);
2250 kfree(spec);
2251}
2252
Alex Elderc53d5892012-10-25 23:34:42 -05002253struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2254 struct rbd_spec *spec)
2255{
2256 struct rbd_device *rbd_dev;
2257
2258 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2259 if (!rbd_dev)
2260 return NULL;
2261
2262 spin_lock_init(&rbd_dev->lock);
2263 INIT_LIST_HEAD(&rbd_dev->node);
2264 INIT_LIST_HEAD(&rbd_dev->snaps);
2265 init_rwsem(&rbd_dev->header_rwsem);
2266
2267 rbd_dev->spec = spec;
2268 rbd_dev->rbd_client = rbdc;
2269
2270 return rbd_dev;
2271}
2272
/*
 * Release everything an rbd_device holds: its parent spec, header
 * object name, client reference and spec, then the device itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2281
Alex Elder304f6802012-08-31 17:29:52 -05002282static bool rbd_snap_registered(struct rbd_snap *snap)
2283{
2284 bool ret = snap->dev.type == &rbd_snap_device_type;
2285 bool reg = device_is_registered(&snap->dev);
2286
2287 rbd_assert(!ret ^ reg);
2288
2289 return ret;
2290}
2291
Alex Elder41f38c22012-10-25 23:34:40 -05002292static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002293{
2294 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002295 if (device_is_registered(&snap->dev))
2296 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002297}
2298
Alex Elder14e70852012-07-19 09:09:27 -05002299static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002300 struct device *parent)
2301{
2302 struct device *dev = &snap->dev;
2303 int ret;
2304
2305 dev->type = &rbd_snap_device_type;
2306 dev->parent = parent;
2307 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002308 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002309 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2310
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002311 ret = device_register(dev);
2312
2313 return ret;
2314}
2315
Alex Elder4e891e02012-07-10 20:30:10 -05002316static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002317 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002318 u64 snap_id, u64 snap_size,
2319 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002320{
Alex Elder4e891e02012-07-10 20:30:10 -05002321 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002322 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002323
2324 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002325 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002326 return ERR_PTR(-ENOMEM);
2327
2328 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002329 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002330 if (!snap->name)
2331 goto err;
2332
Alex Elderc8d18422012-07-10 20:30:11 -05002333 snap->id = snap_id;
2334 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002335 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002336
2337 return snap;
2338
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002339err:
2340 kfree(snap->name);
2341 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002342
2343 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002344}
2345
Alex Eldercd892122012-07-03 16:01:19 -05002346static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2347 u64 *snap_size, u64 *snap_features)
2348{
2349 char *snap_name;
2350
2351 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2352
2353 *snap_size = rbd_dev->header.snap_sizes[which];
2354 *snap_features = 0; /* No features for v1 */
2355
2356 /* Skip over names until we find the one we are looking for */
2357
2358 snap_name = rbd_dev->header.snap_names;
2359 while (which--)
2360 snap_name += strlen(snap_name) + 1;
2361
2362 return snap_name;
2363}
2364
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002365/*
Alex Elder9d475de2012-07-03 16:01:19 -05002366 * Get the size and object order for an image snapshot, or if
2367 * snap_id is CEPH_NOSNAP, gets this information for the base
2368 * image.
2369 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);	/* request payload is little-endian */
	int ret;
	struct {
		u8 order;	/* object size is 2^order bytes */
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	/* Invoke the "get_size" method of the "rbd" class on the header object */
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Reply is the packed (order, size) pair declared above */
	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2398
/*
 * Fetch the size and object order of the base image (CEPH_NOSNAP
 * means "no snapshot", i.e. the image head) into the header.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2405
/*
 * Fetch a format 2 image's object name prefix via the
 * "get_object_prefix" class method on the header object.  On success
 * the dynamically-allocated string is stored in
 * rbd_dev->header.object_prefix.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Reply is a length-prefixed string; extract a copy of it */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2443
/*
 * Fetch the feature bits for the given snapshot (or the image head,
 * for CEPH_NOSNAP) via the "get_features" class method.  Returns
 * -ENXIO if the image requires incompatible features this driver
 * does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;	/* all features the image uses */
		__le64 incompat;	/* subset required to map it */
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image needing features we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2477
/* Fetch the feature bits for the base image into the header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2483
/*
 * Query a format 2 image's parent (layering) information via the
 * "get_parent" class method.  A pool id of CEPH_NOPOOL in the reply
 * means the image has no parent, which is not an error.  On success
 * rbd_dev->parent_spec (refcounted) and rbd_dev->parent_overlap are
 * filled in; the rbd_dev takes ownership of the spec.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case reply: pool, image id string, snap id, overlap */
	size = sizeof (__le64) +			/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +			/* snap_id */
		sizeof (__le64);			/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Any decode failure below means a short or malformed reply */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
2547
/*
 * Look up an image's name given its image id, by calling the
 * "dir_get_name" method on the pool's RBD_DIRECTORY object.  Returns
 * a dynamically-allocated name (caller must free) or NULL on any
 * failure; callers tolerate a missing name.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: the image id as a length-prefixed string */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	if (ret < 0)
		goto out;
	/* Reply is the image name as a length-prefixed string */
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* tolerate failure; return NULL */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2597
2598/*
2599 * When a parent image gets probed, we only have the pool, image,
2600 * and snapshot ids but not the names of any of them. This call
2601 * is made later to fill in those names. It has to be done after
2602 * rbd_dev_snaps_update() has completed because some of the
2603 * information (in particular, snapshot name) is not available
2604 * until then.
2605 */
2606static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2607{
2608 struct ceph_osd_client *osdc;
2609 const char *name;
2610 void *reply_buf = NULL;
2611 int ret;
2612
2613 if (rbd_dev->spec->pool_name)
2614 return 0; /* Already have the names */
2615
2616 /* Look up the pool name */
2617
2618 osdc = &rbd_dev->rbd_client->client->osdc;
2619 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2620 if (!name)
2621 return -EIO; /* pool id too large (>= 2^31) */
2622
2623 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2624 if (!rbd_dev->spec->pool_name)
2625 return -ENOMEM;
2626
2627 /* Fetch the image name; tolerate failure here */
2628
2629 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002630 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002631 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002632 else
Alex Elder9e15b772012-10-30 19:40:33 -05002633 pr_warning(RBD_DRV_NAME "%d "
2634 "unable to get image name for image id %s\n",
2635 rbd_dev->major, rbd_dev->spec->image_id);
Alex Elder9e15b772012-10-30 19:40:33 -05002636
2637 /* Look up the snapshot name. */
2638
2639 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2640 if (!name) {
2641 ret = -EIO;
2642 goto out_err;
2643 }
2644 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2645 if(!rbd_dev->spec->snap_name)
2646 goto out_err;
2647
2648 return 0;
2649out_err:
2650 kfree(reply_buf);
2651 kfree(rbd_dev->spec->pool_name);
2652 rbd_dev->spec->pool_name = NULL;
2653
2654 return ret;
2655}
2656
Alex Elder6e14b1a2012-07-03 16:01:19 -05002657static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002658{
2659 size_t size;
2660 int ret;
2661 void *reply_buf;
2662 void *p;
2663 void *end;
2664 u64 seq;
2665 u32 snap_count;
2666 struct ceph_snap_context *snapc;
2667 u32 i;
2668
2669 /*
2670 * We'll need room for the seq value (maximum snapshot id),
2671 * snapshot count, and array of that many snapshot ids.
2672 * For now we have a fixed upper limit on the number we're
2673 * prepared to receive.
2674 */
2675 size = sizeof (__le64) + sizeof (__le32) +
2676 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2677 reply_buf = kzalloc(size, GFP_KERNEL);
2678 if (!reply_buf)
2679 return -ENOMEM;
2680
2681 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2682 "rbd", "get_snapcontext",
2683 NULL, 0,
2684 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002685 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002686 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2687 if (ret < 0)
2688 goto out;
2689
2690 ret = -ERANGE;
2691 p = reply_buf;
2692 end = (char *) reply_buf + size;
2693 ceph_decode_64_safe(&p, end, seq, out);
2694 ceph_decode_32_safe(&p, end, snap_count, out);
2695
2696 /*
2697 * Make sure the reported number of snapshot ids wouldn't go
2698 * beyond the end of our buffer. But before checking that,
2699 * make sure the computed size of the snapshot context we
2700 * allocate is representable in a size_t.
2701 */
2702 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2703 / sizeof (u64)) {
2704 ret = -EINVAL;
2705 goto out;
2706 }
2707 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2708 goto out;
2709
2710 size = sizeof (struct ceph_snap_context) +
2711 snap_count * sizeof (snapc->snaps[0]);
2712 snapc = kmalloc(size, GFP_KERNEL);
2713 if (!snapc) {
2714 ret = -ENOMEM;
2715 goto out;
2716 }
2717
2718 atomic_set(&snapc->nref, 1);
2719 snapc->seq = seq;
2720 snapc->num_snaps = snap_count;
2721 for (i = 0; i < snap_count; i++)
2722 snapc->snaps[i] = ceph_decode_64(&p);
2723
2724 rbd_dev->header.snapc = snapc;
2725
2726 dout(" snap context seq = %llu, snap_count = %u\n",
2727 (unsigned long long) seq, (unsigned int) snap_count);
2728
2729out:
2730 kfree(reply_buf);
2731
2732 return 0;
2733}
2734
/*
 * Fetch the name of snapshot "which" in the current snapshot context
 * via the "get_snapshot_name" class method.  Returns a dynamically-
 * allocated string (caller frees) or an ERR_PTR() on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	/* The request payload is the snapshot id, little-endian */
	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is the name as a length-prefixed string */
	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2778
2779static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2780 u64 *snap_size, u64 *snap_features)
2781{
2782 __le64 snap_id;
2783 u8 order;
2784 int ret;
2785
2786 snap_id = rbd_dev->header.snapc->snaps[which];
2787 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2788 if (ret)
2789 return ERR_PTR(ret);
2790 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2791 if (ret)
2792 return ERR_PTR(ret);
2793
2794 return rbd_dev_v2_snap_name(rbd_dev, which);
2795}
2796
2797static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2798 u64 *snap_size, u64 *snap_features)
2799{
2800 if (rbd_dev->image_format == 1)
2801 return rbd_dev_v1_snap_info(rbd_dev, which,
2802 snap_size, snap_features);
2803 if (rbd_dev->image_format == 2)
2804 return rbd_dev_v2_snap_info(rbd_dev, which,
2805 snap_size, snap_features);
2806 return ERR_PTR(-EINVAL);
2807}
2808
Alex Elder117973f2012-08-31 17:29:55 -05002809static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2810{
2811 int ret;
2812 __u8 obj_order;
2813
2814 down_write(&rbd_dev->header_rwsem);
2815
2816 /* Grab old order first, to see if it changes */
2817
2818 obj_order = rbd_dev->header.obj_order,
2819 ret = rbd_dev_v2_image_size(rbd_dev);
2820 if (ret)
2821 goto out;
2822 if (rbd_dev->header.obj_order != obj_order) {
2823 ret = -EIO;
2824 goto out;
2825 }
2826 rbd_update_mapping_size(rbd_dev);
2827
2828 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2829 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2830 if (ret)
2831 goto out;
2832 ret = rbd_dev_snaps_update(rbd_dev);
2833 dout("rbd_dev_snaps_update returned %d\n", ret);
2834 if (ret)
2835 goto out;
2836 ret = rbd_dev_snaps_register(rbd_dev);
2837 dout("rbd_dev_snaps_register returned %d\n", ret);
2838out:
2839 up_write(&rbd_dev->header_rwsem);
2840
2841 return ret;
2842}
2843
Alex Elder9d475de2012-07-03 16:01:19 -05002844/*
Alex Elder35938152012-08-02 11:29:46 -05002845 * Scan the rbd device's current snapshot list and compare it to the
2846 * newly-received snapshot context. Remove any existing snapshots
2847 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2849 * And verify there are no changes to snapshots we already know
2850 * about.
2851 *
2852 * Assumes the snapshots in the snapshot context are sorted by
2853 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2854 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002855 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/*
	 * Merge walk with two cursors: "index" over the new snapshot
	 * context and "links" over the existing snapshot list.  Both
	 * are ordered by snapshot id, highest first (see the comment
	 * above this function).
	 */
	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If the mapped snapshot vanished, mark the device */
			if (rbd_dev->spec->snap_id == snap->id)
				rbd_dev->exists = false;
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* Known snapshots must not have changed */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2948
Alex Elder304f6802012-08-31 17:29:52 -05002949/*
2950 * Scan the list of snapshots and register the devices for any that
2951 * have not already been registered.
2952 */
2953static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2954{
2955 struct rbd_snap *snap;
2956 int ret = 0;
2957
2958 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002959 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2960 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002961
2962 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2963 if (!rbd_snap_registered(snap)) {
2964 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2965 if (ret < 0)
2966 break;
2967 }
2968 }
2969 dout("%s: returning %d\n", __func__, ret);
2970
2971 return ret;
2972}
2973
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002974static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2975{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002976 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002977 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002978
2979 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002980
Alex Eldercd789ab2012-08-30 00:16:38 -05002981 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002982 dev->bus = &rbd_bus_type;
2983 dev->type = &rbd_device_type;
2984 dev->parent = &rbd_root_dev;
2985 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002986 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002987 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002988
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002989 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002990
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002991 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002992}
2993
/* Undo rbd_bus_add_dev(): unregister the rbd device from sysfs. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2998
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002999static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
3000{
3001 int ret, rc;
3002
3003 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05003004 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003005 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05003006 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003007 if (rc < 0)
3008 return rc;
3009 }
3010 } while (ret == -ERANGE);
3011
3012 return ret;
3013}
3014
Alex Eldere2839302012-08-29 17:11:06 -05003015static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003016
3017/*
Alex Elder499afd52012-02-02 08:13:29 -06003018 * Get a unique rbd identifier for the given new rbd_dev, and add
3019 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003020 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Ids come from a monotonically increasing counter, so minimum is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003031
Alex Elder1ddbe942012-01-29 13:57:44 -06003032/*
Alex Elder499afd52012-02-02 08:13:29 -06003033 * Remove an rbd_dev from the global list, and record that its
3034 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003035 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* NOTE: intentionally shadows the outer rbd_dev */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3082
Alex Eldera725f65e2012-02-02 08:13:30 -06003083/*
Alex Eldere28fff262012-02-02 08:13:30 -06003084 * Skips over white space at *buf, and updates *buf to point to the
3085 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003086 * the token (string of non-white space characters) found. Note
3087 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003088 */
/*
 * Advance *buf past any leading whitespace and return the length of
 * the token (run of non-whitespace) it now points at.  *buf must be
 * NUL-terminated; the token itself is not consumed.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip leading whitespace */
	*buf = p;

	return strcspn(p, spaces);	/* length of the token */
}
3101
3102/*
3103 * Finds the next token in *buf, and if the provided token buffer is
3104 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003105 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3106 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003107 *
3108 * Returns the length of the token found (not including the '\0').
3109 * Return value will be 0 if no token is found, and it will be >=
3110 * token_size if the token would not fit.
3111 *
Alex Elder593a9e72012-02-07 12:03:37 -06003112 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003113 * found token. Note that this occurs even if the token buffer is
3114 * too small to hold it.
3115 */
/*
 * Find the next token in *buf and, if it fits (including the
 * terminating '\0'), copy it into the supplied buffer.  Returns the
 * token length; a return >= token_size means the token was too big
 * and nothing was copied.  *buf is advanced past the token either
 * way.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3131
3132/*
Alex Elderea3352f2012-07-09 21:04:23 -05003133 * Finds the next token in *buf, dynamically allocates a buffer big
3134 * enough to hold a copy of it, and copies the token into the new
3135 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3136 * that a duplicate buffer is created even for a zero-length token.
3137 *
3138 * Returns a pointer to the newly-allocated duplicate, or a null
3139 * pointer if memory for the duplicate was not available. If
3140 * the lenp argument is a non-null pointer, the length of the token
3141 * (not including the '\0') is returned in *lenp.
3142 *
3143 * If successful, the *buf pointer will be updated to point beyond
3144 * the end of the found token.
3145 *
3146 * Note: uses GFP_KERNEL for allocation.
3147 */
3148static inline char *dup_token(const char **buf, size_t *lenp)
3149{
3150 char *dup;
3151 size_t len;
3152
3153 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003154 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003155 if (!dup)
3156 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003157 *(dup + len) = '\0';
3158 *buf += len;
3159
3160 if (lenp)
3161 *lenp = len;
3162
3163 return dup;
3164}
3165
3166/*
Alex Elder859c31d2012-10-25 23:34:42 -05003167 * Parse the options provided for an "rbd add" (i.e., rbd image
3168 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3169 * and the data written is passed here via a NUL-terminated buffer.
3170 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003171 *
Alex Elder859c31d2012-10-25 23:34:42 -05003172 * The information extracted from these options is recorded in
3173 * the other parameters which return dynamically-allocated
3174 * structures:
3175 * ceph_opts
3176 * The address of a pointer that will refer to a ceph options
3177 * structure. Caller must release the returned pointer using
3178 * ceph_destroy_options() when it is no longer needed.
3179 * rbd_opts
3180 * Address of an rbd options pointer. Fully initialized by
3181 * this function; caller must release with kfree().
3182 * spec
3183 * Address of an rbd image specification pointer. Fully
3184 * initialized by this function based on parsed options.
3185 * Caller must release with rbd_spec_put().
3186 *
3187 * The options passed take this form:
3188 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3189 * where:
3190 * <mon_addrs>
3191 * A comma-separated list of one or more monitor addresses.
3192 * A monitor address is an ip address, optionally followed
3193 * by a port number (separated by a colon).
3194 * I.e.: ip1[:port1][,ip2[:port2]...]
3195 * <options>
3196 * A comma-separated list of ceph and/or rbd options.
3197 * <pool_name>
3198 * The name of the rados pool containing the rbd image.
3199 * <image_name>
3200 * The name of the image in that pool to map.
3201 * <snap_id>
3202 * An optional snapshot id. If provided, the mapping will
3203 * present data from the image at the time that snapshot was
3204 * created. The image head is used if no snapshot id is
3205 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003206 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	mon_addrs = buf;
	mon_addrs_size = len + 1;	/* +1: room for a terminating NUL */
	buf += len;

	/* Default error for the "empty token" goto out_err cases below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() handles the ceph options; any
	 * rbd-specific options in the string are routed to
	 * parse_rbd_opts_token(), which fills in rbd_opts.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand ownership of all three structures to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3301
Alex Elder589d30e2012-07-10 20:30:11 -05003302/*
3303 * An rbd format 2 image has a unique identifier, distinct from the
3304 * name given to it by the user. Internally, that identifier is
3305 * what's used to specify the names of objects related to the image.
3306 *
3307 * A special "rbd id" object is used to map an rbd image name to its
3308 * id. If that object doesn't exist, then there is no v2 rbd image
3309 * with the supplied name.
3310 *
3311 * This function will record the given rbd_dev's image_id field if
3312 * it can be determined, and in that case will return 0. If any
3313 * errors occur a negative errno will be returned and the rbd_dev's
3314 * image_id field will be unchanged (and should be NULL).
3315 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	/* NOTE(review): GFP_NOIO here, unlike the GFP_KERNEL used by the
	 * probe helpers — presumably to avoid recursing into I/O; confirm. */
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class on the OSD */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* leave image_id NULL, as documented above */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3378
/*
 * Probe for a format 1 (original format) rbd image.  Format 1 images
 * have no image id, so an empty string is recorded for it; the header
 * object name is the image name with RBD_SUFFIX appended.  The on-disk
 * header is then read in to populate rbd_dev->header.
 *
 * Returns 0 on success or a negative errno; on error the image_id and
 * header_name fields are freed and reset to NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	/* sizeof (RBD_SUFFIX) includes the terminating NUL */
	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3427
/*
 * Probe for a format 2 rbd image.  The caller has already filled in
 * the image id (rbd_dev_image_id()); the header object name is the
 * id with RBD_HEADER_PREFIX prepended.  Fetches size/order, object
 * prefix, features, optional parent (layering) info, and the snapshot
 * context from the OSDs.
 *
 * Returns 0 on success or a negative errno; on error all fields
 * allocated here (parent spec, header name, object prefix) are
 * released and reset.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3500
/*
 * Complete the probe of an rbd device: refresh snapshot data, set up
 * the mapping, allocate a device id and a block major number, create
 * the disk, and register the device (and its snapshots) with sysfs.
 * On success the disk has been announced via add_disk().
 *
 * Error handling is layered: before rbd_bus_add_dev() succeeds,
 * resources are unwound here in reverse order via the err_out_*
 * labels; after it succeeds, teardown is the job of the sysfs code
 * and is triggered with rbd_bus_del_dev().
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0: dynamic major */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3582
Alex Eldera30b71b2012-07-10 20:30:11 -05003583/*
3584 * Probe for the existence of the header object for the given rbd
3585 * device. For format 2 images this includes determining the image
3586 * id.
3587 */
3588static int rbd_dev_probe(struct rbd_device *rbd_dev)
3589{
3590 int ret;
3591
3592 /*
3593 * Get the id from the image id object. If it's not a
3594 * format 2 image, we'll get ENOENT back, and we'll assume
3595 * it's a format 1 image.
3596 */
3597 ret = rbd_dev_image_id(rbd_dev);
3598 if (ret)
3599 ret = rbd_dev_v1_probe(rbd_dev);
3600 else
3601 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003602 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003603 dout("probe failed, returning %d\n", ret);
3604
Alex Elder83a06262012-10-30 15:47:17 -05003605 return ret;
3606 }
3607
3608 ret = rbd_dev_probe_finish(rbd_dev);
3609 if (ret)
3610 rbd_header_free(&rbd_dev->header);
3611
Alex Eldera30b71b2012-07-10 20:30:11 -05003612 return ret;
3613}
3614
/*
 * Handle a write to /sys/bus/rbd/add.  Parses the mapping arguments,
 * gets (or creates) a client for the given monitors, resolves the
 * pool name to an id, creates the rbd_device and probes the image.
 * Returns count on success or a negative errno.
 *
 * Note the explicit ownership handoffs: ceph_opts is consumed by
 * rbd_get_client(), and rbdc/spec by rbd_dev_create(); the local
 * pointers are NULLed after each handoff so the error path only
 * releases what this function still owns.  The module reference
 * taken here is dropped in rbd_dev_release().
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3680
Alex Elderde71a292012-07-03 16:01:19 -05003681static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003682{
3683 struct list_head *tmp;
3684 struct rbd_device *rbd_dev;
3685
Alex Eldere124a822012-01-29 13:57:44 -06003686 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003687 list_for_each(tmp, &rbd_dev_list) {
3688 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003689 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003690 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003691 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003692 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003693 }
Alex Eldere124a822012-01-29 13:57:44 -06003694 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003695 return NULL;
3696}
3697
/*
 * Release callback for an rbd device.  Tears down the watch, the
 * block device, and the header data, then destroys the rbd_dev and
 * drops the module reference taken in rbd_add().  After this the
 * rbd_dev must not be touched.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before unwatching */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3727
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003728static ssize_t rbd_remove(struct bus_type *bus,
3729 const char *buf,
3730 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003731{
3732 struct rbd_device *rbd_dev = NULL;
3733 int target_id, rc;
3734 unsigned long ul;
3735 int ret = count;
3736
3737 rc = strict_strtoul(buf, 10, &ul);
3738 if (rc)
3739 return rc;
3740
3741 /* convert to int; abort if we lost anything in the conversion */
3742 target_id = (int) ul;
3743 if (target_id != ul)
3744 return -EINVAL;
3745
3746 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3747
3748 rbd_dev = __rbd_get_dev(target_id);
3749 if (!rbd_dev) {
3750 ret = -ENOENT;
3751 goto done;
3752 }
3753
Alex Elder42382b72012-11-16 09:29:16 -06003754 if (rbd_dev->open_count) {
3755 ret = -EBUSY;
3756 goto done;
3757 }
3758
Alex Elder41f38c22012-10-25 23:34:40 -05003759 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003760 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003761
3762done:
3763 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003764
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003765 return ret;
3766}
3767
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003768/*
3769 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003770 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003771 */
3772static int rbd_sysfs_init(void)
3773{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003774 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003775
Alex Elderfed4c142012-02-07 12:03:36 -06003776 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003777 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003778 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003779
Alex Elderfed4c142012-02-07 12:03:36 -06003780 ret = bus_register(&rbd_bus_type);
3781 if (ret < 0)
3782 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003783
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003784 return ret;
3785}
3786
/* Undo rbd_sysfs_init() in reverse order: bus first, then root device. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3792
3793int __init rbd_init(void)
3794{
3795 int rc;
3796
3797 rc = rbd_sysfs_init();
3798 if (rc)
3799 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003800 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003801 return 0;
3802}
3803
/* Module exit: remove the sysfs state registered in rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3808
3809module_init(rbd_init);
3810module_exit(rbd_exit);
3811
3812MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3813MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3814MODULE_DESCRIPTION("rados block device");
3815
3816/* following authorship retained from original osdblk.c */
3817MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3818
3819MODULE_LICENSE("GPL");