blob: 668936381ab0a59f81f6658705ef5e2ced80f441 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elder2647ba32012-11-19 22:55:21 -060055/* It might be useful to have these defined elsewhere */
Alex Elderdf111be2012-08-09 10:33:26 -070056
Alex Elder2647ba32012-11-19 22:55:21 -060057#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
Alex Elderf0f8cef2012-01-29 13:57:44 -060062#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
Alex Elderd4b125e2012-07-03 16:01:19 -050067#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
Alex Elder35d489f2012-07-03 16:01:19 -050071#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
73#define RBD_SNAP_HEAD_NAME "-"
74
Alex Elder9e15b772012-10-30 19:40:33 -050075/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050078
Alex Elder1e130192012-07-03 16:01:19 -050079#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050080
Alex Elderd8891402012-10-09 13:50:17 -070081/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
Alex Elder81a89792012-02-02 08:13:30 -060089/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060096#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix for on-disk data object names */
	u64 features;		/* RBD_FEATURE_* bits (always 0 for v1 images) */
	__u8 obj_order;		/* object size is (1 << obj_order); see rbd_segment_name() */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* copy of on-disk snapshot name data */
	u64 *snap_sizes;	/* one size per snapshot, same order as snapc->snaps[] */

	u64 obj_version;
};
117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;

	u64		snap_id;	/* CEPH_NOSNAP when mapping the image head */
	char		*snap_name;

	struct kref	kref;		/* shared between parent and child rbd_devs */
};
155
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* dropped via rbd_put_client() */
	struct list_head	node;	/* entry on rbd_client_list */
};
164
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	s32 rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of sub-requests */
	int			num_done;	/* completed so far */
	struct kref		kref;		/* released via rbd_coll_release() */
	/* NOTE(review): pre-C99 zero-length trailing array; modern code
	 * would use a flexible array member (status[]) */
	struct rbd_req_status	status[0];
};
183
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* this request's slot in coll->status[] */
	struct rbd_req_coll	*coll;
};
195
/* One snapshot of an image, plus its sysfs device */
struct rbd_snap {
	struct device		dev;	/* sysfs representation */
	const char		*name;
	u64			size;
	struct list_head	node;	/* entry on rbd_device->snaps */
	u64			id;
	u64			features;
};

/* Properties of the image (or snapshot) this device is mapped to */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
210
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	/* nonzero once the mapping target is known to exist; set in
	 * rbd_dev_set_mapping() */
	atomic_t		exists;
	struct rbd_spec		*spec;		/* identity of mapped image/snapshot */

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* non-null iff this image is a child in a layered image */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by ctl_mutex */
};
255
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700256static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600257
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700258static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600259static DEFINE_SPINLOCK(rbd_dev_list_lock);
260
Alex Elder432b8582012-01-29 13:57:44 -0600261static LIST_HEAD(rbd_client_list); /* clients */
262static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700263
Alex Elder304f6802012-08-31 17:29:52 -0500264static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
265static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
266
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800267static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500268static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800269
Alex Elderf0f8cef2012-01-29 13:57:44 -0600270static ssize_t rbd_add(struct bus_type *bus, const char *buf,
271 size_t count);
272static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
273 size_t count);
274
/* /sys/bus/rbd/{add,remove}: write-only controls for mapping images */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
294
/*
 * Emit a KERN_WARNING message for an rbd device, tagging it with the
 * most specific identity available: disk name, then image name, then
 * image id, then the rbd_dev pointer itself.  rbd_dev may be NULL.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
321
#ifdef RBD_DEBUG
/*
 * Assert that an expression holds; log the failing expression and BUG()
 * otherwise.  Wrapped in do { } while (0) so the macro expands to a
 * single statement: the previous bare-if form was a dangling-else
 * hazard when used unbraced inside an if/else.
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800334
Alex Elder117973f2012-08-31 17:29:55 -0500335static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
336static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700337
/*
 * Block device open method.  Refuses writable opens of read-only
 * mappings; otherwise pins the sysfs device and bumps the open count
 * (both undone in rbd_release()).
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	/* open_count and the device reference are serialized by ctl_mutex */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* paired with put_device() in rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
353
/*
 * Block device release method.  Drops the open count and the device
 * reference taken in rbd_open().
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
366
/* Block device methods for rbd devices */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
372
373/*
374 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500375 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700376 */
Alex Elderf8c38922012-08-10 13:12:07 -0700377static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700378{
379 struct rbd_client *rbdc;
380 int ret = -ENOMEM;
381
382 dout("rbd_client_create\n");
383 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
384 if (!rbdc)
385 goto out_opt;
386
387 kref_init(&rbdc->kref);
388 INIT_LIST_HEAD(&rbdc->node);
389
Alex Elderbc534d862012-01-29 13:57:44 -0600390 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
391
Alex Elder43ae4702012-07-03 16:01:18 -0500392 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700393 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600394 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500395 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700396
397 ret = ceph_open_session(rbdc->client);
398 if (ret < 0)
399 goto out_err;
400
Alex Elder432b8582012-01-29 13:57:44 -0600401 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700402 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600403 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700404
Alex Elderbc534d862012-01-29 13:57:44 -0600405 mutex_unlock(&ctl_mutex);
406
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700407 dout("rbd_client_create created %p\n", rbdc);
408 return rbdc;
409
410out_err:
411 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600412out_mutex:
413 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700414 kfree(rbdc);
415out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500416 if (ceph_opts)
417 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400418 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700419}
420
421/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700422 * Find a ceph client with specific addr and configuration. If
423 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700424 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700425static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700426{
427 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700428 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700429
Alex Elder43ae4702012-07-03 16:01:18 -0500430 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700431 return NULL;
432
Alex Elder1f7ba332012-08-10 13:12:07 -0700433 spin_lock(&rbd_client_list_lock);
434 list_for_each_entry(client_node, &rbd_client_list, node) {
435 if (!ceph_compare_options(ceph_opts, client_node->client)) {
436 kref_get(&client_node->kref);
437 found = true;
438 break;
439 }
440 }
441 spin_unlock(&rbd_client_list_lock);
442
443 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700444}
445
/*
 * mount options
 *
 * The enum is partitioned by sentinel values: tokens below
 * Opt_last_int take an integer argument, tokens between the _int and
 * _string sentinels take a string, and tokens between _string and
 * _bool sentinels are Boolean flags (see parse_rbd_opts_token()).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Parsed per-mapping options */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
476
/*
 * Parse one mount-option token into *private (a struct rbd_options).
 * Called via ceph_parse_options() for each option string c.  Returns 0
 * on success or a negative errno on a malformed option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The sentinel ordering in the token enum tells us the arg type */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned a token we don't handle */
		rbd_assert(false);
		break;
	}
	return 0;
}
517
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Ownership of ceph_opts is consumed either way:
 * an existing client keeps its own copy, a new client takes this one.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
534
/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 *
 * NOTE(review): despite the comment above, this kref release callback
 * takes rbd_client_list_lock itself around the list_del() — confirm
 * callers do NOT already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
552
553/*
554 * Drop reference to ceph client node. If it's not referenced anymore, release
555 * it.
556 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500557static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700558{
Alex Elderc53d5892012-10-25 23:34:42 -0500559 if (rbdc)
560 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700561}
562
/*
 * Destroy requests collection
 *
 * kref release callback for struct rbd_req_coll; frees the collection
 * (including its trailing status[] array, allocated in one chunk).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700574
Alex Eldera30b71b2012-07-10 20:30:11 -0500575static bool rbd_image_format_valid(u32 image_format)
576{
577 return image_format == 1 || image_format == 2;
578}
579
/*
 * Sanity-check a format-1 on-disk image header before using it.
 * Returns false if the header is malformed or describes an image this
 * implementation cannot represent.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
618
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619/*
620 * Create a new header structure, translate header format from the on-disk
621 * header.
622 */
623static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500624 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700625{
Alex Elderccece232012-07-10 20:30:10 -0500626 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500627 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500628 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500629 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700630
Alex Elder6a523252012-07-19 17:12:59 -0500631 memset(header, 0, sizeof (*header));
632
Alex Elder103a1502012-08-02 11:29:45 -0500633 snap_count = le32_to_cpu(ondisk->snap_count);
634
Alex Elder58c17b02012-08-23 23:22:06 -0500635 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
636 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500637 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700638 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500639 memcpy(header->object_prefix, ondisk->object_prefix, len);
640 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600641
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500643 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
644
Alex Elder621901d2012-08-23 23:22:06 -0500645 /* Save a copy of the snapshot names */
646
Alex Elderf785cc12012-08-23 23:22:06 -0500647 if (snap_names_len > (u64) SIZE_MAX)
648 return -EIO;
649 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500651 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500652 /*
653 * Note that rbd_dev_v1_header_read() guarantees
654 * the ondisk buffer we're working with has
655 * snap_names_len bytes beyond the end of the
656 * snapshot id array, this memcpy() is safe.
657 */
658 memcpy(header->snap_names, &ondisk->snaps[snap_count],
659 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500660
Alex Elder621901d2012-08-23 23:22:06 -0500661 /* Record each snapshot's size */
662
Alex Elderd2bb24e2012-07-26 23:37:14 -0500663 size = snap_count * sizeof (*header->snap_sizes);
664 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500666 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500667 for (i = 0; i < snap_count; i++)
668 header->snap_sizes[i] =
669 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700670 } else {
Alex Elderccece232012-07-10 20:30:10 -0500671 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700672 header->snap_names = NULL;
673 header->snap_sizes = NULL;
674 }
Alex Elder849b4262012-07-09 21:04:24 -0500675
Alex Elder34b13182012-07-13 20:35:12 -0500676 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 header->obj_order = ondisk->options.order;
678 header->crypt_type = ondisk->options.crypt_type;
679 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500680
Alex Elder621901d2012-08-23 23:22:06 -0500681 /* Allocate and fill in the snapshot context */
682
Alex Elderf84344f2012-08-31 17:29:51 -0500683 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500684 size = sizeof (struct ceph_snap_context);
685 size += snap_count * sizeof (header->snapc->snaps[0]);
686 header->snapc = kzalloc(size, GFP_KERNEL);
687 if (!header->snapc)
688 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700689
690 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500691 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500693 for (i = 0; i < snap_count; i++)
694 header->snapc->snaps[i] =
695 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700696
697 return 0;
698
Alex Elder6a523252012-07-19 17:12:59 -0500699out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500700 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500701 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500703 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500704 kfree(header->object_prefix);
705 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500706
Alex Elder00f1f362012-02-07 12:03:36 -0600707 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708}
709
Alex Elder9e15b772012-10-30 19:40:33 -0500710static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
711{
712 struct rbd_snap *snap;
713
714 if (snap_id == CEPH_NOSNAP)
715 return RBD_SNAP_HEAD_NAME;
716
717 list_for_each_entry(snap, &rbd_dev->snaps, node)
718 if (snap_id == snap->id)
719 return snap->name;
720
721 return NULL;
722}
723
/*
 * Look up the named snapshot in rbd_dev's snapshot list.  On a match,
 * record its id, size, and features in the device's spec/mapping and
 * return 0; return -ENOENT if no snapshot has that name.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
741
/*
 * Fill in the device's mapping (snap_id, size, features, read_only)
 * from its spec's snapshot name.  Mapping the special head name maps
 * the image itself; any other name must match an existing snapshot,
 * and snapshot mappings are forced read-only.  On success the device
 * is marked as existing.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the image head (no snapshot) */
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		/* Snapshots are immutable */
		rbd_dev->mapping.read_only = true;
	}
	atomic_set(&rbd_dev->exists, 1);
done:
	return ret;
}
762
763static void rbd_header_free(struct rbd_image_header *header)
764{
Alex Elder849b4262012-07-09 21:04:24 -0500765 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500766 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500768 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500769 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500770 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800771 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500772 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700773}
774
/*
 * Return the name of the object backing the image segment that
 * contains image byte offset "offset".  The name has the form
 * "<object_prefix>.<segment#>" with the segment number formatted
 * as 12 hex digits.  Returns NULL on allocation or formatting
 * failure; the caller is responsible for freeing the result.
 */
static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	/* GFP_NOIO: this can be called while servicing block I/O */
	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	/* ret > MAX_OBJ_NAME_SIZE means snprintf truncated the name */
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700796
Alex Elder65ccfe22012-08-09 10:33:26 -0700797static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
798{
799 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700800
Alex Elder65ccfe22012-08-09 10:33:26 -0700801 return offset & (segment_size - 1);
802}
803
804static u64 rbd_segment_length(struct rbd_device *rbd_dev,
805 u64 offset, u64 length)
806{
807 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
808
809 offset &= segment_size - 1;
810
Alex Elderaafb2302012-09-06 16:00:54 -0500811 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700812 if (offset + length > segment_size)
813 length = segment_size - offset;
814
815 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700816}
817
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700818static int rbd_get_num_segments(struct rbd_image_header *header,
819 u64 ofs, u64 len)
820{
Alex Elderdf111be2012-08-09 10:33:26 -0700821 u64 start_seg;
822 u64 end_seg;
Alex Elder38901e02013-01-10 12:56:58 -0600823 u64 result;
Alex Elderdf111be2012-08-09 10:33:26 -0700824
825 if (!len)
826 return 0;
827 if (len - 1 > U64_MAX - ofs)
828 return -ERANGE;
829
830 start_seg = ofs >> header->obj_order;
831 end_seg = (ofs + len - 1) >> header->obj_order;
832
Alex Elder38901e02013-01-10 12:56:58 -0600833 result = end_seg - start_seg + 1;
834 if (result > (u64) INT_MAX)
835 return -ERANGE;
836
837 return (int) result;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700838}
839
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700840/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700841 * returns the size of an object in the image
842 */
843static u64 rbd_obj_bytes(struct rbd_image_header *header)
844{
845 return 1 << header->obj_order;
846}
847
848/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700849 * bio helpers
850 */
851
852static void bio_chain_put(struct bio *chain)
853{
854 struct bio *tmp;
855
856 while (chain) {
857 tmp = chain;
858 chain = chain->bi_next;
859 bio_put(tmp);
860 }
861}
862
/*
 * Zero the data in a chain of bios from byte offset start_ofs
 * (relative to the start of the chain) through the end of the
 * chain.  Segments entirely before start_ofs are left untouched.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * Zero from start_ofs within this segment,
				 * or from its beginning if we are already
				 * past start_ofs.
				 */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
889
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.  Returns the
 * clone, or NULL on bad arguments or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* byte offset into the first cloned bio_vec */
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the number of bytes used in the last segment */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: it holds exactly len bytes */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700970
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Source chain ended before len bytes were cloned */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone what fits in this bio, bounded by len remaining */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio; move to the next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Release any clones built so far */
	bio_chain_put(chain);

	return NULL;
}
1033
/*
 * Allocate and initialize an osd request op.  The variadic
 * arguments depend on the opcode; the supported calling forms are
 * shown in the comments on each case below.  Returns NULL on
 * allocation failure or unsupported opcode.  Free the result with
 * rbd_osd_req_op_destroy().
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only writes carry outbound data in the payload */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		/* Payload is class name + method name + input data */
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		/* NOTE(review): ver is stored in little-endian wire order */
		op->watch.ver = cpu_to_le64(op->watch.ver);
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1096
/* Free an op allocated by rbd_osd_req_op_create() (NULL is a no-op). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1101
/*
 * Record completion of one member of a request collection and end
 * as much of the block request as is now contiguously complete.
 * With no collection the whole request is ended immediately.
 * Completions may arrive out of order; only the contiguous prefix
 * starting at coll->num_done is ended here, under queue_lock.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   s32 ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, (int)ret, (unsigned long long)len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection: complete the request directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* Find how far the contiguous run of completed entries extends */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		/* __blk_end_request: queue_lock is already held */
		__blk_end_request(rq, (int)coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1139
/*
 * Complete a single rbd_request by ending its slot in the owning
 * collection (convenience wrapper for rbd_coll_end_req_index()).
 */
static void rbd_coll_end_req(struct rbd_request *rbd_req,
			     s32 ret, u64 len)
{
	rbd_coll_end_req_index(rbd_req->rq,
				rbd_req->coll, rbd_req->coll_index,
				ret, len);
}
1147
/*
 * Send a ceph osd request.
 *
 * With a callback (rbd_cb) the request completes asynchronously and
 * the callback owns cleanup; without one this waits for completion,
 * optionally returns the reassert version through *ver, and drops
 * the request itself.  A WATCH op with its flag set is made
 * lingering and recorded in rbd_dev->watch_request.
 * Returns 0 (async) / the wait result (sync), or a negative errno.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct rbd_request *rbd_req = NULL;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* Extra ref: the request holds the bio until completion */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	/* Track this request in its collection (if it belongs to one) */
	if (coll) {
		ret = -ENOMEM;
		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
		if (!rbd_req)
			goto done_osd_req;

		rbd_req->rq = rq;
		rbd_req->bio = bio;
		rbd_req->pages = pages;
		rbd_req->len = len;
		rbd_req->coll = coll;
		rbd_req->coll_index = coll_index;
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = rbd_req;

	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout; /* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	/* A flagged WATCH op lingers so the osd client re-sends it */
	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and drop the request ourselves */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* Undo the bio_get() taken above */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	kfree(rbd_req);
done_osd_req:
	ceph_osdc_put_request(osd_req);

	return ret;
}
1246
/*
 * Ceph osd op completion callback for asynchronous I/O requests.
 * Parses the reply, normalizes read results (a missing object or a
 * short read is zero-filled and treated as success), completes the
 * originating block request, and releases the request resources.
 */
static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
{
	struct rbd_request *rbd_req = osd_req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = (s32)le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == (s32)-ENOENT && read_op) {
		/* Object doesn't exist: a read of it is all zeroes */
		zero_bio_chain(rbd_req->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(rbd_req->bio, bytes);
		bytes = rbd_req->len;
	}

	rbd_coll_end_req(rbd_req, rc, bytes);

	if (rbd_req->bio)
		bio_chain_put(rbd_req->bio);

	ceph_osdc_put_request(osd_req);
	kfree(rbd_req);
}
1286
/*
 * Minimal completion callback: just drop the request reference.
 * Used for requests (e.g. notify acks) that need no result handling.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1292
/*
 * Do a synchronous ceph osd operation on the named object.
 *
 * A page vector sized for "inbound_size" bytes at "ofs" is allocated
 * to receive data; for read operations the data is copied into
 * "inbound" (if non-NULL) after the request completes.  Returns the
 * number of bytes handled or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* No callback, so rbd_do_request() waits for completion */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL, 0,
			  NULL,
			  ver);
	if (ret < 0)
		goto done;

	/* ret is the byte count received; copy it out for reads */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1333
/*
 * Do an asynchronous ceph osd operation for one segment of a block
 * request.  Maps the image offset to its backing object, builds a
 * read or write op depending on the request direction, and submits
 * it with rbd_req_cb() as the completion callback.  On submission
 * failure the collection slot is completed with the error.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	const char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *op;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* Reads carry no snapshot context; they target a snap id */
		rbd_assert(!snapc);
		snapid = rbd_dev->spec->snap_id;
	}

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
	if (!op)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     op,
			     coll, coll_index,
			     rbd_req_cb, NULL);
	if (ret < 0)
		/* Submission failed: complete this slot with the error */
		rbd_coll_end_req_index(rq, coll, coll_index,
					(s32)ret, seg_len);
	rbd_osd_req_op_destroy(op);
done:
	kfree(seg_name);
	return ret;
}
1397
/*
 * Request a synchronous osd read of "len" bytes at "ofs" from the
 * named object into "buf".  Returns bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
			       op, object_name, ofs, len, buf, ver);
	rbd_osd_req_op_destroy(op);

	return ret;
}
1420
/*
 * Acknowledge a watch notification on the header object.  The ack
 * is submitted asynchronously with a callback that only drops the
 * request, so no reply handling is done.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		return -ENOMEM;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  op,
			  NULL, 0,
			  rbd_simple_req_cb, NULL);

	rbd_osd_req_op_destroy(op);

	return ret;
}
1447
1448static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1449{
Alex Elder0ce1a792012-07-03 16:01:18 -05001450 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001451 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001452 int rc;
1453
Alex Elder0ce1a792012-07-03 16:01:18 -05001454 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001455 return;
1456
Alex Elderbd919d42012-07-13 20:35:11 -05001457 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1458 rbd_dev->header_name, (unsigned long long) notify_id,
1459 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001460 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001461 if (rc)
Alex Elder06ecc6c2012-11-01 10:17:15 -05001462 rbd_warn(rbd_dev, "got notification but failed to "
1463 " update snaps: %d\n", rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001464
Alex Elder7f0a24d2012-07-25 09:32:40 -05001465 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001466}
1467
1468/*
Alex Elder907703d2012-11-13 21:11:15 -06001469 * Request sync osd watch/unwatch. The value of "start" determines
1470 * whether a watch request is being initiated or torn down.
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001471 */
Alex Elder907703d2012-11-13 21:11:15 -06001472static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001473{
Alex Elder5efea492012-11-19 22:55:21 -06001474 struct ceph_osd_req_op *op;
1475 int ret = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001476
Alex Elderc0430642013-01-18 12:31:09 -06001477 rbd_assert(start ^ !!rbd_dev->watch_event);
1478 rbd_assert(start ^ !!rbd_dev->watch_request);
1479
Alex Elder907703d2012-11-13 21:11:15 -06001480 if (start) {
1481 struct ceph_osd_client *osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001482
Alex Elder907703d2012-11-13 21:11:15 -06001483 osdc = &rbd_dev->rbd_client->client->osdc;
1484 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1485 &rbd_dev->watch_event);
1486 if (ret < 0)
Alex Elder5efea492012-11-19 22:55:21 -06001487 return ret;
Alex Elder907703d2012-11-13 21:11:15 -06001488 }
1489
Alex Elder5efea492012-11-19 22:55:21 -06001490 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1491 rbd_dev->watch_event->cookie,
1492 rbd_dev->header.obj_version, start);
1493 if (op)
1494 ret = rbd_req_sync_op(rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001495 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Alex Elder907703d2012-11-13 21:11:15 -06001496 op, rbd_dev->header_name,
Alex Elder8b84de72012-11-20 14:17:17 -06001497 0, 0, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001498
Alex Elder5efea492012-11-19 22:55:21 -06001499 /* Cancel the event if we're tearing down, or on error */
1500
1501 if (!start || !op || ret < 0) {
Alex Elder907703d2012-11-13 21:11:15 -06001502 ceph_osdc_cancel_event(rbd_dev->watch_event);
1503 rbd_dev->watch_event = NULL;
1504 }
Alex Elder5efea492012-11-19 22:55:21 -06001505 rbd_osd_req_op_destroy(op);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001506
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001507 return ret;
1508}
1509
/*
 * Synchronous osd object method call.
 *
 * Invokes class "class_name" / method "method_name" on the osd
 * object named by "object_name".  "outbound"/"outbound_size" carry
 * the method's input payload (may be NULL/0); the reply is copied
 * into "inbound" (up to "inbound_size" bytes).  If "ver" is
 * non-NULL it receives the object version from the reply.
 *
 * Returns the (non-negative) result from rbd_req_sync_op() on
 * success, or a negative errno (-ENOMEM if the op can't be built).
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
			       object_name, 0, inbound_size, inbound,
			       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1548
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001549static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1550{
1551 struct rbd_req_coll *coll =
1552 kzalloc(sizeof(struct rbd_req_coll) +
1553 sizeof(struct rbd_req_status) * num_reqs,
1554 GFP_ATOMIC);
1555
1556 if (!coll)
1557 return NULL;
1558 coll->total = num_reqs;
1559 kref_init(&coll->kref);
1560 return coll;
1561}
1562
/*
 * Break a block-layer request into per-object osd requests.
 *
 * The byte range [ofs, ofs+size) is split at rbd object boundaries;
 * for each segment a clone of the relevant part of bio_chain is
 * submitted via rbd_do_op().  Completion status for all segments is
 * gathered in a shared rbd_req_coll: each submitted op holds its own
 * reference, and the initial reference is dropped when the loop ends.
 *
 * Returns 0 once all segments have been dispatched (individual
 * segment failures are reported through the collection), or a
 * negative errno if no segments could be set up at all.
 */
static int rbd_dev_do_request(struct request *rq,
				struct rbd_device *rbd_dev,
				struct ceph_snap_context *snapc,
				u64 ofs, unsigned int size,
				struct bio *bio_chain)
{
	int num_segs;
	struct rbd_req_coll *coll;
	unsigned int bio_offset;
	int cur_seg = 0;

	dout("%s 0x%x bytes at 0x%llx\n",
		rq_data_dir(rq) == WRITE ? "write" : "read",
		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
	if (num_segs <= 0)
		return num_segs;

	coll = rbd_alloc_coll(num_segs);
	if (!coll)
		return -ENOMEM;

	bio_offset = 0;
	do {
		/* Bytes remaining in the object containing offset "ofs" */
		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
		unsigned int clone_size;
		struct bio *bio_clone;

		BUG_ON(limit > (u64)UINT_MAX);
		clone_size = (unsigned int)limit;
		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);

		/* Each in-flight segment holds a collection reference */
		kref_get(&coll->kref);

		/* Pass a cloned bio chain via an osd request */

		bio_clone = bio_chain_clone_range(&bio_chain,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (bio_clone)
			(void)rbd_do_op(rq, rbd_dev, snapc,
					ofs, clone_size,
					bio_clone, coll, cur_seg);
		else
			/* Clone failed: record -ENOMEM for this segment */
			rbd_coll_end_req_index(rq, coll, cur_seg,
						(s32)-ENOMEM,
						clone_size);
		size -= clone_size;
		ofs += clone_size;

		cur_seg++;
	} while (size > 0);
	/* Drop the initial reference taken by rbd_alloc_coll() */
	kref_put(&coll->kref, rbd_coll_release);

	return 0;
}
1620
/*
 * block device queue callback
 *
 * Called by the block layer with q->queue_lock held.  The lock is
 * dropped while each request is translated into osd requests and
 * re-taken before the next blk_fetch_request() / end_request call,
 * as the block layer requires.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct ceph_snap_context *snapc = NULL;
		unsigned int size = 0;
		int result;

		dout("fetched request\n");

		/* Filter out block requests we don't understand */

		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}
		spin_unlock_irq(q->queue_lock);

		/* Write requests need a reference to the snapshot context */

		if (rq_data_dir(rq) == WRITE) {
			result = -EROFS;
			if (read_only) /* Can't write to a read-only device */
				goto out_end_request;

			/*
			 * Note that each osd request will take its
			 * own reference to the snapshot context
			 * supplied.  The reference we take here
			 * just guarantees the one we provide stays
			 * valid.
			 */
			down_read(&rbd_dev->header_rwsem);
			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
			up_read(&rbd_dev->header_rwsem);
			rbd_assert(snapc != NULL);
		} else if (!atomic_read(&rbd_dev->exists)) {
			/* Mapped snapshot was deleted out from under us */
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			dout("request for non-existent snapshot");
			result = -ENXIO;
			goto out_end_request;
		}

		size = blk_rq_bytes(rq);
		result = rbd_dev_do_request(rq, rbd_dev, snapc,
				blk_rq_pos(rq) * SECTOR_SIZE,
				size, rq->bio);
out_end_request:
		if (snapc)
			ceph_put_snap_context(snapc);
		spin_lock_irq(q->queue_lock);
		/* On dispatch failure (or empty request) end it here;
		 * otherwise completion happens via the collection. */
		if (!size || result < 0)
			__blk_end_request_all(rq, result);
	}
}
1682
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of bvec that may be merged into the
 * bio described by bmd (possibly 0).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	/* obj_order >= SECTOR_SHIFT, so sectors_per_obj is a power of 2 */
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1728
1729static void rbd_free_disk(struct rbd_device *rbd_dev)
1730{
1731 struct gendisk *disk = rbd_dev->disk;
1732
1733 if (!disk)
1734 return;
1735
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001736 if (disk->flags & GENHD_FL_UP)
1737 del_gendisk(disk);
1738 if (disk->queue)
1739 blk_cleanup_queue(disk->queue);
1740 put_disk(disk);
1741}
1742
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Free the previous (too-small) attempt, if any */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the header shrank or is corrupt */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		/* Retry if the snapshot count changed since we sized it */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1811
/*
 * Reload the on-disk (format 1) header and fill in the in-memory
 * image header from it.  On success, header->obj_version records
 * the header object's version at read time.  Returns 0 on success,
 * negative errno on failure.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	/* The on-disk copy is no longer needed either way */
	kfree(ondisk);

	return ret;
}
1832
/*
 * Tear down every snapshot attached to this rbd device.  Uses the
 * _safe list iterator because rbd_remove_snap_dev() unlinks each
 * entry from the list as it goes.
 */
static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		rbd_remove_snap_dev(snap);
}
1841
/*
 * Propagate a changed image size to the mapping and the block
 * device capacity.  Only applies when the base image is mapped;
 * a mapped snapshot's size is fixed, so nothing is done then.
 */
static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		return;

	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
	dout("setting size to %llu sectors", (unsigned long long) size);
	rbd_dev->mapping.size = (u64) size;
	set_capacity(rbd_dev->disk, size);
}
1854
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001855/*
1856 * only read the first part of the ondisk header, without the snaps info
1857 */
Alex Elder117973f2012-08-31 17:29:55 -05001858static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859{
1860 int ret;
1861 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001862
1863 ret = rbd_read_header(rbd_dev, &h);
1864 if (ret < 0)
1865 return ret;
1866
Josh Durgina51aa0c2011-12-05 10:35:04 -08001867 down_write(&rbd_dev->header_rwsem);
1868
Alex Elder94785542012-10-09 13:50:17 -07001869 /* Update image size, and check for resize of mapped image */
1870 rbd_dev->header.image_size = h.image_size;
1871 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001872
Alex Elder849b4262012-07-09 21:04:24 -05001873 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001874 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001875 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001876 /* osd requests may still refer to snapc */
1877 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001878
Alex Elderb8136232012-07-25 09:32:41 -05001879 if (hver)
1880 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001881 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001882 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001883 rbd_dev->header.snapc = h.snapc;
1884 rbd_dev->header.snap_names = h.snap_names;
1885 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001886 /* Free the extra copy of the object prefix */
1887 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1888 kfree(h.object_prefix);
1889
Alex Elder304f6802012-08-31 17:29:52 -05001890 ret = rbd_dev_snaps_update(rbd_dev);
1891 if (!ret)
1892 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001893
Josh Durginc6666012011-11-21 17:11:12 -08001894 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001895
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001896 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001897}
1898
/*
 * Re-read the image header, dispatching on the image format (1 or
 * 2).  If hver is non-NULL it receives the new header object
 * version.  Serialized against other control operations via
 * ctl_mutex.  Returns 0 on success, negative errno on failure.
 */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_refresh(rbd_dev, hver);
	else
		ret = rbd_dev_v2_refresh(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1913
/*
 * Allocate and set up the gendisk and request queue for an rbd
 * device.  I/O limits are sized to the rbd object size so requests
 * line up with object boundaries.  On success, rbd_dev->disk is set
 * and 0 is returned; on failure, -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning osd object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1962
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001963/*
1964 sysfs
1965*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001966
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1971
/*
 * sysfs "size" attribute: the mapped device's capacity in bytes.
 * The header rwsem is taken so the capacity read is consistent
 * with a concurrent refresh/resize.
 */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1984
Alex Elder34b13182012-07-13 20:35:12 -05001985/*
1986 * Note this shows the features for whatever's mapped, which is not
1987 * necessarily the base image.
1988 */
1989static ssize_t rbd_features_show(struct device *dev,
1990 struct device_attribute *attr, char *buf)
1991{
1992 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1993
1994 return sprintf(buf, "0x%016llx\n",
1995 (unsigned long long) rbd_dev->mapping.features);
1996}
1997
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001998static ssize_t rbd_major_show(struct device *dev,
1999 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002000{
Alex Elder593a9e72012-02-07 12:03:37 -06002001 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002002
2003 return sprintf(buf, "%d\n", rbd_dev->major);
2004}
2005
2006static ssize_t rbd_client_id_show(struct device *dev,
2007 struct device_attribute *attr, char *buf)
2008{
Alex Elder593a9e72012-02-07 12:03:37 -06002009 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002010
Alex Elder1dbb4392012-01-24 10:08:37 -06002011 return sprintf(buf, "client%lld\n",
2012 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002013}
2014
2015static ssize_t rbd_pool_show(struct device *dev,
2016 struct device_attribute *attr, char *buf)
2017{
Alex Elder593a9e72012-02-07 12:03:37 -06002018 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002019
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002020 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002021}
2022
Alex Elder9bb2f332012-07-12 10:46:35 -05002023static ssize_t rbd_pool_id_show(struct device *dev,
2024 struct device_attribute *attr, char *buf)
2025{
2026 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2027
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002028 return sprintf(buf, "%llu\n",
2029 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002030}
2031
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002032static ssize_t rbd_name_show(struct device *dev,
2033 struct device_attribute *attr, char *buf)
2034{
Alex Elder593a9e72012-02-07 12:03:37 -06002035 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002036
Alex Eldera92ffdf2012-10-30 19:40:33 -05002037 if (rbd_dev->spec->image_name)
2038 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2039
2040 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002041}
2042
Alex Elder589d30e2012-07-10 20:30:11 -05002043static ssize_t rbd_image_id_show(struct device *dev,
2044 struct device_attribute *attr, char *buf)
2045{
2046 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2047
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002048 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002049}
2050
Alex Elder34b13182012-07-13 20:35:12 -05002051/*
2052 * Shows the name of the currently-mapped snapshot (or
2053 * RBD_SNAP_HEAD_NAME for the base image).
2054 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002055static ssize_t rbd_snap_show(struct device *dev,
2056 struct device_attribute *attr,
2057 char *buf)
2058{
Alex Elder593a9e72012-02-07 12:03:37 -06002059 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002060
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002061 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002062}
2063
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;	/* advances past each formatted field */

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* Total bytes written across all fields */
	return (ssize_t) (bufp - buf);
}
2106
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002107static ssize_t rbd_image_refresh(struct device *dev,
2108 struct device_attribute *attr,
2109 const char *buf,
2110 size_t size)
2111{
Alex Elder593a9e72012-02-07 12:03:37 -06002112 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002113 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002114
Alex Elder117973f2012-08-31 17:29:55 -05002115 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002116
2117 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002118}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002119
/* sysfs attributes exposed under each /sys/bus/rbd/devices/<id>/ entry */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
/* "refresh" is write-only: writing to it triggers a header re-read */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Empty release: rbd_device lifetime is managed elsewhere, not by
 * the embedded struct device's refcount. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2165
2166
2167/*
2168 sysfs - snapshots
2169*/
2170
2171static ssize_t rbd_snap_size_show(struct device *dev,
2172 struct device_attribute *attr,
2173 char *buf)
2174{
2175 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2176
Josh Durgin35915382011-12-05 18:25:13 -08002177 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002178}
2179
2180static ssize_t rbd_snap_id_show(struct device *dev,
2181 struct device_attribute *attr,
2182 char *buf)
2183{
2184 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2185
Josh Durgin35915382011-12-05 18:25:13 -08002186 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002187}
2188
Alex Elder34b13182012-07-13 20:35:12 -05002189static ssize_t rbd_snap_features_show(struct device *dev,
2190 struct device_attribute *attr,
2191 char *buf)
2192{
2193 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2194
2195 return sprintf(buf, "0x%016llx\n",
2196 (unsigned long long) snap->features);
2197}
2198
/* sysfs attributes exposed for each snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Release callback: frees the rbd_snap when its device refcount
 * drops to zero (name string is owned by the snap). */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2230
/* Take an additional reference on spec (must be non-NULL) and
 * return it, for convenient assignment. */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2237
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on spec, freeing it when the count reaches
 * zero.  A NULL spec is silently ignored. */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2244
/*
 * Allocate a new, zeroed rbd_spec holding a single reference.
 * Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* No-op get/put pair; marked TEMPORARY upstream, kept as-is */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2258
2259static void rbd_spec_free(struct kref *kref)
2260{
2261 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2262
2263 kfree(spec->pool_name);
2264 kfree(spec->image_id);
2265 kfree(spec->image_name);
2266 kfree(spec->snap_name);
2267 kfree(spec);
2268}
2269
/*
 * Allocate and initialize an rbd_device for the given client and
 * image spec.  Takes ownership of the caller's references to both
 * (released later by rbd_dev_destroy()).  Returns NULL on
 * allocation failure.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2297
/*
 * Tear down an rbd_device created by rbd_dev_create(), dropping the
 * references it holds on its client and its (parent) image specs.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2306
Alex Elder304f6802012-08-31 17:29:52 -05002307static bool rbd_snap_registered(struct rbd_snap *snap)
2308{
2309 bool ret = snap->dev.type == &rbd_snap_device_type;
2310 bool reg = device_is_registered(&snap->dev);
2311
2312 rbd_assert(!ret ^ reg);
2313
2314 return ret;
2315}
2316
/*
 * Remove a snapshot from its device's snapshot list and, if its
 * device was registered, unregister it (dropping the device's
 * reference, which ultimately frees the snapshot via its release
 * callback).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2323
Alex Elder14e70852012-07-19 09:09:27 -05002324static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002325 struct device *parent)
2326{
2327 struct device *dev = &snap->dev;
2328 int ret;
2329
2330 dev->type = &rbd_snap_device_type;
2331 dev->parent = parent;
2332 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002333 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002334 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2335
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002336 ret = device_register(dev);
2337
2338 return ret;
2339}
2340
Alex Elder4e891e02012-07-10 20:30:10 -05002341static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002342 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002343 u64 snap_id, u64 snap_size,
2344 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002345{
Alex Elder4e891e02012-07-10 20:30:10 -05002346 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002347 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002348
2349 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002350 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002351 return ERR_PTR(-ENOMEM);
2352
2353 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002354 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002355 if (!snap->name)
2356 goto err;
2357
Alex Elderc8d18422012-07-10 20:30:11 -05002358 snap->id = snap_id;
2359 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002360 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002361
2362 return snap;
2363
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002364err:
2365 kfree(snap->name);
2366 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002367
2368 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002369}
2370
Alex Eldercd892122012-07-03 16:01:19 -05002371static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2372 u64 *snap_size, u64 *snap_features)
2373{
2374 char *snap_name;
2375
2376 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2377
2378 *snap_size = rbd_dev->header.snap_sizes[which];
2379 *snap_features = 0; /* No features for v1 */
2380
2381 /* Skip over names until we find the one we are looking for */
2382
2383 snap_name = rbd_dev->header.snap_names;
2384 while (which--)
2385 snap_name += strlen(snap_name) + 1;
2386
2387 return snap_name;
2388}
2389
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * On success returns 0 and fills *order and *snap_size; otherwise
 * returns the negative error from the "get_size" class method call.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" reply: order byte, then size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2422
/*
 * Refresh the in-core header's image_size and obj_order for the
 * base image of a format 2 rbd device.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2429
/*
 * Fetch a format 2 image's object name prefix via the
 * "get_object_prefix" class method and record it in the in-core
 * header.  Returns 0 on success or a negative error code.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed prefix string from the reply */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2466
/*
 * Fetch the feature bits for an image snapshot (or for the base
 * image if snap_id is CEPH_NOSNAP) via the "get_features" class
 * method.  Returns 0 and sets *snap_features on success, -ENXIO if
 * the server reports incompatible feature bits outside
 * RBD_FEATURES_ALL, or a negative error from the method call.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the reply: feature bits, then incompatible bits */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images whose required features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2500
/*
 * Refresh the in-core header's feature bits for the base image of a
 * format 2 rbd device.
 */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2506
/*
 * Fetch layering (clone) information for a format 2 image via the
 * "get_parent" class method.  On success the parent spec and overlap
 * are recorded in the rbd_dev; a decoded pool id of CEPH_NOPOOL
 * means the image has no parent, which is success, not an error.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case size of the encoded reply */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Any decode failure below reports -ERANGE */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent? No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	/* Success path: fall through into the shared cleanup below */
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* No-op if ownership was transferred */

	return ret;
}
2575
/*
 * Look up an image's name given its image id by asking the pool's
 * RBD_DIRECTORY object ("dir_get_name").  Returns a newly allocated
 * name string, or NULL on any failure -- callers treat the name as
 * optional (best effort).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: the image id as a length-prefixed string */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* Failure is tolerated; report NULL */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2624
2625/*
2626 * When a parent image gets probed, we only have the pool, image,
2627 * and snapshot ids but not the names of any of them. This call
2628 * is made later to fill in those names. It has to be done after
2629 * rbd_dev_snaps_update() has completed because some of the
2630 * information (in particular, snapshot name) is not available
2631 * until then.
2632 */
2633static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2634{
2635 struct ceph_osd_client *osdc;
2636 const char *name;
2637 void *reply_buf = NULL;
2638 int ret;
2639
2640 if (rbd_dev->spec->pool_name)
2641 return 0; /* Already have the names */
2642
2643 /* Look up the pool name */
2644
2645 osdc = &rbd_dev->rbd_client->client->osdc;
2646 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002647 if (!name) {
2648 rbd_warn(rbd_dev, "there is no pool with id %llu",
2649 rbd_dev->spec->pool_id); /* Really a BUG() */
2650 return -EIO;
2651 }
Alex Elder9e15b772012-10-30 19:40:33 -05002652
2653 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2654 if (!rbd_dev->spec->pool_name)
2655 return -ENOMEM;
2656
2657 /* Fetch the image name; tolerate failure here */
2658
2659 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002660 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002661 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002662 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05002663 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05002664
2665 /* Look up the snapshot name. */
2666
2667 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2668 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05002669 rbd_warn(rbd_dev, "no snapshot with id %llu",
2670 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05002671 ret = -EIO;
2672 goto out_err;
2673 }
2674 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2675 if(!rbd_dev->spec->snap_name)
2676 goto out_err;
2677
2678 return 0;
2679out_err:
2680 kfree(reply_buf);
2681 kfree(rbd_dev->spec->pool_name);
2682 rbd_dev->spec->pool_name = NULL;
2683
2684 return ret;
2685}
2686
Alex Elder6e14b1a2012-07-03 16:01:19 -05002687static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002688{
2689 size_t size;
2690 int ret;
2691 void *reply_buf;
2692 void *p;
2693 void *end;
2694 u64 seq;
2695 u32 snap_count;
2696 struct ceph_snap_context *snapc;
2697 u32 i;
2698
2699 /*
2700 * We'll need room for the seq value (maximum snapshot id),
2701 * snapshot count, and array of that many snapshot ids.
2702 * For now we have a fixed upper limit on the number we're
2703 * prepared to receive.
2704 */
2705 size = sizeof (__le64) + sizeof (__le32) +
2706 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2707 reply_buf = kzalloc(size, GFP_KERNEL);
2708 if (!reply_buf)
2709 return -ENOMEM;
2710
2711 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2712 "rbd", "get_snapcontext",
2713 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06002714 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002715 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2716 if (ret < 0)
2717 goto out;
2718
2719 ret = -ERANGE;
2720 p = reply_buf;
2721 end = (char *) reply_buf + size;
2722 ceph_decode_64_safe(&p, end, seq, out);
2723 ceph_decode_32_safe(&p, end, snap_count, out);
2724
2725 /*
2726 * Make sure the reported number of snapshot ids wouldn't go
2727 * beyond the end of our buffer. But before checking that,
2728 * make sure the computed size of the snapshot context we
2729 * allocate is representable in a size_t.
2730 */
2731 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2732 / sizeof (u64)) {
2733 ret = -EINVAL;
2734 goto out;
2735 }
2736 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2737 goto out;
2738
2739 size = sizeof (struct ceph_snap_context) +
2740 snap_count * sizeof (snapc->snaps[0]);
2741 snapc = kmalloc(size, GFP_KERNEL);
2742 if (!snapc) {
2743 ret = -ENOMEM;
2744 goto out;
2745 }
2746
2747 atomic_set(&snapc->nref, 1);
2748 snapc->seq = seq;
2749 snapc->num_snaps = snap_count;
2750 for (i = 0; i < snap_count; i++)
2751 snapc->snaps[i] = ceph_decode_64(&p);
2752
2753 rbd_dev->header.snapc = snapc;
2754
2755 dout(" snap context seq = %llu, snap_count = %u\n",
2756 (unsigned long long) seq, (unsigned int) snap_count);
2757
2758out:
2759 kfree(reply_buf);
2760
2761 return 0;
2762}
2763
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002764static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2765{
2766 size_t size;
2767 void *reply_buf;
2768 __le64 snap_id;
2769 int ret;
2770 void *p;
2771 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002772 char *snap_name;
2773
2774 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2775 reply_buf = kmalloc(size, GFP_KERNEL);
2776 if (!reply_buf)
2777 return ERR_PTR(-ENOMEM);
2778
2779 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2780 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2781 "rbd", "get_snapshot_name",
2782 (char *) &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06002783 reply_buf, size, NULL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002784 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2785 if (ret < 0)
2786 goto out;
2787
2788 p = reply_buf;
2789 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05002790 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002791 if (IS_ERR(snap_name)) {
2792 ret = PTR_ERR(snap_name);
2793 goto out;
2794 } else {
2795 dout(" snap_id 0x%016llx snap_name = %s\n",
2796 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2797 }
2798 kfree(reply_buf);
2799
2800 return snap_name;
2801out:
2802 kfree(reply_buf);
2803
2804 return ERR_PTR(ret);
2805}
2806
2807static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2808 u64 *snap_size, u64 *snap_features)
2809{
Alex Eldere0b49862013-01-09 14:44:18 -06002810 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002811 u8 order;
2812 int ret;
2813
2814 snap_id = rbd_dev->header.snapc->snaps[which];
2815 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2816 if (ret)
2817 return ERR_PTR(ret);
2818 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2819 if (ret)
2820 return ERR_PTR(ret);
2821
2822 return rbd_dev_v2_snap_name(rbd_dev, which);
2823}
2824
2825static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2826 u64 *snap_size, u64 *snap_features)
2827{
2828 if (rbd_dev->image_format == 1)
2829 return rbd_dev_v1_snap_info(rbd_dev, which,
2830 snap_size, snap_features);
2831 if (rbd_dev->image_format == 2)
2832 return rbd_dev_v2_snap_info(rbd_dev, which,
2833 snap_size, snap_features);
2834 return ERR_PTR(-EINVAL);
2835}
2836
Alex Elder117973f2012-08-31 17:29:55 -05002837static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2838{
2839 int ret;
2840 __u8 obj_order;
2841
2842 down_write(&rbd_dev->header_rwsem);
2843
2844 /* Grab old order first, to see if it changes */
2845
2846 obj_order = rbd_dev->header.obj_order,
2847 ret = rbd_dev_v2_image_size(rbd_dev);
2848 if (ret)
2849 goto out;
2850 if (rbd_dev->header.obj_order != obj_order) {
2851 ret = -EIO;
2852 goto out;
2853 }
2854 rbd_update_mapping_size(rbd_dev);
2855
2856 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2857 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2858 if (ret)
2859 goto out;
2860 ret = rbd_dev_snaps_update(rbd_dev);
2861 dout("rbd_dev_snaps_update returned %d\n", ret);
2862 if (ret)
2863 goto out;
2864 ret = rbd_dev_snaps_register(rbd_dev);
2865 dout("rbd_dev_snaps_register returned %d\n", ret);
2866out:
2867 up_write(&rbd_dev->header_rwsem);
2868
2869 return ret;
2870}
2871
Alex Elder9d475de2012-07-03 16:01:19 -05002872/*
Alex Elder35938152012-08-02 11:29:46 -05002873 * Scan the rbd device's current snapshot list and compare it to the
2874 * newly-received snapshot context. Remove any existing snapshots
2875 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2877 * And verify there are no changes to snapshots we already know
2878 * about.
2879 *
2880 * Assumes the snapshots in the snapshot context are sorted by
2881 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2882 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002883 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/*
	 * Merge walk: "index" advances through the new snapshot
	 * context while "links" advances through the existing list;
	 * both are ordered by snapshot id (see the comment above).
	 * NOTE(review): appears to rely on the caller holding
	 * header_rwsem for write (rbd_dev_v2_refresh() does) --
	 * confirm for all callers.
	 */
	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		/* CEPH_NOSNAP / NULL act as sentinels once a side runs out */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, the mapping is gone */
			if (rbd_dev->spec->snap_id == snap->id)
				atomic_set(&rbd_dev->exists, 0);
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* A known snapshot must not have changed */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2976
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 *
 * Returns 0 on success; -EIO if the parent rbd device is not itself
 * registered; otherwise the first failing registration's error.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* Snapshot devices are parented under the rbd device itself */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;	/* Stop at the first failure */
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
3001
/*
 * Register the rbd device with the driver core: it appears on the
 * rbd bus, parented under rbd_root_dev, named by its numeric dev_id.
 *
 * NOTE(review): mutex_lock_nested(..., SINGLE_DEPTH_NESTING)
 * suggests a caller may already hold ctl_mutex when this runs --
 * confirm against the callers before changing the locking.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3021
/* Unregister the rbd device from the driver core (undoes rbd_bus_add_dev()) */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3026
/* Highest device id handed out so far; ids start at 1 (see rbd_dev_id_get()) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003028
3029/*
Alex Elder499afd52012-02-02 08:13:29 -06003030 * Get a unique rbd identifier for the given new rbd_dev, and add
3031 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003032 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() means the first id handed out is 1, never 0 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* rbd_dev_list_lock guards the global device list */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003043
Alex Elder1ddbe942012-01-29 13:57:44 -06003044/*
Alex Elder499afd52012-02-02 08:13:29 -06003045 * Remove an rbd_dev from the global list, and record that its
3046 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003047 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	/* NOTE(review): dev_id is printed as u64 elsewhere; the int here
	 * may narrow for very large ids -- confirm the practical range. */
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3094
Alex Eldera725f65e2012-02-02 08:13:30 -06003095/*
Alex Eldere28fff262012-02-02 08:13:30 -06003096 * Skips over white space at *buf, and updates *buf to point to the
3097 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003098 * the token (string of non-white space characters) found. Note
3099 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003100 */
/*
 * Skip over white space at *buf, updating *buf to point at the first
 * non-space character (if any), and return the length of the token
 * (run of non-white-space characters) that follows.  *buf must be
 * '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/* Exactly the characters isspace() accepts in the "C"/"POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
3113
3114/*
3115 * Finds the next token in *buf, and if the provided token buffer is
3116 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003117 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3118 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003119 *
3120 * Returns the length of the token found (not including the '\0').
3121 * Return value will be 0 if no token is found, and it will be >=
3122 * token_size if the token would not fit.
3123 *
Alex Elder593a9e72012-02-07 12:03:37 -06003124 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003125 * found token. Note that this occurs even if the token buffer is
3126 * too small to hold it.
3127 */
3128static inline size_t copy_token(const char **buf,
3129 char *token,
3130 size_t token_size)
3131{
3132 size_t len;
3133
3134 len = next_token(buf);
3135 if (len < token_size) {
3136 memcpy(token, *buf, len);
3137 *(token + len) = '\0';
3138 }
3139 *buf += len;
3140
3141 return len;
3142}
3143
3144/*
Alex Elderea3352f2012-07-09 21:04:23 -05003145 * Finds the next token in *buf, dynamically allocates a buffer big
3146 * enough to hold a copy of it, and copies the token into the new
3147 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3148 * that a duplicate buffer is created even for a zero-length token.
3149 *
3150 * Returns a pointer to the newly-allocated duplicate, or a null
3151 * pointer if memory for the duplicate was not available. If
3152 * the lenp argument is a non-null pointer, the length of the token
3153 * (not including the '\0') is returned in *lenp.
3154 *
3155 * If successful, the *buf pointer will be updated to point beyond
3156 * the end of the found token.
3157 *
3158 * Note: uses GFP_KERNEL for allocation.
3159 */
3160static inline char *dup_token(const char **buf, size_t *lenp)
3161{
3162 char *dup;
3163 size_t len;
3164
3165 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003166 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003167 if (!dup)
3168 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003169 *(dup + len) = '\0';
3170 *buf += len;
3171
3172 if (lenp)
3173 *lenp = len;
3174
3175 return dup;
3176}
3177
3178/*
Alex Elder859c31d2012-10-25 23:34:42 -05003179 * Parse the options provided for an "rbd add" (i.e., rbd image
3180 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3181 * and the data written is passed here via a NUL-terminated buffer.
3182 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003183 *
Alex Elder859c31d2012-10-25 23:34:42 -05003184 * The information extracted from these options is recorded in
3185 * the other parameters which return dynamically-allocated
3186 * structures:
3187 * ceph_opts
3188 * The address of a pointer that will refer to a ceph options
3189 * structure. Caller must release the returned pointer using
3190 * ceph_destroy_options() when it is no longer needed.
3191 * rbd_opts
3192 * Address of an rbd options pointer. Fully initialized by
3193 * this function; caller must release with kfree().
3194 * spec
3195 * Address of an rbd image specification pointer. Fully
3196 * initialized by this function based on parsed options.
3197 * Caller must release with rbd_spec_put().
3198 *
3199 * The options passed take this form:
3200 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3201 * where:
3202 * <mon_addrs>
3203 * A comma-separated list of one or more monitor addresses.
3204 * A monitor address is an ip address, optionally followed
3205 * by a port number (separated by a colon).
3206 * I.e.: ip1[:port1][,ip2[:port2]...]
3207 * <options>
3208 * A comma-separated list of ceph and/or rbd options.
3209 * <pool_name>
3210 * The name of the rados pool containing the rbd image.
3211 * <image_name>
3212 * The name of the image in that pool to map.
3213 * <snap_id>
3214 * An optional snapshot id. If provided, the mapping will
3215 * present data from the image at the time that snapshot was
3216 * created. The image head is used if no snapshot id is
3217 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003218 */
Alex Elder859c31d2012-10-25 23:34:42 -05003219static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05003220 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05003221 struct rbd_options **opts,
3222 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06003223{
Alex Elderd22f76e2012-07-12 10:46:35 -05003224 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05003225 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05003226 const char *mon_addrs;
3227 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05003228 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003229 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05003230 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05003231 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06003232
3233 /* The first four tokens are required */
3234
Alex Elder7ef32142012-02-02 08:13:30 -06003235 len = next_token(&buf);
Alex Elder4fb5d672012-11-01 10:17:15 -05003236 if (!len) {
3237 rbd_warn(NULL, "no monitor address(es) provided");
3238 return -EINVAL;
3239 }
Alex Elder0ddebc02012-10-25 23:34:41 -05003240 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05003241 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06003242 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06003243
Alex Elderdc79b112012-10-25 23:34:41 -05003244 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05003245 options = dup_token(&buf, NULL);
3246 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05003247 return -ENOMEM;
Alex Elder4fb5d672012-11-01 10:17:15 -05003248 if (!*options) {
3249 rbd_warn(NULL, "no options provided");
3250 goto out_err;
3251 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003252
Alex Elder859c31d2012-10-25 23:34:42 -05003253 spec = rbd_spec_alloc();
3254 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05003255 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003256
3257 spec->pool_name = dup_token(&buf, NULL);
3258 if (!spec->pool_name)
3259 goto out_mem;
Alex Elder4fb5d672012-11-01 10:17:15 -05003260 if (!*spec->pool_name) {
3261 rbd_warn(NULL, "no pool name provided");
3262 goto out_err;
3263 }
Alex Eldere28fff262012-02-02 08:13:30 -06003264
Alex Elder69e7a022012-11-01 08:39:26 -05003265 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05003266 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003267 goto out_mem;
Alex Elder4fb5d672012-11-01 10:17:15 -05003268 if (!*spec->image_name) {
3269 rbd_warn(NULL, "no image name provided");
3270 goto out_err;
3271 }
Alex Eldere28fff262012-02-02 08:13:30 -06003272
Alex Elderf28e5652012-10-25 23:34:41 -05003273 /*
3274 * Snapshot name is optional; default is to use "-"
3275 * (indicating the head/no snapshot).
3276 */
Alex Elder3feeb8942012-08-31 17:29:52 -05003277 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05003278 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05003279 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3280 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05003281 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05003282 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05003283 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05003284 }
Alex Elder4caf35f2012-11-01 08:39:27 -05003285 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
Alex Elder859c31d2012-10-25 23:34:42 -05003286 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003287 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003288 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05003289
Alex Elder0ddebc02012-10-25 23:34:41 -05003290 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06003291
Alex Elder4e9afeb2012-10-25 23:34:41 -05003292 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3293 if (!rbd_opts)
3294 goto out_mem;
3295
3296 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05003297
Alex Elder859c31d2012-10-25 23:34:42 -05003298 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05003299 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05003300 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05003301 if (IS_ERR(copts)) {
3302 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05003303 goto out_err;
3304 }
Alex Elder859c31d2012-10-25 23:34:42 -05003305 kfree(options);
3306
3307 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003308 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05003309 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05003310
Alex Elderdc79b112012-10-25 23:34:41 -05003311 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05003312out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05003313 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05003314out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05003315 kfree(rbd_opts);
3316 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05003317 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05003318
Alex Elderdc79b112012-10-25 23:34:41 -05003319 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06003320}
3321
Alex Elder589d30e2012-07-10 20:30:11 -05003322/*
3323 * An rbd format 2 image has a unique identifier, distinct from the
3324 * name given to it by the user. Internally, that identifier is
3325 * what's used to specify the names of objects related to the image.
3326 *
3327 * A special "rbd id" object is used to map an rbd image name to its
3328 * id. If that object doesn't exist, then there is no v2 rbd image
3329 * with the supplied name.
3330 *
3331 * This function will record the given rbd_dev's image_id field if
3332 * it can be determined, and in that case will return 0. If any
3333 * errors occur a negative errno will be returned and the rbd_dev's
3334 * image_id field will be unchanged (and should be NULL).
3335 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the NUL, so this covers prefix + name + NUL */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* Leave image_id NULL on error, as the contract requires */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3397
Alex Eldera30b71b2012-07-10 20:30:11 -05003398static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3399{
3400 int ret;
3401 size_t size;
3402
3403 /* Version 1 images have no id; empty string is used */
3404
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003405 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3406 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003407 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05003408
3409 /* Record the header object name for this rbd image. */
3410
Alex Elder69e7a022012-11-01 08:39:26 -05003411 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003412 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3413 if (!rbd_dev->header_name) {
3414 ret = -ENOMEM;
3415 goto out_err;
3416 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003417 sprintf(rbd_dev->header_name, "%s%s",
3418 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003419
3420 /* Populate rbd image metadata */
3421
3422 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3423 if (ret < 0)
3424 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05003425
3426 /* Version 1 images have no parent (no layering) */
3427
3428 rbd_dev->parent_spec = NULL;
3429 rbd_dev->parent_overlap = 0;
3430
Alex Eldera30b71b2012-07-10 20:30:11 -05003431 rbd_dev->image_format = 1;
3432
3433 dout("discovered version 1 image, header name is %s\n",
3434 rbd_dev->header_name);
3435
3436 return 0;
3437
3438out_err:
3439 kfree(rbd_dev->header_name);
3440 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003441 kfree(rbd_dev->spec->image_id);
3442 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003443
3444 return ret;
3445}
3446
/*
 * Finish probing a format 2 image: record the "rbd_header.<id>"
 * object name, then fetch the image's size/order, object prefix,
 * features, optional parent (layering) info, and snapshot context.
 * On error, everything gathered here is released and reset.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
	     rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything set above, parent info included */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3519
Alex Elder83a06262012-10-30 15:47:17 -05003520static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3521{
3522 int ret;
3523
3524 /* no need to lock here, as rbd_dev is not registered yet */
3525 ret = rbd_dev_snaps_update(rbd_dev);
3526 if (ret)
3527 return ret;
3528
Alex Elder9e15b772012-10-30 19:40:33 -05003529 ret = rbd_dev_probe_update_spec(rbd_dev);
3530 if (ret)
3531 goto err_out_snaps;
3532
Alex Elder83a06262012-10-30 15:47:17 -05003533 ret = rbd_dev_set_mapping(rbd_dev);
3534 if (ret)
3535 goto err_out_snaps;
3536
3537 /* generate unique id: find highest unique id, add one */
3538 rbd_dev_id_get(rbd_dev);
3539
3540 /* Fill in the device name, now that we have its id. */
3541 BUILD_BUG_ON(DEV_NAME_LEN
3542 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3543 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3544
3545 /* Get our block major device number. */
3546
3547 ret = register_blkdev(0, rbd_dev->name);
3548 if (ret < 0)
3549 goto err_out_id;
3550 rbd_dev->major = ret;
3551
3552 /* Set up the blkdev mapping. */
3553
3554 ret = rbd_init_disk(rbd_dev);
3555 if (ret)
3556 goto err_out_blkdev;
3557
3558 ret = rbd_bus_add_dev(rbd_dev);
3559 if (ret)
3560 goto err_out_disk;
3561
3562 /*
3563 * At this point cleanup in the event of an error is the job
3564 * of the sysfs code (initiated by rbd_bus_del_dev()).
3565 */
3566 down_write(&rbd_dev->header_rwsem);
3567 ret = rbd_dev_snaps_register(rbd_dev);
3568 up_write(&rbd_dev->header_rwsem);
3569 if (ret)
3570 goto err_out_bus;
3571
Alex Elderc0430642013-01-18 12:31:09 -06003572 ret = rbd_req_sync_watch(rbd_dev, 1);
Alex Elder83a06262012-10-30 15:47:17 -05003573 if (ret)
3574 goto err_out_bus;
3575
3576 /* Everything's ready. Announce the disk to the world. */
3577
3578 add_disk(rbd_dev->disk);
3579
3580 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3581 (unsigned long long) rbd_dev->mapping.size);
3582
3583 return ret;
3584err_out_bus:
3585 /* this will also clean up rest of rbd_dev stuff */
3586
3587 rbd_bus_del_dev(rbd_dev);
3588
3589 return ret;
3590err_out_disk:
3591 rbd_free_disk(rbd_dev);
3592err_out_blkdev:
3593 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3594err_out_id:
3595 rbd_dev_id_put(rbd_dev);
3596err_out_snaps:
3597 rbd_remove_all_snaps(rbd_dev);
3598
3599 return ret;
3600}
3601
Alex Eldera30b71b2012-07-10 20:30:11 -05003602/*
3603 * Probe for the existence of the header object for the given rbd
3604 * device. For format 2 images this includes determining the image
3605 * id.
3606 */
3607static int rbd_dev_probe(struct rbd_device *rbd_dev)
3608{
3609 int ret;
3610
3611 /*
3612 * Get the id from the image id object. If it's not a
3613 * format 2 image, we'll get ENOENT back, and we'll assume
3614 * it's a format 1 image.
3615 */
3616 ret = rbd_dev_image_id(rbd_dev);
3617 if (ret)
3618 ret = rbd_dev_v1_probe(rbd_dev);
3619 else
3620 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003621 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003622 dout("probe failed, returning %d\n", ret);
3623
Alex Elder83a06262012-10-30 15:47:17 -05003624 return ret;
3625 }
3626
3627 ret = rbd_dev_probe_finish(rbd_dev);
3628 if (ret)
3629 rbd_header_free(&rbd_dev->header);
3630
Alex Eldera30b71b2012-07-10 20:30:11 -05003631 return ret;
3632}
3633
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003634static ssize_t rbd_add(struct bus_type *bus,
3635 const char *buf,
3636 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003637{
Alex Eldercb8627c2012-07-09 21:04:23 -05003638 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05003639 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003640 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05003641 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05003642 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06003643 struct ceph_osd_client *osdc;
3644 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003645
3646 if (!try_module_get(THIS_MODULE))
3647 return -ENODEV;
3648
Alex Eldera725f65e2012-02-02 08:13:30 -06003649 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05003650 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05003651 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05003652 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06003653
Alex Elder9d3997f2012-10-25 23:34:42 -05003654 rbdc = rbd_get_client(ceph_opts);
3655 if (IS_ERR(rbdc)) {
3656 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05003657 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05003658 }
Alex Elderc53d5892012-10-25 23:34:42 -05003659 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003660
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003661 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05003662 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05003663 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003664 if (rc < 0)
3665 goto err_out_client;
Alex Elder859c31d2012-10-25 23:34:42 -05003666 spec->pool_id = (u64) rc;
3667
Alex Elder0903e872012-11-14 12:25:19 -06003668 /* The ceph file layout needs to fit pool id in 32 bits */
3669
3670 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
3671 rc = -EIO;
3672 goto err_out_client;
3673 }
3674
Alex Elderc53d5892012-10-25 23:34:42 -05003675 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05003676 if (!rbd_dev)
3677 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05003678 rbdc = NULL; /* rbd_dev now owns this */
3679 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003680
Alex Elderbd4ba652012-10-25 23:34:42 -05003681 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05003682 kfree(rbd_opts);
3683 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05003684
Alex Eldera30b71b2012-07-10 20:30:11 -05003685 rc = rbd_dev_probe(rbd_dev);
3686 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05003687 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05003688
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003689 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05003690err_out_rbd_dev:
3691 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05003692err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05003693 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05003694err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05003695 if (ceph_opts)
3696 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05003697 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05003698 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05003699err_out_module:
3700 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003701
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003702 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06003703
3704 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003705}
3706
Alex Elderde71a292012-07-03 16:01:19 -05003707static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003708{
3709 struct list_head *tmp;
3710 struct rbd_device *rbd_dev;
3711
Alex Eldere124a822012-01-29 13:57:44 -06003712 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003713 list_for_each(tmp, &rbd_dev_list) {
3714 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003715 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003716 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003717 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003718 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003719 }
Alex Eldere124a822012-01-29 13:57:44 -06003720 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003721 return NULL;
3722}
3723
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003724static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003725{
Alex Elder593a9e72012-02-07 12:03:37 -06003726 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003727
Alex Elder1dbb4392012-01-24 10:08:37 -06003728 if (rbd_dev->watch_request) {
3729 struct ceph_client *client = rbd_dev->rbd_client->client;
3730
3731 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003732 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06003733 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003734 if (rbd_dev->watch_event)
Alex Elder907703d2012-11-13 21:11:15 -06003735 rbd_req_sync_watch(rbd_dev, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003736
3737 /* clean up and free blkdev */
3738 rbd_free_disk(rbd_dev);
3739 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06003740
Alex Elder2ac4e752012-07-10 20:30:10 -05003741 /* release allocated disk header fields */
3742 rbd_header_free(&rbd_dev->header);
3743
Alex Elder32eec682012-02-08 16:11:14 -06003744 /* done with the id, and with the rbd_dev */
Alex Eldere2839302012-08-29 17:11:06 -05003745 rbd_dev_id_put(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05003746 rbd_assert(rbd_dev->rbd_client != NULL);
3747 rbd_dev_destroy(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003748
3749 /* release module ref */
3750 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003751}
3752
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003753static ssize_t rbd_remove(struct bus_type *bus,
3754 const char *buf,
3755 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003756{
3757 struct rbd_device *rbd_dev = NULL;
3758 int target_id, rc;
3759 unsigned long ul;
3760 int ret = count;
3761
3762 rc = strict_strtoul(buf, 10, &ul);
3763 if (rc)
3764 return rc;
3765
3766 /* convert to int; abort if we lost anything in the conversion */
3767 target_id = (int) ul;
3768 if (target_id != ul)
3769 return -EINVAL;
3770
3771 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3772
3773 rbd_dev = __rbd_get_dev(target_id);
3774 if (!rbd_dev) {
3775 ret = -ENOENT;
3776 goto done;
3777 }
3778
Alex Elder42382b72012-11-16 09:29:16 -06003779 if (rbd_dev->open_count) {
3780 ret = -EBUSY;
3781 goto done;
3782 }
3783
Alex Elder41f38c22012-10-25 23:34:40 -05003784 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003785 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003786
3787done:
3788 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003789
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003790 return ret;
3791}
3792
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003793/*
3794 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003795 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003796 */
3797static int rbd_sysfs_init(void)
3798{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003799 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003800
Alex Elderfed4c142012-02-07 12:03:36 -06003801 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003802 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003803 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003804
Alex Elderfed4c142012-02-07 12:03:36 -06003805 ret = bus_register(&rbd_bus_type);
3806 if (ret < 0)
3807 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003808
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003809 return ret;
3810}
3811
/* Tear down the sysfs entries, in reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3817
3818int __init rbd_init(void)
3819{
3820 int rc;
3821
3822 rc = rbd_sysfs_init();
3823 if (rc)
3824 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003825 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003826 return 0;
3827}
3828
/* Module exit point: remove the sysfs control files */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3833
/* Wire the module entry/exit points to the functions above */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");