blob: 1428795571c95490d02a6017b5b2635add7889ce [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elder2647ba32012-11-19 22:55:21 -060055/* It might be useful to have these defined elsewhere */
Alex Elderdf111be2012-08-09 10:33:26 -070056
Alex Elder2647ba32012-11-19 22:55:21 -060057#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
Alex Elderf0f8cef2012-01-29 13:57:44 -060062#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
Alex Elderd4b125e2012-07-03 16:01:19 -050067#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
Alex Elder35d489f2012-07-03 16:01:19 -050071#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
73#define RBD_SNAP_HEAD_NAME "-"
74
Alex Elder9e15b772012-10-30 19:40:33 -050075/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050078
Alex Elder1e130192012-07-03 16:01:19 -050079#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050080
Alex Elderd8891402012-10-09 13:50:17 -070081/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
Alex Elder81a89792012-02-02 08:13:30 -060089/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060096#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names (NUL-terminated copy) */
	u64 features;		/* always 0 for format 1 images */
	__u8 obj_order;		/* object size is 1 << obj_order bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current size of the image */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	char *snap_names;	/* packed, NUL-separated snapshot names */
	u64 *snap_sizes;	/* image size at each snapshot, same order */

	u64 obj_version;	/* version of header object last read — TODO confirm */
};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
struct rbd_spec {
	u64 pool_id;		/* id of the pool holding the image */
	char *pool_name;	/* name corresponding to pool_id */

	char *image_id;		/* image id within the pool */
	char *image_name;	/* user-visible name (may be NULL, see above) */

	u64 snap_id;		/* CEPH_NOSNAP when mapping the image head */
	char *snap_name;	/* RBD_SNAP_HEAD_NAME ("-") for the head */

	struct kref kref;	/* may be shared by parent and child rbd_dev */
};
155
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying libceph client */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry on global rbd_client_list */
};
164
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* How the data for an object request is carried */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

/*
 * A single request against one RADOS object; one of possibly many
 * making up an image request.
 */
struct rbd_obj_request {
	const char	*object_name;
	u64		offset;		/* object start byte */
	u64		length;		/* bytes from offset */

	struct rbd_img_request	*img_request;
	struct list_head	links;	/* img_request->obj_requests */
	u32		which;		/* posn image request list; BAD_WHICH if none */

	enum obj_request_type	type;
	union {				/* interpretation selected by type */
		struct bio	*bio_list;	/* OBJ_REQUEST_BIO */
		struct {			/* OBJ_REQUEST_PAGES */
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	s32			result;
	atomic_t		done;

	rbd_obj_callback_t	callback;	/* invoked on completion */
	struct completion	completion;	/* for synchronous waiters */

	struct kref		kref;
};
207
/*
 * State for one block-layer request against an rbd image, carried out
 * by the object requests kept on the obj_requests list.
 */
struct rbd_img_request {
	struct request		*rq;		/* originating block request */
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	bool			write_request;	/* false for read */
	union {				/* selected by write_request */
		struct ceph_snap_context *snapc;	/* for writes */
		u64		snap_id;		/* for reads */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

/* Iterators over an image request's object requests */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
234
/* In-memory (and sysfs) representation of one image snapshot */
struct rbd_snap {
	struct device		dev;	/* sysfs device for this snapshot */
	const char		*name;
	u64			size;	/* image size at this snapshot */
	struct list_head	node;	/* entry on rbd_dev->snaps */
	u64			id;
	u64			features;
};
243
/* Properties of whatever (image head or snapshot) a device has mapped */
struct rbd_mapping {
	u64                     size;		/* mapped size */
	u64                     features;
	bool			read_only;	/* always true for snapshots */
};
249
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;	/* (possibly shared) cluster client */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	atomic_t		exists;		/* set once the mapping is resolved */
	struct rbd_spec		*spec;		/* pool/image/snapshot identity */

	char			*header_name;	/* image header object name — TODO confirm */

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	struct rbd_spec		*parent_spec;	/* non-NULL for layered images */
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* entry on global rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by ctl_mutex */
};
294
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600296
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600298static DEFINE_SPINLOCK(rbd_dev_list_lock);
299
Alex Elder432b8582012-01-29 13:57:44 -0600300static LIST_HEAD(rbd_client_list); /* clients */
301static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302
Alex Elder304f6802012-08-31 17:29:52 -0500303static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
304static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
305
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800306static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500307static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800308
Alex Elderf0f8cef2012-01-29 13:57:44 -0600309static ssize_t rbd_add(struct bus_type *bus, const char *buf,
310 size_t count);
311static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
312 size_t count);
313
/* Write-only bus attributes: echo into /sys/bus/rbd/{add,remove} */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices registered with sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
333
/*
 * Log a KERN_WARNING message tagged with the most specific identity
 * available for rbd_dev: the disk name if the gendisk exists, else
 * the image name or image id from its spec, else the raw pointer.
 * rbd_dev may be NULL, giving a plain driver-level message.  The
 * format string is checked like printf (arguments 2 and 3).
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;		/* the %pV specifier consumes vaf */

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
360
#ifdef RBD_DEBUG
/*
 * Verify an invariant; report and BUG() when it does not hold.
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement — the previous bare-if form was a dangling-else hazard
 * when used unbraced in an if/else body.
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800373
Alex Elder117973f2012-08-31 17:29:55 -0500374static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
375static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	/* A read-only mapping can never be opened for writing */
	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	/* open_count and the device reference are serialized by ctl_mutex */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* paired with put_device() in rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
392
/* Undo rbd_open(): drop the open count and the sysfs device reference */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);	/* must balance an rbd_open() */
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
405
/* Block-layer operations for mapped rbd devices */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
411
412/*
413 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500414 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415 */
Alex Elderf8c38922012-08-10 13:12:07 -0700416static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417{
418 struct rbd_client *rbdc;
419 int ret = -ENOMEM;
420
421 dout("rbd_client_create\n");
422 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
423 if (!rbdc)
424 goto out_opt;
425
426 kref_init(&rbdc->kref);
427 INIT_LIST_HEAD(&rbdc->node);
428
Alex Elderbc534d862012-01-29 13:57:44 -0600429 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
430
Alex Elder43ae4702012-07-03 16:01:18 -0500431 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600433 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500434 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700435
436 ret = ceph_open_session(rbdc->client);
437 if (ret < 0)
438 goto out_err;
439
Alex Elder432b8582012-01-29 13:57:44 -0600440 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600442 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443
Alex Elderbc534d862012-01-29 13:57:44 -0600444 mutex_unlock(&ctl_mutex);
445
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700446 dout("rbd_client_create created %p\n", rbdc);
447 return rbdc;
448
449out_err:
450 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600451out_mutex:
452 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kfree(rbdc);
454out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500455 if (ceph_opts)
456 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400457 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458}
459
460/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 * Find a ceph client with specific addr and configuration. If
462 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700464static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465{
466 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700467 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elder43ae4702012-07-03 16:01:18 -0500469 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700470 return NULL;
471
Alex Elder1f7ba332012-08-10 13:12:07 -0700472 spin_lock(&rbd_client_list_lock);
473 list_for_each_entry(client_node, &rbd_client_list, node) {
474 if (!ceph_compare_options(ceph_opts, client_node->client)) {
475 kref_get(&client_node->kref);
476 found = true;
477 break;
478 }
479 }
480 spin_unlock(&rbd_client_list_lock);
481
482 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483}
484
/*
 * mount options
 *
 * The Opt_last_* markers partition the token space by argument type:
 * tokens below Opt_last_int take an integer argument, tokens between
 * Opt_last_int and Opt_last_string take a string, and tokens between
 * Opt_last_string and Opt_last_bool are boolean flags (no argument).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Options parsed from the "add" string for one mapping */
struct rbd_options {
	bool	read_only;	/* map the image read-only */
};

#define RBD_READ_ONLY_DEFAULT	false
515
/*
 * match_token() callback for parsing the rbd options string.  @c is
 * one comma-separated option token; @private is the struct
 * rbd_options being filled in.  Returns 0 on success, -EINVAL for an
 * unrecognized token, or the match_int() error for a malformed
 * integer argument.  A token present in rbd_opts_tokens but missing
 * from the switch below is a programming error (rbd_assert).
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The Opt_last_* ranges identify the argument type (see enum) */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}
556
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  In either case ownership of ceph_opts is
 * consumed: it is either destroyed here (client reused) or handed to
 * the newly created client.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client; its options are already in place */
	ceph_destroy_options(ceph_opts);
	return rbdc;
}
573
/*
 * Destroy ceph client
 *
 * Called via kref_put() when the last reference goes away.  Takes
 * rbd_client_list_lock itself to unlink the client from the global
 * list, so the caller must NOT already hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
591
592/*
593 * Drop reference to ceph client node. If it's not referenced anymore, release
594 * it.
595 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500596static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597{
Alex Elderc53d5892012-10-25 23:34:42 -0500598 if (rbdc)
599 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600}
601
Alex Eldera30b71b2012-07-10 20:30:11 -0500602static bool rbd_image_format_valid(u32 image_format)
603{
604 return image_format == 1 || image_format == 2;
605}
606
/*
 * Sanity-check an on-disk format 1 image header before trusting it:
 * verify the magic text, require a usable object order, and reject
 * snapshot counts / name lengths whose in-memory representation
 * would overflow a size_t.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
645
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646/*
647 * Create a new header structure, translate header format from the on-disk
648 * header.
649 */
650static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500651 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652{
Alex Elderccece232012-07-10 20:30:10 -0500653 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500654 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500655 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500656 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657
Alex Elder6a523252012-07-19 17:12:59 -0500658 memset(header, 0, sizeof (*header));
659
Alex Elder103a1502012-08-02 11:29:45 -0500660 snap_count = le32_to_cpu(ondisk->snap_count);
661
Alex Elder58c17b02012-08-23 23:22:06 -0500662 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
663 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500664 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500666 memcpy(header->object_prefix, ondisk->object_prefix, len);
667 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600668
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500670 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
671
Alex Elder621901d2012-08-23 23:22:06 -0500672 /* Save a copy of the snapshot names */
673
Alex Elderf785cc12012-08-23 23:22:06 -0500674 if (snap_names_len > (u64) SIZE_MAX)
675 return -EIO;
676 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500678 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500679 /*
680 * Note that rbd_dev_v1_header_read() guarantees
681 * the ondisk buffer we're working with has
682 * snap_names_len bytes beyond the end of the
683 * snapshot id array, this memcpy() is safe.
684 */
685 memcpy(header->snap_names, &ondisk->snaps[snap_count],
686 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500687
Alex Elder621901d2012-08-23 23:22:06 -0500688 /* Record each snapshot's size */
689
Alex Elderd2bb24e2012-07-26 23:37:14 -0500690 size = snap_count * sizeof (*header->snap_sizes);
691 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500693 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500694 for (i = 0; i < snap_count; i++)
695 header->snap_sizes[i] =
696 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 } else {
Alex Elderccece232012-07-10 20:30:10 -0500698 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 header->snap_names = NULL;
700 header->snap_sizes = NULL;
701 }
Alex Elder849b4262012-07-09 21:04:24 -0500702
Alex Elder34b13182012-07-13 20:35:12 -0500703 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 header->obj_order = ondisk->options.order;
705 header->crypt_type = ondisk->options.crypt_type;
706 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500707
Alex Elder621901d2012-08-23 23:22:06 -0500708 /* Allocate and fill in the snapshot context */
709
Alex Elderf84344f2012-08-31 17:29:51 -0500710 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500711 size = sizeof (struct ceph_snap_context);
712 size += snap_count * sizeof (header->snapc->snaps[0]);
713 header->snapc = kzalloc(size, GFP_KERNEL);
714 if (!header->snapc)
715 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
717 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500718 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500720 for (i = 0; i < snap_count; i++)
721 header->snapc->snaps[i] =
722 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
724 return 0;
725
Alex Elder6a523252012-07-19 17:12:59 -0500726out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500727 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500728 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500730 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500731 kfree(header->object_prefix);
732 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500733
Alex Elder00f1f362012-02-07 12:03:36 -0600734 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735}
736
Alex Elder9e15b772012-10-30 19:40:33 -0500737static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
738{
739 struct rbd_snap *snap;
740
741 if (snap_id == CEPH_NOSNAP)
742 return RBD_SNAP_HEAD_NAME;
743
744 list_for_each_entry(snap, &rbd_dev->snaps, node)
745 if (snap_id == snap->id)
746 return snap->name;
747
748 return NULL;
749}
750
Alex Elder8836b992012-08-30 14:42:15 -0500751static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700752{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753
Alex Eldere86924a2012-07-10 20:30:11 -0500754 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600755
Alex Eldere86924a2012-07-10 20:30:11 -0500756 list_for_each_entry(snap, &rbd_dev->snaps, node) {
757 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500758 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500759 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500760 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600761
Alex Eldere86924a2012-07-10 20:30:11 -0500762 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600763 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700764 }
Alex Eldere86924a2012-07-10 20:30:11 -0500765
Alex Elder00f1f362012-02-07 12:03:36 -0600766 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767}
768
Alex Elder819d52b2012-10-25 23:34:41 -0500769static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770{
Alex Elder78dc4472012-07-19 08:49:18 -0500771 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700772
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500773 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800774 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500775 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500776 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500777 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500778 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500780 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700781 if (ret < 0)
782 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500783 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784 }
Alex Elderd78b6502012-11-09 08:43:15 -0600785 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 return ret;
788}
789
790static void rbd_header_free(struct rbd_image_header *header)
791{
Alex Elder849b4262012-07-09 21:04:24 -0500792 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500793 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700794 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500795 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500796 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500797 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800798 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500799 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700800}
801
Alex Elder98571b52013-01-20 14:44:42 -0600802static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803{
Alex Elder65ccfe22012-08-09 10:33:26 -0700804 char *name;
805 u64 segment;
806 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807
Alex Elder2fd82b92012-11-09 15:05:54 -0600808 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700809 if (!name)
810 return NULL;
811 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600812 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600814 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700815 pr_err("error formatting segment name for #%llu (%d)\n",
816 segment, ret);
817 kfree(name);
818 name = NULL;
819 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820
Alex Elder65ccfe22012-08-09 10:33:26 -0700821 return name;
822}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823
Alex Elder65ccfe22012-08-09 10:33:26 -0700824static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
825{
826 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700827
Alex Elder65ccfe22012-08-09 10:33:26 -0700828 return offset & (segment_size - 1);
829}
830
831static u64 rbd_segment_length(struct rbd_device *rbd_dev,
832 u64 offset, u64 length)
833{
834 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
835
836 offset &= segment_size - 1;
837
Alex Elderaafb2302012-09-06 16:00:54 -0500838 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700839 if (offset + length > segment_size)
840 length = segment_size - offset;
841
842 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700843}
844
845/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700846 * returns the size of an object in the image
847 */
848static u64 rbd_obj_bytes(struct rbd_image_header *header)
849{
850 return 1 << header->obj_order;
851}
852
853/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 * bio helpers
855 */
856
857static void bio_chain_put(struct bio *chain)
858{
859 struct bio *tmp;
860
861 while (chain) {
862 tmp = chain;
863 chain = chain->bi_next;
864 bio_put(tmp);
865 }
866}
867
868/*
869 * zeros a bio chain, starting at specific offset
870 */
871static void zero_bio_chain(struct bio *chain, int start_ofs)
872{
873 struct bio_vec *bv;
874 unsigned long flags;
875 void *buf;
876 int i;
877 int pos = 0;
878
879 while (chain) {
880 bio_for_each_segment(bv, chain, i) {
881 if (pos + bv->bv_len > start_ofs) {
882 int remainder = max(start_ofs - pos, 0);
883 buf = bvec_kmap_irq(bv, &flags);
884 memset(buf + remainder, 0,
885 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200886 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887 }
888 pos += bv->bv_len;
889 }
890
891 chain = chain->bi_next;
892 }
893}
894
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns a new bio covering exactly [offset, offset + len) of
 * bio_src's data, or NULL on invalid arguments or allocation
 * failure.  The clone shares the source's pages (bio_vec entries
 * are copied, not the data), so the source must stay alive while
 * the clone is in flight.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* byte offset into the first cloned vec */
	unsigned short end_idx;
	unsigned short vcnt;	/* number of vecs in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Reject empty or out-of-range requests */
	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* After the loop, resid is the used length of the last vec */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single vec: clamp directly to the requested length */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Source chain ended before len bytes were found */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone at most the remainder of this bio, at most len */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this bio entirely; advance to the next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* Report where the next un-cloned byte lives */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Undo any clones already linked into the partial chain */
	bio_chain_put(chain);

	return NULL;
}
1038
/* Take a reference on an object request. */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	kref_get(&obj_request->kref);
}
1043
static void rbd_obj_request_destroy(struct kref *kref);
/*
 * Drop a reference on an object request; the last put frees it
 * via rbd_obj_request_destroy().
 */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1050
/* Take a reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1055
static void rbd_img_request_destroy(struct kref *kref);
/*
 * Drop a reference on an image request; the last put frees it
 * via rbd_img_request_destroy().
 */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1062
/*
 * Attach an object request to its parent image request.  Takes a
 * reference on the object request for the image request's list;
 * "which" records the object request's position (its index in the
 * image request, assigned from the running count).
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	/* Post-increment: this request gets the current count as its slot */
	obj_request->which = img_request->obj_request_count++;
	rbd_assert(obj_request->which != BAD_WHICH);
}
1072
/*
 * Detach an object request from its parent image request and drop
 * the reference taken by rbd_img_obj_request_add().  The request
 * must currently be attached (which != BAD_WHICH).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);
	obj_request->which = BAD_WHICH;
	list_del(&obj_request->links);
	rbd_assert(obj_request->img_request == img_request);
	obj_request->callback = NULL;
	obj_request->img_request = NULL;
	/* May free obj_request if this was the last reference */
	rbd_obj_request_put(obj_request);
}
1084
1085static bool obj_request_type_valid(enum obj_request_type type)
1086{
1087 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001088 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001089 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001090 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001091 return true;
1092 default:
1093 return false;
1094 }
1095}
1096
/*
 * Allocate and initialize a ceph_osd_req_op for the given opcode.
 * The variadic arguments depend on the opcode (see the per-case
 * comments below).  Returns NULL on allocation failure or on an
 * unsupported opcode; the result must be released with
 * rbd_osd_req_op_destroy().
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only writes carry outbound data */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		/* Payload is class name + method name + input data */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		/* NOTE(review): ver is stored in little-endian form here */
		op->watch.ver = cpu_to_le64(op->watch.ver);
		/* For WATCH, a non-zero flag arg requests a watch (vs unwatch) */
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1159
/* Free an op from rbd_osd_req_op_create() (NULL is a no-op). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1164
/*
 * Send ceph osd request
 *
 * Builds and submits a single-op OSD request against object_name
 * covering [ofs, ofs + len).  Data is carried either by a bio
 * chain or by a page vector.  If rbd_cb is NULL, the call is
 * synchronous: it waits for completion, optionally reports the
 * reassert version through *ver, and drops the request.  If
 * rbd_cb is non-NULL, the callback owns request teardown.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* The request holds its own reference on the bio chain */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = NULL;

	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	/* A watch request must linger so it survives osd restarts */
	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and release the request */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* Drop the bio reference taken above, then the request itself */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	ceph_osdc_put_request(osd_req);

	return ret;
}
1244
/*
 * Completion callback that just drops the request reference;
 * used for fire-and-forget requests whose result is ignored.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
			  struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1250
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector sized for inbound_size, runs the given
 * op synchronously via rbd_do_request(), and for read-type ops
 * copies up to the returned number of bytes into the caller's
 * inbound buffer.  Returns bytes copied (reads) or the request
 * status, negative on error.  The page vector is always released.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL,
			  ver);
	if (ret < 0)
		goto done;

	/* On a successful read, ret is the number of bytes returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1290
/* Hand an object request's prepared OSD request to the osd client. */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1296
1297static void rbd_img_request_complete(struct rbd_img_request *img_request)
1298{
1299 if (img_request->callback)
1300 img_request->callback(img_request);
1301 else
1302 rbd_img_request_put(img_request);
1303}
1304
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/*
 * Block (interruptibly) until the object request completes.
 * Returns 0 on completion or -ERESTARTSYS if interrupted.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	return wait_for_completion_interruptible(&obj_request->completion);
}
1311
/*
 * Per-op callback for ops with no result processing (notify-ack,
 * watch): just mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	atomic_set(&obj_request->done, 1);
}
1317
Alex Elderbf0d5f502012-11-22 00:00:08 -06001318static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1319{
1320 if (obj_request->callback)
1321 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001322 else
1323 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001324}
1325
/*
 * Request sync osd watch
 *
 * Acknowledge a watch notification on the device's header object
 * by sending a NOTIFY_ACK op.  Completion is handled by
 * rbd_simple_req_cb(), which just releases the request.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		return -ENOMEM;

	/* No data payload; the ack targets the header object */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  op,
			  rbd_simple_req_cb, NULL);

	rbd_osd_req_op_destroy(op);

	return ret;
}
1351
/*
 * Synchronous osd object method call
 *
 * Invokes class_name.method_name on the named object, passing
 * outbound/outbound_size as input data and copying up to
 * inbound_size bytes of the reply into inbound.  Optionally
 * returns the object version via *ver.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
			       object_name, 0, inbound_size, inbound,
			       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1390
/*
 * Per-op completion for reads.  A missing object (-ENOENT) reads
 * as all zeroes; a short read is padded with zeroes to the full
 * requested length.  Marks the object request done.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	u64 xferred;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	xferred = le64_to_cpu(op->extent.length);
	rbd_assert(xferred < (u64) UINT_MAX);
	if (obj_request->result == (s32) -ENOENT) {
		/* Nonexistent object: treat as a successful read of zeroes */
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
	} else if (xferred < obj_request->length && !obj_request->result) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(obj_request->bio_list, xferred);
		xferred = obj_request->length;
	}
	obj_request->xferred = xferred;
	atomic_set(&obj_request->done, 1);
}
1412
/*
 * Per-op completion for writes: record the number of bytes
 * written and mark the object request done.
 */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
1419
/*
 * Common OSD request completion callback.  Extracts the result,
 * transferred byte count and version from the reply message,
 * dispatches to the per-opcode handler, and completes the object
 * request if the handler marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/* Attached to an image request iff "which" is a valid index */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	/* Requests are built with exactly one op at this point */
	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* Handlers set "done"; unsupported opcodes leave it unset */
	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1463
/*
 * Allocate and set up a single-op OSD request for an object
 * request.  Writes carry the image request's snapshot context and
 * a modification time; reads carry only a snapshot id.  Data
 * (none, bio chain, or page vector) is wired up according to the
 * object request's type.  Returns NULL on allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1540
/* Release an OSD request created by rbd_osd_req_create(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1545
1546/* object_name is assumed to be a non-null pointer and NUL-terminated */
1547
1548static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1549 u64 offset, u64 length,
1550 enum obj_request_type type)
1551{
1552 struct rbd_obj_request *obj_request;
1553 size_t size;
1554 char *name;
1555
1556 rbd_assert(obj_request_type_valid(type));
1557
1558 size = strlen(object_name) + 1;
1559 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1560 if (!obj_request)
1561 return NULL;
1562
1563 name = (char *)(obj_request + 1);
1564 obj_request->object_name = memcpy(name, object_name, size);
1565 obj_request->offset = offset;
1566 obj_request->length = length;
1567 obj_request->which = BAD_WHICH;
1568 obj_request->type = type;
1569 INIT_LIST_HEAD(&obj_request->links);
1570 atomic_set(&obj_request->done, 0);
Alex Elder788e2df2013-01-17 12:25:27 -06001571 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001572 kref_init(&obj_request->kref);
1573
1574 return obj_request;
1575}
1576
/*
 * kref release callback for an object request.  The request must
 * already have been removed from its image request (img_request
 * cleared, which == BAD_WHICH).  Frees the osd request and any
 * type-specific data before freeing the request itself.
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	/* Release the data (if any) the request carried */
	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	/* The name was allocated together with the request */
	kfree(obj_request);
}
1606
1607/*
1608 * Caller is responsible for filling in the list of object requests
1609 * that comprises the image request, and the Linux request pointer
1610 * (if there is one).
1611 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	/* kmalloc (not kzalloc) is fine: every field is set below */
	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	/*
	 * A write is tagged with the current snapshot context; grab
	 * a reference on it while holding the header semaphore.
	 */
	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	/* snapc and snap_id presumably share storage; only one is set */
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	return img_request;
}
1654
/*
 * kref release callback for an image request.  Unlinks (and drops
 * the image request's reference on) every object request, releases
 * the snapshot context held for writes, then frees the request.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	/* _safe: each del unlinks the entry we are standing on */
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	/* Only writes take a snapc reference (see rbd_img_request_create) */
	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1671
/*
 * Split the data in bio_list into one object request per rbd object
 * (segment) the image request touches, and add each to the image
 * request.  On failure all object requests created so far are
 * released.  Returns 0 on success, -ENOMEM on any failure.
 */
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	unsigned int bio_offset;
	u64 image_offset;
	u64 resid;
	u16 opcode;

	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
					: CEPH_OSD_OP_READ;
	bio_offset = 0;
	image_offset = img_request->offset;
	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	while (resid) {
		const char *object_name;
		unsigned int clone_size;
		struct ceph_osd_req_op *op;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, image_offset);
		if (!object_name)
			goto out_unwind;
		/* offset/length of this request within its object */
		offset = rbd_segment_offset(rbd_dev, image_offset);
		length = rbd_segment_length(rbd_dev, image_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		/* Clone just this object's portion of the bio chain */
		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		/*
		 * Build up the op to use in building the osd
		 * request.  Note that the contents of the op are
		 * copied by rbd_osd_req_create().
		 */
		op = rbd_osd_req_op_create(opcode, offset, length);
		if (!op)
			goto out_partial;
		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
						img_request->write_request,
						obj_request, op);
		rbd_osd_req_op_destroy(op);
		if (!obj_request->osd_req)
			goto out_partial;
		/* status and version are initially zero-filled */

		rbd_img_obj_request_add(img_request, obj_request);

		image_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	/* obj_request is not yet on the image request's list here */
	rbd_obj_request_put(obj_request);
out_unwind:
	/* Drop the requests already added to the image request */
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
1748
/*
 * Completion callback for an object request that is part of an image
 * request.  Object requests may complete in any order, but the block
 * request is ended in order: completion only advances past position
 * next_completion, consuming consecutive done requests, all under
 * completion_lock.  When no more of the block request remains, the
 * whole image request is completed.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* An out-of-order completion is picked up later, in order */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		/* Stop at the first request still in flight */
		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		/* more is false once the block request is fully ended */
		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1795
/*
 * Submit all of an image request's object requests to the osd
 * client.  Returns 0, or the first submission error.  Note that on
 * error, already-submitted requests are left to complete on their
 * own.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1819
Alex Elderb8d70032012-11-30 17:53:04 -06001820static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev,
1821 u64 ver, u64 notify_id)
1822{
1823 struct rbd_obj_request *obj_request;
1824 struct ceph_osd_req_op *op;
1825 struct ceph_osd_client *osdc;
1826 int ret;
1827
1828 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1829 OBJ_REQUEST_NODATA);
1830 if (!obj_request)
1831 return -ENOMEM;
1832
1833 ret = -ENOMEM;
1834 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1835 if (!op)
1836 goto out;
1837 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1838 obj_request, op);
1839 rbd_osd_req_op_destroy(op);
1840 if (!obj_request->osd_req)
1841 goto out;
1842
1843 osdc = &rbd_dev->rbd_client->client->osdc;
1844 ret = rbd_obj_request_submit(osdc, obj_request);
1845 if (!ret)
1846 ret = rbd_obj_request_wait(obj_request);
1847out:
1848 rbd_obj_request_put(obj_request);
1849
1850 return ret;
1851}
1852
1853static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1854{
1855 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1856 u64 hver;
1857 int rc;
1858
1859 if (!rbd_dev)
1860 return;
1861
1862 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1863 rbd_dev->header_name, (unsigned long long) notify_id,
1864 (unsigned int) opcode);
1865 rc = rbd_dev_refresh(rbd_dev, &hver);
1866 if (rc)
1867 rbd_warn(rbd_dev, "got notification but failed to "
1868 " update snaps: %d\n", rc);
1869
1870 (void) rbd_req_sync_notify_ack; /* avoid a warning */
1871 rbd_obj_notify_ack_sync(rbd_dev, hver, notify_id);
1872}
1873
Alex Elder9969ebc2013-01-18 12:31:10 -06001874/*
1875 * Request sync osd watch/unwatch. The value of "start" determines
1876 * whether a watch request is being initiated or torn down.
1877 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	/* Starting: no event/request yet; stopping: both must exist */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	/* "start" doubles as the WATCH op's flag: 1 = watch, 0 = unwatch */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	/* A watch request must linger so the osd client resends it */
	if (start) {
		rbd_dev->watch_request = obj_request->osd_req;
		ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request);
	}
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;

	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	if (start)
		goto done;	/* Done if setting up the watch request */
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
done:
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1939
Alex Elderbf0d5f502012-11-22 00:00:08 -06001940static void rbd_request_fn(struct request_queue *q)
1941{
1942 struct rbd_device *rbd_dev = q->queuedata;
1943 bool read_only = rbd_dev->mapping.read_only;
1944 struct request *rq;
1945 int result;
1946
1947 while ((rq = blk_fetch_request(q))) {
1948 bool write_request = rq_data_dir(rq) == WRITE;
1949 struct rbd_img_request *img_request;
1950 u64 offset;
1951 u64 length;
1952
1953 /* Ignore any non-FS requests that filter through. */
1954
1955 if (rq->cmd_type != REQ_TYPE_FS) {
1956 __blk_end_request_all(rq, 0);
1957 continue;
1958 }
1959
1960 spin_unlock_irq(q->queue_lock);
1961
1962 /* Disallow writes to a read-only device */
1963
1964 if (write_request) {
1965 result = -EROFS;
1966 if (read_only)
1967 goto end_request;
1968 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
1969 }
1970
1971 /* Quit early if the snapshot has disappeared */
1972
1973 if (!atomic_read(&rbd_dev->exists)) {
1974 dout("request for non-existent snapshot");
1975 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1976 result = -ENXIO;
1977 goto end_request;
1978 }
1979
1980 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
1981 length = (u64) blk_rq_bytes(rq);
1982
1983 result = -EINVAL;
1984 if (WARN_ON(offset && length > U64_MAX - offset + 1))
1985 goto end_request; /* Shouldn't happen */
1986
1987 result = -ENOMEM;
1988 img_request = rbd_img_request_create(rbd_dev, offset, length,
1989 write_request);
1990 if (!img_request)
1991 goto end_request;
1992
1993 img_request->rq = rq;
1994
1995 result = rbd_img_request_fill_bio(img_request, rq->bio);
1996 if (!result)
1997 result = rbd_img_request_submit(img_request);
1998 if (result)
1999 rbd_img_request_put(img_request);
2000end_request:
2001 spin_lock_irq(q->queue_lock);
2002 if (result < 0) {
2003 rbd_warn(rbd_dev, "obj_request %s result %d\n",
2004 write_request ? "write" : "read", result);
2005 __blk_end_request_all(rq, result);
2006 }
2007 }
2008}
2009
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002010/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002011 * a queue callback. Makes sure that we don't create a bio that spans across
2012 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05002013 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002014 */
/*
 * Returns how many bytes of bvec may be merged into the bio
 * described by bmd without the bio crossing an rbd object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* Objects are a power-of-two size (1 << obj_order bytes) */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2055
2056static void rbd_free_disk(struct rbd_device *rbd_dev)
2057{
2058 struct gendisk *disk = rbd_dev->disk;
2059
2060 if (!disk)
2061 return;
2062
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002063 if (disk->flags & GENHD_FL_UP)
2064 del_gendisk(disk);
2065 if (disk->queue)
2066 blk_cleanup_queue(disk->queue);
2067 put_disk(disk);
2068}
2069
Alex Elder788e2df2013-01-17 12:25:27 -06002070static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2071 const char *object_name,
2072 u64 offset, u64 length,
2073 char *buf, u64 *version)
2074
2075{
2076 struct ceph_osd_req_op *op;
2077 struct rbd_obj_request *obj_request;
2078 struct ceph_osd_client *osdc;
2079 struct page **pages = NULL;
2080 u32 page_count;
2081 int ret;
2082
2083 page_count = (u32) calc_pages_for(offset, length);
2084 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2085 if (IS_ERR(pages))
2086 ret = PTR_ERR(pages);
2087
2088 ret = -ENOMEM;
2089 obj_request = rbd_obj_request_create(object_name, offset, length,
2090 OBJ_REQUEST_PAGES);
2091 if (!obj_request)
2092 goto out;
2093
2094 obj_request->pages = pages;
2095 obj_request->page_count = page_count;
2096
2097 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2098 if (!op)
2099 goto out;
2100 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2101 obj_request, op);
2102 rbd_osd_req_op_destroy(op);
2103 if (!obj_request->osd_req)
2104 goto out;
2105
2106 osdc = &rbd_dev->rbd_client->client->osdc;
2107 ret = rbd_obj_request_submit(osdc, obj_request);
2108 if (ret)
2109 goto out;
2110 ret = rbd_obj_request_wait(obj_request);
2111 if (ret)
2112 goto out;
2113
2114 ret = obj_request->result;
2115 if (ret < 0)
2116 goto out;
2117 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2118 if (version)
2119 *version = obj_request->version;
2120out:
2121 if (obj_request)
2122 rbd_obj_request_put(obj_request);
2123 else
2124 ceph_release_page_vector(pages, page_count);
2125
2126 return ret;
2127}
2128
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002129/*
Alex Elder4156d992012-08-02 11:29:46 -05002130 * Read the complete header for the given rbd device.
2131 *
2132 * Returns a pointer to a dynamically-allocated buffer containing
2133 * the complete and validated header. Caller can pass the address
2134 * of a variable that will be filled in with the version of the
2135 * header object at the time it was read.
2136 *
2137 * Returns a pointer-coded errno if a failure occurs.
2138 */
2139static struct rbd_image_header_ondisk *
2140rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2141{
2142 struct rbd_image_header_ondisk *ondisk = NULL;
2143 u32 snap_count = 0;
2144 u64 names_size = 0;
2145 u32 want_count;
2146 int ret;
2147
2148 /*
2149 * The complete header will include an array of its 64-bit
2150 * snapshot ids, followed by the names of those snapshots as
2151 * a contiguous block of NUL-terminated strings. Note that
2152 * the number of snapshots could change by the time we read
2153 * it in, in which case we re-read it.
2154 */
2155 do {
2156 size_t size;
2157
2158 kfree(ondisk);
2159
2160 size = sizeof (*ondisk);
2161 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2162 size += names_size;
2163 ondisk = kmalloc(size, GFP_KERNEL);
2164 if (!ondisk)
2165 return ERR_PTR(-ENOMEM);
2166
Alex Elder788e2df2013-01-17 12:25:27 -06002167 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002168 0, size,
2169 (char *) ondisk, version);
2170
2171 if (ret < 0)
2172 goto out_err;
2173 if (WARN_ON((size_t) ret < size)) {
2174 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002175 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2176 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002177 goto out_err;
2178 }
2179 if (!rbd_dev_ondisk_valid(ondisk)) {
2180 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002181 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002182 goto out_err;
2183 }
2184
2185 names_size = le64_to_cpu(ondisk->snap_names_len);
2186 want_count = snap_count;
2187 snap_count = le32_to_cpu(ondisk->snap_count);
2188 } while (snap_count != want_count);
2189
2190 return ondisk;
2191
2192out_err:
2193 kfree(ondisk);
2194
2195 return ERR_PTR(ret);
2196}
2197
2198/*
2199 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002200 */
2201static int rbd_read_header(struct rbd_device *rbd_dev,
2202 struct rbd_image_header *header)
2203{
Alex Elder4156d992012-08-02 11:29:46 -05002204 struct rbd_image_header_ondisk *ondisk;
2205 u64 ver = 0;
2206 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002207
Alex Elder4156d992012-08-02 11:29:46 -05002208 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2209 if (IS_ERR(ondisk))
2210 return PTR_ERR(ondisk);
2211 ret = rbd_header_from_disk(header, ondisk);
2212 if (ret >= 0)
2213 header->obj_version = ver;
2214 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002215
Alex Elder4156d992012-08-02 11:29:46 -05002216 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002217}
2218
Alex Elder41f38c22012-10-25 23:34:40 -05002219static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002220{
2221 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002222 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002223
Alex Eldera0593292012-07-19 09:09:27 -05002224 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002225 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002226}
2227
Alex Elder94785542012-10-09 13:50:17 -07002228static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2229{
2230 sector_t size;
2231
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002232 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002233 return;
2234
2235 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2236 dout("setting size to %llu sectors", (unsigned long long) size);
2237 rbd_dev->mapping.size = (u64) size;
2238 set_capacity(rbd_dev->disk, size);
2239}
2240
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002241/*
2242 * only read the first part of the ondisk header, without the snaps info
2243 */
Alex Elder117973f2012-08-31 17:29:55 -05002244static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002245{
2246 int ret;
2247 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002248
2249 ret = rbd_read_header(rbd_dev, &h);
2250 if (ret < 0)
2251 return ret;
2252
Josh Durgina51aa0c2011-12-05 10:35:04 -08002253 down_write(&rbd_dev->header_rwsem);
2254
Alex Elder94785542012-10-09 13:50:17 -07002255 /* Update image size, and check for resize of mapped image */
2256 rbd_dev->header.image_size = h.image_size;
2257 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07002258
Alex Elder849b4262012-07-09 21:04:24 -05002259 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002260 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05002261 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08002262 /* osd requests may still refer to snapc */
2263 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002264
Alex Elderb8136232012-07-25 09:32:41 -05002265 if (hver)
2266 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08002267 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08002268 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002269 rbd_dev->header.snapc = h.snapc;
2270 rbd_dev->header.snap_names = h.snap_names;
2271 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05002272 /* Free the extra copy of the object prefix */
2273 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2274 kfree(h.object_prefix);
2275
Alex Elder304f6802012-08-31 17:29:52 -05002276 ret = rbd_dev_snaps_update(rbd_dev);
2277 if (!ret)
2278 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002279
Josh Durginc6666012011-11-21 17:11:12 -08002280 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002281
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002282 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002283}
2284
Alex Elder117973f2012-08-31 17:29:55 -05002285static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002286{
2287 int ret;
2288
Alex Elder117973f2012-08-31 17:29:55 -05002289 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002290 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002291 if (rbd_dev->image_format == 1)
2292 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2293 else
2294 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002295 mutex_unlock(&ctl_mutex);
2296
2297 return ret;
2298}
2299
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002300static int rbd_init_disk(struct rbd_device *rbd_dev)
2301{
2302 struct gendisk *disk;
2303 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06002304 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002305
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002306 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002307 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2308 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05002309 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002310
Alex Elderf0f8cef2012-01-29 13:57:44 -06002311 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05002312 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002313 disk->major = rbd_dev->major;
2314 disk->first_minor = 0;
2315 disk->fops = &rbd_bd_ops;
2316 disk->private_data = rbd_dev;
2317
Alex Elderbf0d5f502012-11-22 00:00:08 -06002318 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002319 if (!q)
2320 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07002321
Alex Elder593a9e72012-02-07 12:03:37 -06002322 /* We use the default size, but let's be explicit about it. */
2323 blk_queue_physical_block_size(q, SECTOR_SIZE);
2324
Josh Durgin029bcbd2011-07-22 11:35:23 -07002325 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06002326 segment_size = rbd_obj_bytes(&rbd_dev->header);
2327 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2328 blk_queue_max_segment_size(q, segment_size);
2329 blk_queue_io_min(q, segment_size);
2330 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07002331
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002332 blk_queue_merge_bvec(q, rbd_merge_bvec);
2333 disk->queue = q;
2334
2335 q->queuedata = rbd_dev;
2336
2337 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002338
Alex Elder12f02942012-08-29 17:11:07 -05002339 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2340
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002341 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002342out_disk:
2343 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05002344
2345 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002346}
2347
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002348/*
2349 sysfs
2350*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002351
Alex Elder593a9e72012-02-07 12:03:37 -06002352static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2353{
2354 return container_of(dev, struct rbd_device, dev);
2355}
2356
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002357static ssize_t rbd_size_show(struct device *dev,
2358 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002359{
Alex Elder593a9e72012-02-07 12:03:37 -06002360 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002361 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002362
Josh Durgina51aa0c2011-12-05 10:35:04 -08002363 down_read(&rbd_dev->header_rwsem);
2364 size = get_capacity(rbd_dev->disk);
2365 up_read(&rbd_dev->header_rwsem);
2366
2367 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002368}
2369
Alex Elder34b13182012-07-13 20:35:12 -05002370/*
2371 * Note this shows the features for whatever's mapped, which is not
2372 * necessarily the base image.
2373 */
2374static ssize_t rbd_features_show(struct device *dev,
2375 struct device_attribute *attr, char *buf)
2376{
2377 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2378
2379 return sprintf(buf, "0x%016llx\n",
2380 (unsigned long long) rbd_dev->mapping.features);
2381}
2382
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002383static ssize_t rbd_major_show(struct device *dev,
2384 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002385{
Alex Elder593a9e72012-02-07 12:03:37 -06002386 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002387
2388 return sprintf(buf, "%d\n", rbd_dev->major);
2389}
2390
2391static ssize_t rbd_client_id_show(struct device *dev,
2392 struct device_attribute *attr, char *buf)
2393{
Alex Elder593a9e72012-02-07 12:03:37 -06002394 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002395
Alex Elder1dbb4392012-01-24 10:08:37 -06002396 return sprintf(buf, "client%lld\n",
2397 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002398}
2399
2400static ssize_t rbd_pool_show(struct device *dev,
2401 struct device_attribute *attr, char *buf)
2402{
Alex Elder593a9e72012-02-07 12:03:37 -06002403 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002404
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002405 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002406}
2407
Alex Elder9bb2f332012-07-12 10:46:35 -05002408static ssize_t rbd_pool_id_show(struct device *dev,
2409 struct device_attribute *attr, char *buf)
2410{
2411 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2412
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002413 return sprintf(buf, "%llu\n",
2414 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002415}
2416
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002417static ssize_t rbd_name_show(struct device *dev,
2418 struct device_attribute *attr, char *buf)
2419{
Alex Elder593a9e72012-02-07 12:03:37 -06002420 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002421
Alex Eldera92ffdf2012-10-30 19:40:33 -05002422 if (rbd_dev->spec->image_name)
2423 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2424
2425 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002426}
2427
Alex Elder589d30e2012-07-10 20:30:11 -05002428static ssize_t rbd_image_id_show(struct device *dev,
2429 struct device_attribute *attr, char *buf)
2430{
2431 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2432
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002433 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002434}
2435
/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2448
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;	/* walks forward as each field is emitted */

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	/* image_name may be unresolved for a parent probed only by id */
	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* Total number of bytes written into buf */
	return (ssize_t) (bufp - buf);
}
2491
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002492static ssize_t rbd_image_refresh(struct device *dev,
2493 struct device_attribute *attr,
2494 const char *buf,
2495 size_t size)
2496{
Alex Elder593a9e72012-02-07 12:03:37 -06002497 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002498 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002499
Alex Elder117973f2012-08-31 17:29:55 -05002500 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002501
2502 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002503}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002504
/* Per-device sysfs attributes; all read-only except "refresh". */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing to free here; rbd_device teardown is handled elsewhere. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2550
2551
2552/*
2553 sysfs - snapshots
2554*/
2555
/* sysfs "snap_size" attribute: image size at the time of the snapshot. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2564
/* sysfs "snap_id" attribute: the snapshot's numeric id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2573
/* sysfs "snap_features" attribute: feature bits, as a hex bitmask. */
static ssize_t rbd_snap_features_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
2583
/* Per-snapshot sysfs attributes (all read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release callback for a snapshot device; frees the
 * rbd_snap (and its name) once the last reference is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2615
/* Take a reference on an rbd_spec; returns the spec for convenience. */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2622
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on an rbd_spec; NULL is allowed (no-op). */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2629
/*
 * Allocate a zeroed rbd_spec with an initial reference.  Returns NULL
 * on allocation failure.  Caller releases via rbd_spec_put().
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* Exercise get/put once; marked TEMPORARY upstream — scaffolding */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2643
/* kref release callback: free an rbd_spec and all its owned strings. */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2654
/*
 * Allocate and initialize an rbd_device.  Takes ownership of the
 * caller's references on both rbdc and spec (rbd_dev_destroy() drops
 * them).  Returns NULL on allocation failure.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	/* pool id is validated to fit 32 bits before it reaches here */
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2682
/* Tear down an rbd_device: drop spec/client references and free it. */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2691
/*
 * Whether this snapshot's device has been registered with the driver
 * core.  The device type is only assigned at registration time, so it
 * serves as the flag; the assert checks it stays in sync with the
 * device core's own notion of registration.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/* Both views of "registered" must agree */
	rbd_assert(!ret ^ reg);

	return ret;
}
2701
/*
 * Unlink a snapshot from its device's list and unregister its device
 * (if registered).  device_unregister() drops the reference that ends
 * up freeing the snap via rbd_snap_dev_release().
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2708
/*
 * Register a snapshot's device under the given parent (the rbd
 * device).  The sysfs name is the snapshot name with the standard
 * snapshot prefix.  Returns 0 or a device_register() error.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
2725
/*
 * Allocate and fill in an rbd_snap from the given id/size/features,
 * duplicating snap_name.  Does NOT register a device or add the snap
 * to any list; the caller does that.  Returns the new snap or an
 * ERR_PTR on failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
						const char *snap_name,
						u64 snap_id, u64 snap_size,
						u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	/* kfree(NULL) is fine if kstrdup failed */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2755
/*
 * Return the name, size and features of snapshot `which` for a
 * format 1 image.  The returned name points INTO the header's
 * NUL-separated snap_names blob — it is not allocated and must not
 * be freed by the caller.
 */
static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	*snap_size = rbd_dev->header.snap_sizes[which];
	*snap_features = 0;	/* No features for v1 */

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return snap_name;
}
2774
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" class method reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2807
/* Fill in the base image's object order and size (format 2). */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2814
/*
 * Fetch the object prefix for a format 2 image via the "rbd"
 * "get_object_prefix" class method and store the decoded string in
 * rbd_dev->header.object_prefix (allocated; owned by the header).
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2851
/*
 * Get the feature bits for the given snapshot (or the base image for
 * CEPH_NOSNAP).  Fails with -ENXIO if the image uses any incompatible
 * feature this client does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" class method reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image requiring features we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2885
/* Fill in the base image's feature bits (format 2). */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2891
/*
 * Query a format 2 image's parent (clone) information via the "rbd"
 * "get_parent" class method.  On success, a parent spec (pool id,
 * image id, snap id) and the overlap are recorded in rbd_dev; if the
 * image has no parent (pool id CEPH_NOPOOL) this is not an error.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Maximum encoded size of the reply we accept */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;	/* default error for any short/garbled decode */
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op if ownership transferred */

	return ret;
}
2960
/*
 * Look up the image name for this image's id in the pool's rbd
 * directory object ("dir_get_name").  Returns a newly-allocated name
 * (caller frees) or NULL on any failure — callers treat a missing
 * name as tolerable.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	/* Only called when the name is not already known */
	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the method input: a length-prefixed encoded image id */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is reported as NULL */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
3009
3010/*
3011 * When a parent image gets probed, we only have the pool, image,
3012 * and snapshot ids but not the names of any of them. This call
3013 * is made later to fill in those names. It has to be done after
3014 * rbd_dev_snaps_update() has completed because some of the
3015 * information (in particular, snapshot name) is not available
3016 * until then.
3017 */
3018static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3019{
3020 struct ceph_osd_client *osdc;
3021 const char *name;
3022 void *reply_buf = NULL;
3023 int ret;
3024
3025 if (rbd_dev->spec->pool_name)
3026 return 0; /* Already have the names */
3027
3028 /* Look up the pool name */
3029
3030 osdc = &rbd_dev->rbd_client->client->osdc;
3031 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003032 if (!name) {
3033 rbd_warn(rbd_dev, "there is no pool with id %llu",
3034 rbd_dev->spec->pool_id); /* Really a BUG() */
3035 return -EIO;
3036 }
Alex Elder9e15b772012-10-30 19:40:33 -05003037
3038 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3039 if (!rbd_dev->spec->pool_name)
3040 return -ENOMEM;
3041
3042 /* Fetch the image name; tolerate failure here */
3043
3044 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003045 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003046 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003047 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003048 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003049
3050 /* Look up the snapshot name. */
3051
3052 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3053 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003054 rbd_warn(rbd_dev, "no snapshot with id %llu",
3055 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003056 ret = -EIO;
3057 goto out_err;
3058 }
3059 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3060 if(!rbd_dev->spec->snap_name)
3061 goto out_err;
3062
3063 return 0;
3064out_err:
3065 kfree(reply_buf);
3066 kfree(rbd_dev->spec->pool_name);
3067 rbd_dev->spec->pool_name = NULL;
3068
3069 return ret;
3070}
3071
Alex Elder6e14b1a2012-07-03 16:01:19 -05003072static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003073{
3074 size_t size;
3075 int ret;
3076 void *reply_buf;
3077 void *p;
3078 void *end;
3079 u64 seq;
3080 u32 snap_count;
3081 struct ceph_snap_context *snapc;
3082 u32 i;
3083
3084 /*
3085 * We'll need room for the seq value (maximum snapshot id),
3086 * snapshot count, and array of that many snapshot ids.
3087 * For now we have a fixed upper limit on the number we're
3088 * prepared to receive.
3089 */
3090 size = sizeof (__le64) + sizeof (__le32) +
3091 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3092 reply_buf = kzalloc(size, GFP_KERNEL);
3093 if (!reply_buf)
3094 return -ENOMEM;
3095
3096 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
3097 "rbd", "get_snapcontext",
3098 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003099 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05003100 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3101 if (ret < 0)
3102 goto out;
3103
3104 ret = -ERANGE;
3105 p = reply_buf;
3106 end = (char *) reply_buf + size;
3107 ceph_decode_64_safe(&p, end, seq, out);
3108 ceph_decode_32_safe(&p, end, snap_count, out);
3109
3110 /*
3111 * Make sure the reported number of snapshot ids wouldn't go
3112 * beyond the end of our buffer. But before checking that,
3113 * make sure the computed size of the snapshot context we
3114 * allocate is representable in a size_t.
3115 */
3116 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3117 / sizeof (u64)) {
3118 ret = -EINVAL;
3119 goto out;
3120 }
3121 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3122 goto out;
3123
3124 size = sizeof (struct ceph_snap_context) +
3125 snap_count * sizeof (snapc->snaps[0]);
3126 snapc = kmalloc(size, GFP_KERNEL);
3127 if (!snapc) {
3128 ret = -ENOMEM;
3129 goto out;
3130 }
3131
3132 atomic_set(&snapc->nref, 1);
3133 snapc->seq = seq;
3134 snapc->num_snaps = snap_count;
3135 for (i = 0; i < snap_count; i++)
3136 snapc->snaps[i] = ceph_decode_64(&p);
3137
3138 rbd_dev->header.snapc = snapc;
3139
3140 dout(" snap context seq = %llu, snap_count = %u\n",
3141 (unsigned long long) seq, (unsigned int) snap_count);
3142
3143out:
3144 kfree(reply_buf);
3145
3146 return 0;
3147}
3148
/*
 * Fetch the name of snapshot `which` (an index into the snapshot
 * context) via the "get_snapshot_name" class method.  Returns a
 * newly-allocated string (caller frees) or an ERR_PTR.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
3191
/*
 * Return the name, size and features of snapshot `which` for a
 * format 2 image.  The name is allocated (see rbd_dev_v2_snap_name());
 * returns an ERR_PTR if any of the three queries fails.
 */
static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	u64 snap_id;
	u8 order;	/* object order is fetched but not reported */
	int ret;

	snap_id = rbd_dev->header.snapc->snaps[which];
	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
	if (ret)
		return ERR_PTR(ret);
	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
	if (ret)
		return ERR_PTR(ret);

	return rbd_dev_v2_snap_name(rbd_dev, which);
}
3209
3210static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3211 u64 *snap_size, u64 *snap_features)
3212{
3213 if (rbd_dev->image_format == 1)
3214 return rbd_dev_v1_snap_info(rbd_dev, which,
3215 snap_size, snap_features);
3216 if (rbd_dev->image_format == 2)
3217 return rbd_dev_v2_snap_info(rbd_dev, which,
3218 snap_size, snap_features);
3219 return ERR_PTR(-EINVAL);
3220}
3221
Alex Elder117973f2012-08-31 17:29:55 -05003222static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3223{
3224 int ret;
3225 __u8 obj_order;
3226
3227 down_write(&rbd_dev->header_rwsem);
3228
3229 /* Grab old order first, to see if it changes */
3230
3231 obj_order = rbd_dev->header.obj_order,
3232 ret = rbd_dev_v2_image_size(rbd_dev);
3233 if (ret)
3234 goto out;
3235 if (rbd_dev->header.obj_order != obj_order) {
3236 ret = -EIO;
3237 goto out;
3238 }
3239 rbd_update_mapping_size(rbd_dev);
3240
3241 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3242 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3243 if (ret)
3244 goto out;
3245 ret = rbd_dev_snaps_update(rbd_dev);
3246 dout("rbd_dev_snaps_update returned %d\n", ret);
3247 if (ret)
3248 goto out;
3249 ret = rbd_dev_snaps_register(rbd_dev);
3250 dout("rbd_dev_snaps_register returned %d\n", ret);
3251out:
3252 up_write(&rbd_dev->header_rwsem);
3253
3254 return ret;
3255}
3256
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;	/* position in the (new) snapshot context */

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Merge-walk both sorted sequences until both are exhausted */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			if (rbd_dev->spec->snap_id == snap->id)
				atomic_set(&rbd_dev->exists, 0);
			/*
			 * NOTE(review): rbd_remove_snap_dev() may drop the
			 * last device reference, which frees snap via its
			 * release callback — the snap->id reads in the dout
			 * below would then be use-after-free.  Verify the
			 * reference lifetime here.
			 */
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		/*
		 * NOTE(review): for format 2 images snap_name is freshly
		 * allocated; on the "already present" path below it is
		 * never freed — looks like a leak, confirm against the
		 * v1/v2 snap_info ownership rules.
		 */
		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* An existing snapshot must never change */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
3361
Alex Elder304f6802012-08-31 17:29:52 -05003362/*
3363 * Scan the list of snapshots and register the devices for any that
3364 * have not already been registered.
3365 */
3366static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3367{
3368 struct rbd_snap *snap;
3369 int ret = 0;
3370
3371 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003372 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3373 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003374
3375 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3376 if (!rbd_snap_registered(snap)) {
3377 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3378 if (ret < 0)
3379 break;
3380 }
3381 }
3382 dout("%s: returning %d\n", __func__, ret);
3383
3384 return ret;
3385}
3386
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003387static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3388{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003389 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05003390 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003391
3392 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003393
Alex Eldercd789ab2012-08-30 00:16:38 -05003394 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003395 dev->bus = &rbd_bus_type;
3396 dev->type = &rbd_device_type;
3397 dev->parent = &rbd_root_dev;
3398 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05003399 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003400 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003401
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003402 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05003403
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003404 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003405}
3406
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003407static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3408{
3409 device_unregister(&rbd_dev->dev);
3410}
3411
Alex Eldere2839302012-08-29 17:11:06 -05003412static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003413
3414/*
Alex Elder499afd52012-02-02 08:13:29 -06003415 * Get a unique rbd identifier for the given new rbd_dev, and add
3416 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003417 */
Alex Eldere2839302012-08-29 17:11:06 -05003418static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06003419{
Alex Eldere2839302012-08-29 17:11:06 -05003420 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06003421
3422 spin_lock(&rbd_dev_list_lock);
3423 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3424 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05003425 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3426 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06003427}
Alex Elderb7f23c32012-01-29 13:57:43 -06003428
Alex Elder1ddbe942012-01-29 13:57:44 -06003429/*
Alex Elder499afd52012-02-02 08:13:29 -06003430 * Remove an rbd_dev from the global list, and record that its
3431 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003432 */
Alex Eldere2839302012-08-29 17:11:06 -05003433static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06003434{
Alex Elderd184f6b2012-01-29 13:57:44 -06003435 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05003436 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003437 int max_id;
3438
Alex Elderaafb2302012-09-06 16:00:54 -05003439 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06003440
Alex Eldere2839302012-08-29 17:11:06 -05003441 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3442 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06003443 spin_lock(&rbd_dev_list_lock);
3444 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06003445
3446 /*
3447 * If the id being "put" is not the current maximum, there
3448 * is nothing special we need to do.
3449 */
Alex Eldere2839302012-08-29 17:11:06 -05003450 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06003451 spin_unlock(&rbd_dev_list_lock);
3452 return;
3453 }
3454
3455 /*
3456 * We need to update the current maximum id. Search the
3457 * list to find out what it is. We're more likely to find
3458 * the maximum at the end, so search the list backward.
3459 */
3460 max_id = 0;
3461 list_for_each_prev(tmp, &rbd_dev_list) {
3462 struct rbd_device *rbd_dev;
3463
3464 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07003465 if (rbd_dev->dev_id > max_id)
3466 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003467 }
Alex Elder499afd52012-02-02 08:13:29 -06003468 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06003469
Alex Elder1ddbe942012-01-29 13:57:44 -06003470 /*
Alex Eldere2839302012-08-29 17:11:06 -05003471 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06003472 * which case it now accurately reflects the new maximum.
3473 * Be careful not to overwrite the maximum value in that
3474 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06003475 */
Alex Eldere2839302012-08-29 17:11:06 -05003476 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3477 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06003478}
3479
Alex Eldera725f65e2012-02-02 08:13:30 -06003480/*
Alex Eldere28fff262012-02-02 08:13:30 -06003481 * Skips over white space at *buf, and updates *buf to point to the
3482 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003483 * the token (string of non-white space characters) found. Note
3484 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003485 */
3486static inline size_t next_token(const char **buf)
3487{
3488 /*
3489 * These are the characters that produce nonzero for
3490 * isspace() in the "C" and "POSIX" locales.
3491 */
3492 const char *spaces = " \f\n\r\t\v";
3493
3494 *buf += strspn(*buf, spaces); /* Find start of token */
3495
3496 return strcspn(*buf, spaces); /* Return token length */
3497}
3498
3499/*
3500 * Finds the next token in *buf, and if the provided token buffer is
3501 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003502 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3503 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003504 *
3505 * Returns the length of the token found (not including the '\0').
3506 * Return value will be 0 if no token is found, and it will be >=
3507 * token_size if the token would not fit.
3508 *
Alex Elder593a9e72012-02-07 12:03:37 -06003509 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003510 * found token. Note that this occurs even if the token buffer is
3511 * too small to hold it.
3512 */
3513static inline size_t copy_token(const char **buf,
3514 char *token,
3515 size_t token_size)
3516{
3517 size_t len;
3518
3519 len = next_token(buf);
3520 if (len < token_size) {
3521 memcpy(token, *buf, len);
3522 *(token + len) = '\0';
3523 }
3524 *buf += len;
3525
3526 return len;
3527}
3528
3529/*
Alex Elderea3352f2012-07-09 21:04:23 -05003530 * Finds the next token in *buf, dynamically allocates a buffer big
3531 * enough to hold a copy of it, and copies the token into the new
3532 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3533 * that a duplicate buffer is created even for a zero-length token.
3534 *
3535 * Returns a pointer to the newly-allocated duplicate, or a null
3536 * pointer if memory for the duplicate was not available. If
3537 * the lenp argument is a non-null pointer, the length of the token
3538 * (not including the '\0') is returned in *lenp.
3539 *
3540 * If successful, the *buf pointer will be updated to point beyond
3541 * the end of the found token.
3542 *
3543 * Note: uses GFP_KERNEL for allocation.
3544 */
3545static inline char *dup_token(const char **buf, size_t *lenp)
3546{
3547 char *dup;
3548 size_t len;
3549
3550 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003551 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003552 if (!dup)
3553 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003554 *(dup + len) = '\0';
3555 *buf += len;
3556
3557 if (lenp)
3558 *lenp = len;
3559
3560 return dup;
3561}
3562
3563/*
Alex Elder859c31d2012-10-25 23:34:42 -05003564 * Parse the options provided for an "rbd add" (i.e., rbd image
3565 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3566 * and the data written is passed here via a NUL-terminated buffer.
3567 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003568 *
Alex Elder859c31d2012-10-25 23:34:42 -05003569 * The information extracted from these options is recorded in
3570 * the other parameters which return dynamically-allocated
3571 * structures:
3572 * ceph_opts
3573 * The address of a pointer that will refer to a ceph options
3574 * structure. Caller must release the returned pointer using
3575 * ceph_destroy_options() when it is no longer needed.
3576 * rbd_opts
3577 * Address of an rbd options pointer. Fully initialized by
3578 * this function; caller must release with kfree().
3579 * spec
3580 * Address of an rbd image specification pointer. Fully
3581 * initialized by this function based on parsed options.
3582 * Caller must release with rbd_spec_put().
3583 *
3584 * The options passed take this form:
3585 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3586 * where:
3587 * <mon_addrs>
3588 * A comma-separated list of one or more monitor addresses.
3589 * A monitor address is an ip address, optionally followed
3590 * by a port number (separated by a colon).
3591 * I.e.: ip1[:port1][,ip2[:port2]...]
3592 * <options>
3593 * A comma-separated list of ceph and/or rbd options.
3594 * <pool_name>
3595 * The name of the rados pool containing the rbd image.
3596 * <image_name>
3597 * The name of the image in that pool to map.
3598 * <snap_id>
3599 * An optional snapshot id. If provided, the mapping will
3600 * present data from the image at the time that snapshot was
3601 * created. The image head is used if no snapshot id is
3602 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003603 */
Alex Elder859c31d2012-10-25 23:34:42 -05003604static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05003605 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05003606 struct rbd_options **opts,
3607 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06003608{
Alex Elderd22f76e2012-07-12 10:46:35 -05003609 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05003610 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05003611 const char *mon_addrs;
3612 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05003613 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003614 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05003615 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05003616 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06003617
3618 /* The first four tokens are required */
3619
Alex Elder7ef32142012-02-02 08:13:30 -06003620 len = next_token(&buf);
Alex Elder4fb5d672012-11-01 10:17:15 -05003621 if (!len) {
3622 rbd_warn(NULL, "no monitor address(es) provided");
3623 return -EINVAL;
3624 }
Alex Elder0ddebc02012-10-25 23:34:41 -05003625 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05003626 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06003627 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06003628
Alex Elderdc79b112012-10-25 23:34:41 -05003629 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05003630 options = dup_token(&buf, NULL);
3631 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05003632 return -ENOMEM;
Alex Elder4fb5d672012-11-01 10:17:15 -05003633 if (!*options) {
3634 rbd_warn(NULL, "no options provided");
3635 goto out_err;
3636 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003637
Alex Elder859c31d2012-10-25 23:34:42 -05003638 spec = rbd_spec_alloc();
3639 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05003640 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003641
3642 spec->pool_name = dup_token(&buf, NULL);
3643 if (!spec->pool_name)
3644 goto out_mem;
Alex Elder4fb5d672012-11-01 10:17:15 -05003645 if (!*spec->pool_name) {
3646 rbd_warn(NULL, "no pool name provided");
3647 goto out_err;
3648 }
Alex Eldere28fff262012-02-02 08:13:30 -06003649
Alex Elder69e7a022012-11-01 08:39:26 -05003650 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05003651 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003652 goto out_mem;
Alex Elder4fb5d672012-11-01 10:17:15 -05003653 if (!*spec->image_name) {
3654 rbd_warn(NULL, "no image name provided");
3655 goto out_err;
3656 }
Alex Eldere28fff262012-02-02 08:13:30 -06003657
Alex Elderf28e5652012-10-25 23:34:41 -05003658 /*
3659 * Snapshot name is optional; default is to use "-"
3660 * (indicating the head/no snapshot).
3661 */
Alex Elder3feeb8942012-08-31 17:29:52 -05003662 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05003663 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05003664 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3665 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05003666 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05003667 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05003668 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05003669 }
Alex Elder4caf35f2012-11-01 08:39:27 -05003670 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
Alex Elder859c31d2012-10-25 23:34:42 -05003671 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003672 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003673 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05003674
Alex Elder0ddebc02012-10-25 23:34:41 -05003675 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06003676
Alex Elder4e9afeb2012-10-25 23:34:41 -05003677 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3678 if (!rbd_opts)
3679 goto out_mem;
3680
3681 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05003682
Alex Elder859c31d2012-10-25 23:34:42 -05003683 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05003684 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05003685 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05003686 if (IS_ERR(copts)) {
3687 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05003688 goto out_err;
3689 }
Alex Elder859c31d2012-10-25 23:34:42 -05003690 kfree(options);
3691
3692 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003693 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05003694 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05003695
Alex Elderdc79b112012-10-25 23:34:41 -05003696 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05003697out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05003698 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05003699out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05003700 kfree(rbd_opts);
3701 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05003702 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05003703
Alex Elderdc79b112012-10-25 23:34:41 -05003704 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06003705}
3706
Alex Elder589d30e2012-07-10 20:30:11 -05003707/*
3708 * An rbd format 2 image has a unique identifier, distinct from the
3709 * name given to it by the user. Internally, that identifier is
3710 * what's used to specify the names of objects related to the image.
3711 *
3712 * A special "rbd id" object is used to map an rbd image name to its
3713 * id. If that object doesn't exist, then there is no v2 rbd image
3714 * with the supplied name.
3715 *
3716 * This function will record the given rbd_dev's image_id field if
3717 * it can be determined, and in that case will return 0. If any
3718 * errors occur a negative errno will be returned and the rbd_dev's
3719 * image_id field will be unchanged (and should be NULL).
3720 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the NUL, so this covers the full name */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3782
Alex Eldera30b71b2012-07-10 20:30:11 -05003783static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3784{
3785 int ret;
3786 size_t size;
3787
3788 /* Version 1 images have no id; empty string is used */
3789
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003790 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3791 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003792 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05003793
3794 /* Record the header object name for this rbd image. */
3795
Alex Elder69e7a022012-11-01 08:39:26 -05003796 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003797 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3798 if (!rbd_dev->header_name) {
3799 ret = -ENOMEM;
3800 goto out_err;
3801 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003802 sprintf(rbd_dev->header_name, "%s%s",
3803 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003804
3805 /* Populate rbd image metadata */
3806
3807 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3808 if (ret < 0)
3809 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05003810
3811 /* Version 1 images have no parent (no layering) */
3812
3813 rbd_dev->parent_spec = NULL;
3814 rbd_dev->parent_overlap = 0;
3815
Alex Eldera30b71b2012-07-10 20:30:11 -05003816 rbd_dev->image_format = 1;
3817
3818 dout("discovered version 1 image, header name is %s\n",
3819 rbd_dev->header_name);
3820
3821 return 0;
3822
3823out_err:
3824 kfree(rbd_dev->header_name);
3825 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003826 kfree(rbd_dev->spec->image_id);
3827 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003828
3829 return ret;
3830}
3831
/*
 * Probe a format 2 image: build the "rbd_header.<id>" object name,
 * then fetch size/order, object prefix, features, parent info (if
 * layered), and the snapshot context from the header object.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything set above, in reverse order of acquisition */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3904
Alex Elder83a06262012-10-30 15:47:17 -05003905static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3906{
3907 int ret;
3908
3909 /* no need to lock here, as rbd_dev is not registered yet */
3910 ret = rbd_dev_snaps_update(rbd_dev);
3911 if (ret)
3912 return ret;
3913
Alex Elder9e15b772012-10-30 19:40:33 -05003914 ret = rbd_dev_probe_update_spec(rbd_dev);
3915 if (ret)
3916 goto err_out_snaps;
3917
Alex Elder83a06262012-10-30 15:47:17 -05003918 ret = rbd_dev_set_mapping(rbd_dev);
3919 if (ret)
3920 goto err_out_snaps;
3921
3922 /* generate unique id: find highest unique id, add one */
3923 rbd_dev_id_get(rbd_dev);
3924
3925 /* Fill in the device name, now that we have its id. */
3926 BUILD_BUG_ON(DEV_NAME_LEN
3927 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3928 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3929
3930 /* Get our block major device number. */
3931
3932 ret = register_blkdev(0, rbd_dev->name);
3933 if (ret < 0)
3934 goto err_out_id;
3935 rbd_dev->major = ret;
3936
3937 /* Set up the blkdev mapping. */
3938
3939 ret = rbd_init_disk(rbd_dev);
3940 if (ret)
3941 goto err_out_blkdev;
3942
3943 ret = rbd_bus_add_dev(rbd_dev);
3944 if (ret)
3945 goto err_out_disk;
3946
3947 /*
3948 * At this point cleanup in the event of an error is the job
3949 * of the sysfs code (initiated by rbd_bus_del_dev()).
3950 */
3951 down_write(&rbd_dev->header_rwsem);
3952 ret = rbd_dev_snaps_register(rbd_dev);
3953 up_write(&rbd_dev->header_rwsem);
3954 if (ret)
3955 goto err_out_bus;
3956
Alex Elder9969ebc2013-01-18 12:31:10 -06003957 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
Alex Elder83a06262012-10-30 15:47:17 -05003958 if (ret)
3959 goto err_out_bus;
3960
3961 /* Everything's ready. Announce the disk to the world. */
3962
3963 add_disk(rbd_dev->disk);
3964
3965 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3966 (unsigned long long) rbd_dev->mapping.size);
3967
3968 return ret;
3969err_out_bus:
3970 /* this will also clean up rest of rbd_dev stuff */
3971
3972 rbd_bus_del_dev(rbd_dev);
3973
3974 return ret;
3975err_out_disk:
3976 rbd_free_disk(rbd_dev);
3977err_out_blkdev:
3978 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3979err_out_id:
3980 rbd_dev_id_put(rbd_dev);
3981err_out_snaps:
3982 rbd_remove_all_snaps(rbd_dev);
3983
3984 return ret;
3985}
3986
Alex Eldera30b71b2012-07-10 20:30:11 -05003987/*
3988 * Probe for the existence of the header object for the given rbd
3989 * device. For format 2 images this includes determining the image
3990 * id.
3991 */
3992static int rbd_dev_probe(struct rbd_device *rbd_dev)
3993{
3994 int ret;
3995
3996 /*
3997 * Get the id from the image id object. If it's not a
3998 * format 2 image, we'll get ENOENT back, and we'll assume
3999 * it's a format 1 image.
4000 */
4001 ret = rbd_dev_image_id(rbd_dev);
4002 if (ret)
4003 ret = rbd_dev_v1_probe(rbd_dev);
4004 else
4005 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004006 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05004007 dout("probe failed, returning %d\n", ret);
4008
Alex Elder83a06262012-10-30 15:47:17 -05004009 return ret;
4010 }
4011
4012 ret = rbd_dev_probe_finish(rbd_dev);
4013 if (ret)
4014 rbd_header_free(&rbd_dev->header);
4015
Alex Eldera30b71b2012-07-10 20:30:11 -05004016 return ret;
4017}
4018
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004019static ssize_t rbd_add(struct bus_type *bus,
4020 const char *buf,
4021 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004022{
Alex Eldercb8627c2012-07-09 21:04:23 -05004023 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004024 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004025 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004026 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004027 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004028 struct ceph_osd_client *osdc;
4029 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004030
4031 if (!try_module_get(THIS_MODULE))
4032 return -ENODEV;
4033
Alex Eldera725f65e2012-02-02 08:13:30 -06004034 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004035 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004036 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004037 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004038
Alex Elder9d3997f2012-10-25 23:34:42 -05004039 rbdc = rbd_get_client(ceph_opts);
4040 if (IS_ERR(rbdc)) {
4041 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004042 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004043 }
Alex Elderc53d5892012-10-25 23:34:42 -05004044 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004045
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004046 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004047 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004048 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004049 if (rc < 0)
4050 goto err_out_client;
Alex Elder859c31d2012-10-25 23:34:42 -05004051 spec->pool_id = (u64) rc;
4052
Alex Elder0903e872012-11-14 12:25:19 -06004053 /* The ceph file layout needs to fit pool id in 32 bits */
4054
4055 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4056 rc = -EIO;
4057 goto err_out_client;
4058 }
4059
Alex Elderc53d5892012-10-25 23:34:42 -05004060 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004061 if (!rbd_dev)
4062 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004063 rbdc = NULL; /* rbd_dev now owns this */
4064 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004065
Alex Elderbd4ba652012-10-25 23:34:42 -05004066 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004067 kfree(rbd_opts);
4068 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004069
Alex Eldera30b71b2012-07-10 20:30:11 -05004070 rc = rbd_dev_probe(rbd_dev);
4071 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004072 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004073
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004074 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05004075err_out_rbd_dev:
4076 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004077err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004078 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004079err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004080 if (ceph_opts)
4081 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004082 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004083 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004084err_out_module:
4085 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004086
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004087 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004088
4089 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004090}
4091
Alex Elderde71a292012-07-03 16:01:19 -05004092static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004093{
4094 struct list_head *tmp;
4095 struct rbd_device *rbd_dev;
4096
Alex Eldere124a822012-01-29 13:57:44 -06004097 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004098 list_for_each(tmp, &rbd_dev_list) {
4099 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004100 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06004101 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004102 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06004103 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004104 }
Alex Eldere124a822012-01-29 13:57:44 -06004105 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004106 return NULL;
4107}
4108
/*
 * Release callback for an rbd device's embedded struct device.
 * The driver core invokes this when the last reference to the device
 * is dropped (after rbd_bus_del_dev()).  Teardown order mirrors setup:
 * osd watch, block device, header fields, device id, rbd_dev itself,
 * and finally the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request registered on the header object */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* NOTE(review): second argument 0 presumably unregisters the
	 * header watch (teardown context) — confirm against
	 * rbd_dev_header_watch_sync()'s definition */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
4137
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004138static ssize_t rbd_remove(struct bus_type *bus,
4139 const char *buf,
4140 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004141{
4142 struct rbd_device *rbd_dev = NULL;
4143 int target_id, rc;
4144 unsigned long ul;
4145 int ret = count;
4146
4147 rc = strict_strtoul(buf, 10, &ul);
4148 if (rc)
4149 return rc;
4150
4151 /* convert to int; abort if we lost anything in the conversion */
4152 target_id = (int) ul;
4153 if (target_id != ul)
4154 return -EINVAL;
4155
4156 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4157
4158 rbd_dev = __rbd_get_dev(target_id);
4159 if (!rbd_dev) {
4160 ret = -ENOENT;
4161 goto done;
4162 }
4163
Alex Elder42382b72012-11-16 09:29:16 -06004164 if (rbd_dev->open_count) {
4165 ret = -EBUSY;
4166 goto done;
4167 }
4168
Alex Elder41f38c22012-10-25 23:34:40 -05004169 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004170 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004171
4172done:
4173 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004174
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004175 return ret;
4176}
4177
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004178/*
4179 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004180 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004181 */
4182static int rbd_sysfs_init(void)
4183{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004184 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004185
Alex Elderfed4c142012-02-07 12:03:36 -06004186 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004187 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004188 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004189
Alex Elderfed4c142012-02-07 12:03:36 -06004190 ret = bus_register(&rbd_bus_type);
4191 if (ret < 0)
4192 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004193
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004194 return ret;
4195}
4196
/*
 * Tear down the sysfs interface created by rbd_sysfs_init(), in the
 * reverse order of registration: bus first, then the root device.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4202
4203int __init rbd_init(void)
4204{
4205 int rc;
4206
4207 rc = rbd_sysfs_init();
4208 if (rc)
4209 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004210 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004211 return 0;
4212}
4213
/*
 * Module exit point: remove the sysfs control interface.  By the time
 * the module can be unloaded, every mapped device has already dropped
 * its module reference via rbd_dev_release().
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4218
/* Module entry/exit hookup and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");