blob: d10649f2e346be5043e3b9e8668ffec44f7833bf [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere */

#define	U8_MAX	((u8) (~0U))
#define	U16_MAX	((u16) (~0U))
#define	U32_MAX	((u32) (~0U))
#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Snapshot sysfs device names are "snap_<name>"; bound name length to fit */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

/* Mapping name used for the image head (i.e., no snapshot) */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)	/* no optional features supported yet */

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names (owned) */
	u64 features;		/* RBD_FEATURE_* bits (0 for v1 images) */
	__u8 obj_order;		/* object size shift; bounds checked in
				 * rbd_dev_ondisk_valid() */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* buffer of snapshot names (owned) */
	u64 *snap_sizes;	/* one entry per snapshot (owned) */

	u64 obj_version;	/* header object version last read */
};
117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;	/* may be NULL (see comment above) */

	u64		snap_id;	/* CEPH_NOSNAP when head is mapped */
	char		*snap_name;

	struct kref	kref;		/* may be shared by parent and child */
};
155
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* released via rbd_client_release() */
	struct list_head	node;	/* entry on rbd_client_list */
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* How an object request's data payload is carried (see union below) */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
/*
 * State for a single osd object request; one or more of these make up
 * an rbd_img_request.  The payload representation depends on "type".
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */

	struct rbd_img_request	*img_request;
	struct list_head	links;		/* img_request->obj_requests */
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;	/* OBJ_REQUEST_BIO */
		struct {			/* OBJ_REQUEST_PAGES */
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	s32			result;
	atomic_t		done;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
207
/*
 * One image-level I/O request, fanned out into a list of object
 * requests (iterated with the for_each_obj_request*() macros below).
 */
struct rbd_img_request {
	struct request		*rq;
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	bool			write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64		snap_id;		/* for reads */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &ireq->obj_requests, links)
/* Safe against removal; note this one walks the list in reverse */
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
234
/* In-core record of one snapshot of an image (also a sysfs device) */
struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;		/* entry on rbd_dev->snaps */
	u64			id;
	u64			features;
};

/* What is currently mapped: the image head, or one snapshot of it */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
249
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	atomic_t		exists;		/* set in rbd_dev_set_mapping() */
	struct rbd_spec		*spec;		/* identity of mapped image */

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request	*watch_request;

	struct rbd_spec		*parent_spec;	/* non-NULL for layered child */
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by ctl_mutex */
};
294
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for routines defined later in this file */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* "add" and "remove" are write-only (root) bus attributes */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* No-op release: rbd_root_dev is static, there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices registered in sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
333
/*
 * Emit a warning, identifying the device as specifically as possible:
 * by disk name, then image name, then image id, and as a last resort
 * by the rbd_device pointer itself.  @rbd_dev may be NULL.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	/* Fallback cascade: most specific identification first */
	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
360
#ifdef RBD_DEBUG
/*
 * BUG() if @expr is false.  Wrapped in do { } while (0) so the macro
 * behaves as a single statement: the previous bare-if expansion was
 * unsafe in an unbraced if/else (dangling-else hazard).
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
					"\trbd_assert(%s);\n\n",	\
							__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800373
/* Forward declarations: header refresh for format 1 and format 2 images */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376
/*
 * Block device open method.  Refuses a writable open of a read-only
 * mapping; otherwise takes a device reference and bumps open_count
 * under ctl_mutex (so a concurrent remove can see the device is busy).
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
392
/*
 * Block device release method; undoes rbd_open(): drops open_count
 * under ctl_mutex and releases the device reference taken on open.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
405
/* Block device operations: rbd devices support only open/release here */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
411
412/*
413 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500414 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415 */
Alex Elderf8c38922012-08-10 13:12:07 -0700416static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417{
418 struct rbd_client *rbdc;
419 int ret = -ENOMEM;
420
421 dout("rbd_client_create\n");
422 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
423 if (!rbdc)
424 goto out_opt;
425
426 kref_init(&rbdc->kref);
427 INIT_LIST_HEAD(&rbdc->node);
428
Alex Elderbc534d862012-01-29 13:57:44 -0600429 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
430
Alex Elder43ae4702012-07-03 16:01:18 -0500431 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600433 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500434 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700435
436 ret = ceph_open_session(rbdc->client);
437 if (ret < 0)
438 goto out_err;
439
Alex Elder432b8582012-01-29 13:57:44 -0600440 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600442 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443
Alex Elderbc534d862012-01-29 13:57:44 -0600444 mutex_unlock(&ctl_mutex);
445
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700446 dout("rbd_client_create created %p\n", rbdc);
447 return rbdc;
448
449out_err:
450 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600451out_mutex:
452 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kfree(rbdc);
454out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500455 if (ceph_opts)
456 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400457 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458}
459
460/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 * Find a ceph client with specific addr and configuration. If
462 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700464static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465{
466 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700467 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elder43ae4702012-07-03 16:01:18 -0500469 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700470 return NULL;
471
Alex Elder1f7ba332012-08-10 13:12:07 -0700472 spin_lock(&rbd_client_list_lock);
473 list_for_each_entry(client_node, &rbd_client_list, node) {
474 if (!ceph_compare_options(ceph_opts, client_node->client)) {
475 kref_get(&client_node->kref);
476 found = true;
477 break;
478 }
479 }
480 spin_unlock(&rbd_client_list_lock);
481
482 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483}
484
/*
 * mount options
 *
 * The enum below is ordered so that token kinds can be classified by
 * range: ints before Opt_last_int, strings before Opt_last_string,
 * Booleans before Opt_last_bool (see parse_rbd_opts_token()).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}			/* terminator */
};

/* Options affecting a single mapping, parsed from the "add" string */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
515
/*
 * Parse one mount-option token.  @private is the struct rbd_options
 * being filled in (callback for ceph_parse_options()).  Returns 0 on
 * success or a negative errno for an unknown token or bad int arg.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify the token by its position in the enum ranges */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
556
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes *ceph_opts either way: an existing
 * client keeps its own copy, a new client takes ownership.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Sharing an existing client; the options are no longer needed */
	ceph_destroy_options(ceph_opts);
	return rbdc;
}
573
/*
 * Destroy ceph client.  Invoked via kref_put() when the last reference
 * drops.  Takes rbd_client_list_lock itself to unlink the client, so
 * the caller must NOT already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
591
592/*
593 * Drop reference to ceph client node. If it's not referenced anymore, release
594 * it.
595 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500596static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597{
Alex Elderc53d5892012-10-25 23:34:42 -0500598 if (rbdc)
599 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600}
601
Alex Eldera30b71b2012-07-10 20:30:11 -0500602static bool rbd_image_format_valid(u32 image_format)
603{
604 return image_format == 1 || image_format == 2;
605}
606
/*
 * Sanity-check an on-disk (format 1) image header: magic text, object
 * order within usable bounds, and snapshot counts/name lengths that
 * keep the in-core snapshot header representable in a size_t.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
645
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646/*
647 * Create a new header structure, translate header format from the on-disk
648 * header.
649 */
650static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500651 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652{
Alex Elderccece232012-07-10 20:30:10 -0500653 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500654 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500655 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500656 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657
Alex Elder6a523252012-07-19 17:12:59 -0500658 memset(header, 0, sizeof (*header));
659
Alex Elder103a1502012-08-02 11:29:45 -0500660 snap_count = le32_to_cpu(ondisk->snap_count);
661
Alex Elder58c17b02012-08-23 23:22:06 -0500662 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
663 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500664 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500666 memcpy(header->object_prefix, ondisk->object_prefix, len);
667 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600668
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500670 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
671
Alex Elder621901d2012-08-23 23:22:06 -0500672 /* Save a copy of the snapshot names */
673
Alex Elderf785cc12012-08-23 23:22:06 -0500674 if (snap_names_len > (u64) SIZE_MAX)
675 return -EIO;
676 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500678 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500679 /*
680 * Note that rbd_dev_v1_header_read() guarantees
681 * the ondisk buffer we're working with has
682 * snap_names_len bytes beyond the end of the
683 * snapshot id array, this memcpy() is safe.
684 */
685 memcpy(header->snap_names, &ondisk->snaps[snap_count],
686 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500687
Alex Elder621901d2012-08-23 23:22:06 -0500688 /* Record each snapshot's size */
689
Alex Elderd2bb24e2012-07-26 23:37:14 -0500690 size = snap_count * sizeof (*header->snap_sizes);
691 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500693 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500694 for (i = 0; i < snap_count; i++)
695 header->snap_sizes[i] =
696 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 } else {
Alex Elderccece232012-07-10 20:30:10 -0500698 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 header->snap_names = NULL;
700 header->snap_sizes = NULL;
701 }
Alex Elder849b4262012-07-09 21:04:24 -0500702
Alex Elder34b13182012-07-13 20:35:12 -0500703 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 header->obj_order = ondisk->options.order;
705 header->crypt_type = ondisk->options.crypt_type;
706 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500707
Alex Elder621901d2012-08-23 23:22:06 -0500708 /* Allocate and fill in the snapshot context */
709
Alex Elderf84344f2012-08-31 17:29:51 -0500710 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500711 size = sizeof (struct ceph_snap_context);
712 size += snap_count * sizeof (header->snapc->snaps[0]);
713 header->snapc = kzalloc(size, GFP_KERNEL);
714 if (!header->snapc)
715 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
717 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500718 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500720 for (i = 0; i < snap_count; i++)
721 header->snapc->snaps[i] =
722 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
724 return 0;
725
Alex Elder6a523252012-07-19 17:12:59 -0500726out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500727 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500728 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500730 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500731 kfree(header->object_prefix);
732 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500733
Alex Elder00f1f362012-02-07 12:03:36 -0600734 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735}
736
Alex Elder9e15b772012-10-30 19:40:33 -0500737static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
738{
739 struct rbd_snap *snap;
740
741 if (snap_id == CEPH_NOSNAP)
742 return RBD_SNAP_HEAD_NAME;
743
744 list_for_each_entry(snap, &rbd_dev->snaps, node)
745 if (snap_id == snap->id)
746 return snap->name;
747
748 return NULL;
749}
750
Alex Elder8836b992012-08-30 14:42:15 -0500751static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700752{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753
Alex Eldere86924a2012-07-10 20:30:11 -0500754 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600755
Alex Eldere86924a2012-07-10 20:30:11 -0500756 list_for_each_entry(snap, &rbd_dev->snaps, node) {
757 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500758 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500759 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500760 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600761
Alex Eldere86924a2012-07-10 20:30:11 -0500762 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600763 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700764 }
Alex Eldere86924a2012-07-10 20:30:11 -0500765
Alex Elder00f1f362012-02-07 12:03:36 -0600766 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767}
768
Alex Elder819d52b2012-10-25 23:34:41 -0500769static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770{
Alex Elder78dc4472012-07-19 08:49:18 -0500771 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700772
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500773 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800774 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500775 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500776 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500777 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500778 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500780 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700781 if (ret < 0)
782 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500783 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784 }
Alex Elderd78b6502012-11-09 08:43:15 -0600785 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 return ret;
788}
789
790static void rbd_header_free(struct rbd_image_header *header)
791{
Alex Elder849b4262012-07-09 21:04:24 -0500792 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500793 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700794 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500795 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500796 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500797 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800798 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500799 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700800}
801
Alex Elder98571b52013-01-20 14:44:42 -0600802static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803{
Alex Elder65ccfe22012-08-09 10:33:26 -0700804 char *name;
805 u64 segment;
806 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807
Alex Elder2fd82b92012-11-09 15:05:54 -0600808 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700809 if (!name)
810 return NULL;
811 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600812 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600814 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700815 pr_err("error formatting segment name for #%llu (%d)\n",
816 segment, ret);
817 kfree(name);
818 name = NULL;
819 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820
Alex Elder65ccfe22012-08-09 10:33:26 -0700821 return name;
822}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823
Alex Elder65ccfe22012-08-09 10:33:26 -0700824static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
825{
826 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700827
Alex Elder65ccfe22012-08-09 10:33:26 -0700828 return offset & (segment_size - 1);
829}
830
831static u64 rbd_segment_length(struct rbd_device *rbd_dev,
832 u64 offset, u64 length)
833{
834 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
835
836 offset &= segment_size - 1;
837
Alex Elderaafb2302012-09-06 16:00:54 -0500838 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700839 if (offset + length > segment_size)
840 length = segment_size - offset;
841
842 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700843}
844
845/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700846 * returns the size of an object in the image
847 */
848static u64 rbd_obj_bytes(struct rbd_image_header *header)
849{
850 return 1 << header->obj_order;
851}
852
853/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 * bio helpers
855 */
856
857static void bio_chain_put(struct bio *chain)
858{
859 struct bio *tmp;
860
861 while (chain) {
862 tmp = chain;
863 chain = chain->bi_next;
864 bio_put(tmp);
865 }
866}
867
868/*
869 * zeros a bio chain, starting at specific offset
870 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * This segment extends past start_ofs.
				 * Zero it from start_ofs onward (or from
				 * its beginning if we're already past).
				 * bvec_kmap_irq() gives us an atomic
				 * mapping of the segment's page.
				 */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
894
895/*
Alex Elderf7760da2012-10-20 22:17:27 -0500896 * Clone a portion of a bio, starting at the given byte offset
897 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700898 */
/*
 * Returns a clone covering [offset, offset + len) of bio_src, or
 * NULL on error.  The clone shares pages with the source bio; only
 * the bio_vec entries (and the first/last entry's offset/length) are
 * adjusted.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;		/* bytes to skip before the clone starts */
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;		/* clone's byte offset into segment idx */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the number of bytes used in the final segment */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: it carries exactly len bytes */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
Alex Elderf7760da2012-10-20 22:17:27 -0500976/*
977 * Clone a portion of a bio chain, starting at the given byte offset
978 * into the first bio in the source chain and continuing for the
979 * number of bytes indicated. The result is another bio chain of
980 * exactly the given length, or a null pointer on error.
981 *
982 * The bio_src and offset parameters are both in-out. On entry they
983 * refer to the first source bio and the offset into that bio where
984 * the start of data to be cloned is located.
985 *
986 * On return, bio_src is updated to refer to the bio in the source
987 * chain that contains first un-cloned byte, and *offset will
988 * contain the offset of that byte within that bio.
989 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Source chain too short for the requested length */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone as much of this bio as we still need */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio; advance to the next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* Report back where the first un-cloned byte lives */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1038
/* Take a reference on an object request. */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	kref_get(&obj_request->kref);
}
1043
static void rbd_obj_request_destroy(struct kref *kref);
/*
 * Drop a reference on an object request; the final put frees it via
 * rbd_obj_request_destroy().
 */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1050
/* Take a reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1055
static void rbd_img_request_destroy(struct kref *kref);
/*
 * Drop a reference on an image request; the final put frees it via
 * rbd_img_request_destroy().
 */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1062
/*
 * Add an object request to the tail of its image request's list,
 * recording its position ("which") within that list.  The image
 * request holds a reference on each object request it tracks; that
 * reference is taken here and dropped in rbd_img_obj_request_del().
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	obj_request->which = img_request->obj_request_count++;
	rbd_assert(obj_request->which != BAD_WHICH);
}
1072
/*
 * Unlink an object request from its image request and drop the
 * reference the image request held on it.  The request must
 * currently be on img_request's list (which != BAD_WHICH).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);
	obj_request->which = BAD_WHICH;
	list_del(&obj_request->links);
	rbd_assert(obj_request->img_request == img_request);
	obj_request->callback = NULL;
	obj_request->img_request = NULL;
	rbd_obj_request_put(obj_request);	/* list's reference */
}
1084
1085static bool obj_request_type_valid(enum obj_request_type type)
1086{
1087 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001088 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001089 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001090 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001091 return true;
1092 default:
1093 return false;
1094 }
1095}
1096
/*
 * Allocate and fill in an osd request op for the given opcode.  The
 * variable arguments depend on the opcode; the expected calling
 * forms are shown in the per-case comments below.  Returns NULL on
 * allocation failure or unsupported opcode.  Free the result with
 * rbd_osd_req_op_destroy().
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only writes carry outgoing data */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		/* payload = class name + method name + input data */
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		op->watch.ver = cpu_to_le64(op->watch.ver);
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1159
/* Free an op allocated by rbd_osd_req_op_create(). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1164
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001165/*
1166 * Send ceph osd request
1167 */
/*
 * Build and submit a single-op osd request for the given object.
 * Data may be carried by a bio chain (bio) or a page vector (pages).
 * With a completion callback (rbd_cb) the call is asynchronous and
 * returns after submission; without one it waits for the reply,
 * optionally returns the reassert version via *ver, and drops the
 * request itself.  Returns 0 or a negative errno.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* The request takes its own reference on the bio chain */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = NULL;

	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout; /* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		/*
		 * A watch registration must persist; mark it lingering
		 * and remember it so it can be torn down later.
		 */
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and release the request */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* Undo the bio_get() above before dropping the request */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	ceph_osdc_put_request(osd_req);

	return ret;
}
1244
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001245/*
1246 * Do a synchronous ceph osd operation
1247 */
/*
 * Do a synchronous osd operation on the named object.  A page vector
 * sized for inbound_size is allocated to receive reply data; for
 * read operations the result is copied into "inbound" (when
 * non-NULL).  Returns the number of bytes on a successful read, 0
 * for other successful ops, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL,		/* no callback: synchronous */
			  ver);
	if (ret < 0)
		goto done;

	/* For reads, ret is the number of bytes the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1284
/*
 * Hand an object request's prepared osd request to the osd client
 * for (asynchronous) submission.
 */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1290
1291static void rbd_img_request_complete(struct rbd_img_request *img_request)
1292{
1293 if (img_request->callback)
1294 img_request->callback(img_request);
1295 else
1296 rbd_img_request_put(img_request);
1297}
1298
Alex Elder788e2df2013-01-17 12:25:27 -06001299/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1300
/*
 * Block until the object request completes.  Returns 0 on
 * completion or -ERESTARTSYS if interrupted by a signal.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	return wait_for_completion_interruptible(&obj_request->completion);
}
1305
/*
 * Per-op completion handler for ops that need no reply processing
 * beyond marking the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
					struct ceph_osd_op *op)
{
	atomic_set(&obj_request->done, 1);
}
1311
/*
 * Complete an object request: invoke its callback if one is set,
 * otherwise wake anyone blocked in rbd_obj_request_wait().
 */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
1319
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001320/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001321 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001322 */
/*
 * Synchronously invoke an osd class method on the named object.
 * "outbound" carries the method's input data; the reply (up to
 * inbound_size bytes) is copied into "inbound".  Returns bytes
 * received on success or a negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
			       object_name, 0, inbound_size, inbound,
			       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1358
/*
 * Handle completion of an osd READ op.  A read of a nonexistent
 * object (-ENOENT) is treated as a successful read of zeros; a short
 * read has its unread tail zero-filled and is reported as a
 * full-length transfer.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	u64 xferred;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	xferred = le64_to_cpu(op->extent.length);
	rbd_assert(xferred < (u64) UINT_MAX);
	if (obj_request->result == (s32) -ENOENT) {
		/* Object doesn't exist: caller sees all zeros, success */
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
	} else if (xferred < obj_request->length && !obj_request->result) {
		/* Short read: zero everything past the bytes received */
		zero_bio_chain(obj_request->bio_list, xferred);
		xferred = obj_request->length;
	}
	obj_request->xferred = xferred;
	atomic_set(&obj_request->done, 1);
}
1380
/*
 * Handle completion of an osd WRITE op: record the number of bytes
 * transferred and mark the request done.
 */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
1387
/*
 * Completion callback for rbd osd requests.  Decodes the reply
 * header, dispatches to the per-opcode handler, and completes the
 * object request if the handler marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/*
	 * An object request either belongs to an image request (and
	 * then has a valid "which") or is standalone (BAD_WHICH) --
	 * exactly one of the two.
	 */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1); /* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* Handlers set "done"; some may legitimately leave it clear */
	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1432
/*
 * Allocate and initialize a single-op osd request for the given
 * object request.  For requests that are part of an image request,
 * the snapshot context (writes) or snapshot id (reads) comes from
 * that image request.  Returns NULL on allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	/* Attach the request's data, according to its type */

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout; /* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1509
/* Release the reference taken when the osd request was created. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1514
1515/* object_name is assumed to be a non-null pointer and NUL-terminated */
1516
/*
 * Allocate a new object request covering [offset, offset + length)
 * of the named object.  The object name is copied into space
 * allocated just past the request structure itself, so a single
 * kfree() releases everything.  Returns NULL on allocation failure.
 *
 * NOTE(review): this allocates with GFP_KERNEL while
 * rbd_segment_name() uses GFP_NOIO -- if this can be reached on the
 * I/O path, GFP_NOIO may be needed here too; confirm against callers.
 */
static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->which = BAD_WHICH;	/* not on any image request yet */
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	atomic_set(&obj_request->done, 0);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	return obj_request;
}
1545
/*
 * Free an object request.  Called only via kref_put() when the last
 * reference is dropped; by then the request must already have been
 * removed from any image request's list.
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	/* Release the request's data, according to its type */

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	/* The object name was allocated along with the request itself */
	kfree(obj_request);
}
1575
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	/* GFP_ATOMIC: may be called from the block request function */
	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		/*
		 * Writes must be tagged with the snapshot context in
		 * effect when they are issued; grab a reference under
		 * the header lock so a concurrent refresh can't swap
		 * it out from under us.
		 */
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	return img_request;
}
1623
/*
 * kref release callback for an image request.  Detaches and drops
 * every remaining object request, releases the snapshot context held
 * for writes, and frees the request.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1640
/*
 * Split the image request's byte range into per-object requests,
 * cloning the matching slice of the supplied bio chain into each.
 * On success every object request is linked into the image request
 * and 0 is returned.  On failure all object requests created so far
 * are torn down and -ENOMEM is returned.
 */
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	unsigned int bio_offset;
	u64 image_offset;
	u64 resid;
	u16 opcode;

	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
					      : CEPH_OSD_OP_READ;
	bio_offset = 0;
	image_offset = img_request->offset;
	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	while (resid) {
		const char *object_name;
		unsigned int clone_size;
		struct ceph_osd_req_op *op;
		u64 offset;
		u64 length;

		/* Map the image offset onto an object and a range within it */
		object_name = rbd_segment_name(rbd_dev, image_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, image_offset);
		length = rbd_segment_length(rbd_dev, image_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		/*
		 * Build up the op to use in building the osd
		 * request.  Note that the contents of the op are
		 * copied by rbd_osd_req_create().
		 */
		op = rbd_osd_req_op_create(opcode, offset, length);
		if (!op)
			goto out_partial;
		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
						img_request->write_request,
						obj_request, op);
		rbd_osd_req_op_destroy(op);
		if (!obj_request->osd_req)
			goto out_partial;
		/* status and version are initially zero-filled */

		rbd_img_obj_request_add(img_request, obj_request);

		image_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	/* Current request was created but never added to the image request */
	rbd_obj_request_put(obj_request);
out_unwind:
	/* Drop every object request already linked into the image request */
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
1717
/*
 * Completion callback for one object request within an image request.
 * Object requests can complete in any order, but the block layer must
 * be notified strictly in order.  Under the completion lock, advance
 * next_completion past every contiguous run of requests already
 * marked done, ending each piece of the block request as we go.
 * Whichever callback observes the final completion finishes the whole
 * image request.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* Out-of-order completion: an earlier request is still pending */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	/* "more" is false exactly when every object request has completed */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1764
/*
 * Submit every object request belonging to an image request to the
 * OSD client.  Returns 0 on success, or the first submission error.
 * Completion is reported per-object via rbd_img_obj_callback().
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1788
/*
 * Acknowledge a notification on the header object by submitting a
 * NOTIFY_ACK op.  The request completes asynchronously; its callback
 * just drops the final reference.  Returns 0 if the ack was
 * submitted, or a negative errno.
 */
static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	struct ceph_osd_client *osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	/* Fire and forget: the callback releases the request */
	obj_request->callback = rbd_obj_request_put;
	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}
1821
1822static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1823{
1824 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1825 u64 hver;
1826 int rc;
1827
1828 if (!rbd_dev)
1829 return;
1830
1831 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1832 rbd_dev->header_name, (unsigned long long) notify_id,
1833 (unsigned int) opcode);
1834 rc = rbd_dev_refresh(rbd_dev, &hver);
1835 if (rc)
1836 rbd_warn(rbd_dev, "got notification but failed to "
1837 " update snaps: %d\n", rc);
1838
Alex Eldercf81b602013-01-17 12:18:46 -06001839 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06001840}
1841
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	/* Starting implies no event/request yet; stopping implies both exist */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start) {
		/* Lingering request: the OSD client resubmits it as needed */
		rbd_dev->watch_request = obj_request->osd_req;
		ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request);
	}
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	/* Synchronous: wait for the watch/unwatch op to complete */
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;

	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	if (start)
		goto done;	/* Done if setting up the watch request */
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
done:
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1907
/*
 * Synchronous osd object method call.
 *
 * Invokes class_name.method_name on the named object, passing
 * outbound/outbound_size as the method parameters and copying the
 * reply into the caller-supplied inbound buffer (inbound_size is its
 * capacity).  If version is non-NULL, the object version observed by
 * the call is returned through it.  Returns the method result (or the
 * number of reply bytes copied) on success, negative errno on failure.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *version)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc;
	struct ceph_osd_req_op *op;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations but they
	 * don't involve object data (so no offset or length).
	 * The result should be placed into the inbound buffer
	 * provided.  They also supply outbound data--parameters for
	 * the object method.  Currently if this is present it will
	 * be a snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, 0,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* The object request takes ownership of the page vector */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = ceph_copy_from_page_vector(pages, inbound, 0,
					obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
1983
/*
 * Block-layer request function.  Called with q->queue_lock held;
 * the lock is dropped while each request is translated into an image
 * request and submitted, then reacquired before fetching the next
 * one (and before ending a failed request).
 */
static void rbd_request_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Submission may sleep/allocate; don't hold the queue lock */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/* Quit early if the snapshot has disappeared */

		if (!atomic_read(&rbd_dev->exists)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		/* Reject a range that would wrap past the end of u64 space */
		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
2053
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of bvec that may be added to the bio
 * described by bmd without crossing an rbd object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2099
2100static void rbd_free_disk(struct rbd_device *rbd_dev)
2101{
2102 struct gendisk *disk = rbd_dev->disk;
2103
2104 if (!disk)
2105 return;
2106
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002107 if (disk->flags & GENHD_FL_UP)
2108 del_gendisk(disk);
2109 if (disk->queue)
2110 blk_cleanup_queue(disk->queue);
2111 put_disk(disk);
2112}
2113
Alex Elder788e2df2013-01-17 12:25:27 -06002114static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2115 const char *object_name,
2116 u64 offset, u64 length,
2117 char *buf, u64 *version)
2118
2119{
2120 struct ceph_osd_req_op *op;
2121 struct rbd_obj_request *obj_request;
2122 struct ceph_osd_client *osdc;
2123 struct page **pages = NULL;
2124 u32 page_count;
2125 int ret;
2126
2127 page_count = (u32) calc_pages_for(offset, length);
2128 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2129 if (IS_ERR(pages))
2130 ret = PTR_ERR(pages);
2131
2132 ret = -ENOMEM;
2133 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002134 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002135 if (!obj_request)
2136 goto out;
2137
2138 obj_request->pages = pages;
2139 obj_request->page_count = page_count;
2140
2141 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2142 if (!op)
2143 goto out;
2144 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2145 obj_request, op);
2146 rbd_osd_req_op_destroy(op);
2147 if (!obj_request->osd_req)
2148 goto out;
2149
2150 osdc = &rbd_dev->rbd_client->client->osdc;
2151 ret = rbd_obj_request_submit(osdc, obj_request);
2152 if (ret)
2153 goto out;
2154 ret = rbd_obj_request_wait(obj_request);
2155 if (ret)
2156 goto out;
2157
2158 ret = obj_request->result;
2159 if (ret < 0)
2160 goto out;
2161 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2162 if (version)
2163 *version = obj_request->version;
2164out:
2165 if (obj_request)
2166 rbd_obj_request_put(obj_request);
2167 else
2168 ceph_release_page_vector(pages, page_count);
2169
2170 return ret;
2171}
2172
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* free the previous (too-small) attempt */

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		/* Retry if the snapshot count changed since we sized it */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
2241
2242/*
2243 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002244 */
2245static int rbd_read_header(struct rbd_device *rbd_dev,
2246 struct rbd_image_header *header)
2247{
Alex Elder4156d992012-08-02 11:29:46 -05002248 struct rbd_image_header_ondisk *ondisk;
2249 u64 ver = 0;
2250 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002251
Alex Elder4156d992012-08-02 11:29:46 -05002252 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2253 if (IS_ERR(ondisk))
2254 return PTR_ERR(ondisk);
2255 ret = rbd_header_from_disk(header, ondisk);
2256 if (ret >= 0)
2257 header->obj_version = ver;
2258 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002259
Alex Elder4156d992012-08-02 11:29:46 -05002260 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002261}
2262
Alex Elder41f38c22012-10-25 23:34:40 -05002263static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002264{
2265 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002266 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002267
Alex Eldera0593292012-07-19 09:09:27 -05002268 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002269 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002270}
2271
Alex Elder94785542012-10-09 13:50:17 -07002272static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2273{
2274 sector_t size;
2275
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002276 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002277 return;
2278
2279 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2280 dout("setting size to %llu sectors", (unsigned long long) size);
2281 rbd_dev->mapping.size = (u64) size;
2282 set_capacity(rbd_dev->disk, size);
2283}
2284
/*
 * only read the first part of the ondisk header, without the snaps info
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;	/* NOTE: *hver is left untouched on error */

	/* Swap the freshly-read header in under the write lock */
	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	/* image_size was already copied above; this repeat is harmless */
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
2328
Alex Elder117973f2012-08-31 17:29:55 -05002329static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002330{
2331 int ret;
2332
Alex Elder117973f2012-08-31 17:29:55 -05002333 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002334 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002335 if (rbd_dev->image_format == 1)
2336 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2337 else
2338 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002339 mutex_unlock(&ctl_mutex);
2340
2341 return ret;
2342}
2343
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device, sizing queue limits so that requests line up with rbd
 * objects.  Returns 0 on success, -ENOMEM on allocation failure.
 * The disk is not yet added to the system (no add_disk() here).
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Prevent bios from straddling object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2391
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002392/*
2393 sysfs
2394*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002395
Alex Elder593a9e72012-02-07 12:03:37 -06002396static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2397{
2398 return container_of(dev, struct rbd_device, dev);
2399}
2400
/* sysfs: show the mapped image's size in bytes */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	/* Capacity can change on refresh; read it under the header lock */
	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
2413
Alex Elder34b13182012-07-13 20:35:12 -05002414/*
2415 * Note this shows the features for whatever's mapped, which is not
2416 * necessarily the base image.
2417 */
2418static ssize_t rbd_features_show(struct device *dev,
2419 struct device_attribute *attr, char *buf)
2420{
2421 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2422
2423 return sprintf(buf, "0x%016llx\n",
2424 (unsigned long long) rbd_dev->mapping.features);
2425}
2426
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002427static ssize_t rbd_major_show(struct device *dev,
2428 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002429{
Alex Elder593a9e72012-02-07 12:03:37 -06002430 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002431
2432 return sprintf(buf, "%d\n", rbd_dev->major);
2433}
2434
2435static ssize_t rbd_client_id_show(struct device *dev,
2436 struct device_attribute *attr, char *buf)
2437{
Alex Elder593a9e72012-02-07 12:03:37 -06002438 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002439
Alex Elder1dbb4392012-01-24 10:08:37 -06002440 return sprintf(buf, "client%lld\n",
2441 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002442}
2443
2444static ssize_t rbd_pool_show(struct device *dev,
2445 struct device_attribute *attr, char *buf)
2446{
Alex Elder593a9e72012-02-07 12:03:37 -06002447 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002448
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002449 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002450}
2451
Alex Elder9bb2f332012-07-12 10:46:35 -05002452static ssize_t rbd_pool_id_show(struct device *dev,
2453 struct device_attribute *attr, char *buf)
2454{
2455 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2456
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002457 return sprintf(buf, "%llu\n",
2458 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002459}
2460
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002461static ssize_t rbd_name_show(struct device *dev,
2462 struct device_attribute *attr, char *buf)
2463{
Alex Elder593a9e72012-02-07 12:03:37 -06002464 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002465
Alex Eldera92ffdf2012-10-30 19:40:33 -05002466 if (rbd_dev->spec->image_name)
2467 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2468
2469 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002470}
2471
/* sysfs "image_id" attribute: the image's unique id string. */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
2479
Alex Elder34b13182012-07-13 20:35:12 -05002480/*
2481 * Shows the name of the currently-mapped snapshot (or
2482 * RBD_SNAP_HEAD_NAME for the base image).
2483 */
/*
 * sysfs "current_snap" attribute: name of the currently-mapped
 * snapshot (RBD_SNAP_HEAD_NAME when the base image is mapped).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2492
Alex Elder86b00e02012-10-25 23:34:42 -05002493/*
2494 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2495 * for the parent image. If there is no parent, simply shows
2496 * "(no parent image)".
2497 */
2498static ssize_t rbd_parent_show(struct device *dev,
2499 struct device_attribute *attr,
2500 char *buf)
2501{
2502 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2503 struct rbd_spec *spec = rbd_dev->parent_spec;
2504 int count;
2505 char *bufp = buf;
2506
2507 if (!spec)
2508 return sprintf(buf, "(no parent image)\n");
2509
2510 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2511 (unsigned long long) spec->pool_id, spec->pool_name);
2512 if (count < 0)
2513 return count;
2514 bufp += count;
2515
2516 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2517 spec->image_name ? spec->image_name : "(unknown)");
2518 if (count < 0)
2519 return count;
2520 bufp += count;
2521
2522 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2523 (unsigned long long) spec->snap_id, spec->snap_name);
2524 if (count < 0)
2525 return count;
2526 bufp += count;
2527
2528 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2529 if (count < 0)
2530 return count;
2531 bufp += count;
2532
2533 return (ssize_t) (bufp - buf);
2534}
2535
/*
 * sysfs "refresh" store: any write triggers a re-read of the image
 * header via rbd_dev_refresh().  Returns the write size on success,
 * or the negative error from the refresh.
 */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev, NULL);

	return ret < 0 ? ret : size;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002548
/*
 * Per-device sysfs attributes.  All are read-only except "refresh",
 * which is write-only (S_IWUSR) and triggers a header re-read.
 * These names are user-visible ABI (Documentation/ABI/testing/sysfs-bus-rbd).
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002560
/* Attribute table wired into rbd_device_type below. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated list of groups, as the device model expects. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Empty release: the rbd_device embedding this struct device is
 * freed elsewhere (rbd_dev_destroy), not by the device core.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2594
2595
2596/*
2597 sysfs - snapshots
2598*/
2599
/* sysfs "snap_size" attribute on a snapshot device: snapshot size in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2608
/* sysfs "snap_id" attribute on a snapshot device: the snapshot's id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2617
/* sysfs "snap_features" attribute: the snapshot's feature bit mask, in hex. */
static ssize_t rbd_snap_features_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
2627
/* Read-only sysfs attributes exposed on each snapshot's device node. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release for a snapshot device: frees the rbd_snap
 * (and its owned name string) once the last reference is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2659
/*
 * Take a reference on an rbd_spec and return it, so the call can be
 * used inline where the spec is being handed to a new holder.
 */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2666
static void rbd_spec_free(struct kref *kref);
/*
 * Drop a reference on an rbd_spec; frees it via rbd_spec_free() when
 * the count hits zero.  NULL is accepted and ignored.
 */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2673
/*
 * Allocate a zeroed rbd_spec with its refcount initialized to 1.
 * Returns NULL on allocation failure.  The caller owns the initial
 * reference and releases it with rbd_spec_put().
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* Net no-op (get then put); keeps rbd_spec_get() referenced */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2687
/*
 * kref release callback: frees the spec's owned name/id strings
 * (kfree tolerates NULL for any that were never set) and the spec.
 * Only ever invoked via rbd_spec_put().
 */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2698
/*
 * Allocate and initialize an rbd_device.  Takes ownership of the
 * caller's references to @rbdc and @spec (rbd_dev_destroy() drops
 * them).  Returns NULL on allocation failure.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	/* pool_id is truncated to 32 bits here; callers check it fits */
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2726
/*
 * Tear down an rbd_device created by rbd_dev_create(): drop the
 * parent and own spec references, release the client, and free the
 * header name and the device itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2735
Alex Elder304f6802012-08-31 17:29:52 -05002736static bool rbd_snap_registered(struct rbd_snap *snap)
2737{
2738 bool ret = snap->dev.type == &rbd_snap_device_type;
2739 bool reg = device_is_registered(&snap->dev);
2740
2741 rbd_assert(!ret ^ reg);
2742
2743 return ret;
2744}
2745
/*
 * Unlink a snapshot from its device's snapshot list and, if its
 * device was registered, unregister it (which drops the reference
 * that ultimately frees the snap via rbd_snap_dev_release()).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2752
/*
 * Register a snapshot's struct device under @parent.  The device name
 * is the snapshot name with RBD_SNAP_DEV_NAME_PREFIX prepended.
 * Returns the device_register() result.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				  struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
2769
Alex Elder4e891e02012-07-10 20:30:10 -05002770static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002771 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002772 u64 snap_id, u64 snap_size,
2773 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002774{
Alex Elder4e891e02012-07-10 20:30:10 -05002775 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002776 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002777
2778 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002779 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002780 return ERR_PTR(-ENOMEM);
2781
2782 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002783 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002784 if (!snap->name)
2785 goto err;
2786
Alex Elderc8d18422012-07-10 20:30:11 -05002787 snap->id = snap_id;
2788 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002789 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002790
2791 return snap;
2792
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002793err:
2794 kfree(snap->name);
2795 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002796
2797 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002798}
2799
Alex Eldercd892122012-07-03 16:01:19 -05002800static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2801 u64 *snap_size, u64 *snap_features)
2802{
2803 char *snap_name;
2804
2805 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2806
2807 *snap_size = rbd_dev->header.snap_sizes[which];
2808 *snap_features = 0; /* No features for v1 */
2809
2810 /* Skip over names until we find the one we are looking for */
2811
2812 snap_name = rbd_dev->header.snap_names;
2813 while (which--)
2814 snap_name += strlen(snap_name) + 1;
2815
2816 return snap_name;
2817}
2818
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002819/*
Alex Elder9d475de2012-07-03 16:01:19 -05002820 * Get the size and object order for an image snapshot, or if
2821 * snap_id is CEPH_NOSNAP, gets this information for the base
2822 * image.
2823 */
/*
 * Query the OSDs ("get_size" class method) for the size and object
 * order of snapshot @snap_id, or of the base image if @snap_id is
 * CEPH_NOSNAP.  Results are returned through @order and @snap_size.
 * Returns 0 on success or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the get_size reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	(void) rbd_req_sync_exec;	/* Avoid a warning */
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2852
/*
 * Fetch the base image's size and object order into the in-memory
 * header.  Thin wrapper around _rbd_dev_v2_snap_size(CEPH_NOSNAP).
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2859
/*
 * Fetch the image's object name prefix via the "get_object_prefix"
 * class method and store a newly-allocated copy in the header.
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_obj_method_sync() can return positive */

	/* Decode the length-prefixed prefix string from the reply */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2896
/*
 * Query the feature bits for snapshot @snap_id (or the base image for
 * CEPH_NOSNAP) via the "get_features" class method.  Fails with
 * -ENXIO if the image has incompatible features this driver does not
 * implement (anything outside RBD_FEATURES_ALL).
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the get_features reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2930
/* Fetch the base image's feature bits into the in-memory header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2936
/*
 * Query the "get_parent" class method and, if this image has a
 * parent, record its spec (pool/image/snap ids) and the overlap in
 * rbd_dev.  An image with pool id CEPH_NOPOOL has no parent, which is
 * not an error.  Name fields of the parent spec are left NULL here;
 * they are filled in later (see rbd_dev_probe_update_spec()).
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;	/* Any decode failure below reports -ERANGE */
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* No-op once ownership transferred */

	return ret;
}
3005
/*
 * Look up the image name for this image's id via the rbd directory
 * object ("dir_get_name" class method).  Returns a newly-allocated
 * name the caller must free, or NULL on any failure (allocation,
 * request, or decode) -- the caller treats a missing name as
 * tolerable.  Must only be called when the name is not yet known.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	/* Encode the image id as a length-prefixed string argument */
	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* Tolerated; caller warns */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
3054
3055/*
3056 * When a parent image gets probed, we only have the pool, image,
3057 * and snapshot ids but not the names of any of them. This call
3058 * is made later to fill in those names. It has to be done after
3059 * rbd_dev_snaps_update() has completed because some of the
3060 * information (in particular, snapshot name) is not available
3061 * until then.
3062 */
3063static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3064{
3065 struct ceph_osd_client *osdc;
3066 const char *name;
3067 void *reply_buf = NULL;
3068 int ret;
3069
3070 if (rbd_dev->spec->pool_name)
3071 return 0; /* Already have the names */
3072
3073 /* Look up the pool name */
3074
3075 osdc = &rbd_dev->rbd_client->client->osdc;
3076 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003077 if (!name) {
3078 rbd_warn(rbd_dev, "there is no pool with id %llu",
3079 rbd_dev->spec->pool_id); /* Really a BUG() */
3080 return -EIO;
3081 }
Alex Elder9e15b772012-10-30 19:40:33 -05003082
3083 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3084 if (!rbd_dev->spec->pool_name)
3085 return -ENOMEM;
3086
3087 /* Fetch the image name; tolerate failure here */
3088
3089 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003090 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003091 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003092 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003093 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003094
3095 /* Look up the snapshot name. */
3096
3097 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3098 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003099 rbd_warn(rbd_dev, "no snapshot with id %llu",
3100 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003101 ret = -EIO;
3102 goto out_err;
3103 }
3104 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3105 if(!rbd_dev->spec->snap_name)
3106 goto out_err;
3107
3108 return 0;
3109out_err:
3110 kfree(reply_buf);
3111 kfree(rbd_dev->spec->pool_name);
3112 rbd_dev->spec->pool_name = NULL;
3113
3114 return ret;
3115}
3116
Alex Elder6e14b1a2012-07-03 16:01:19 -05003117static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003118{
3119 size_t size;
3120 int ret;
3121 void *reply_buf;
3122 void *p;
3123 void *end;
3124 u64 seq;
3125 u32 snap_count;
3126 struct ceph_snap_context *snapc;
3127 u32 i;
3128
3129 /*
3130 * We'll need room for the seq value (maximum snapshot id),
3131 * snapshot count, and array of that many snapshot ids.
3132 * For now we have a fixed upper limit on the number we're
3133 * prepared to receive.
3134 */
3135 size = sizeof (__le64) + sizeof (__le32) +
3136 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3137 reply_buf = kzalloc(size, GFP_KERNEL);
3138 if (!reply_buf)
3139 return -ENOMEM;
3140
Alex Elder36be9a72013-01-19 00:30:28 -06003141 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder35d489f2012-07-03 16:01:19 -05003142 "rbd", "get_snapcontext",
3143 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003144 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003145 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003146 if (ret < 0)
3147 goto out;
3148
3149 ret = -ERANGE;
3150 p = reply_buf;
3151 end = (char *) reply_buf + size;
3152 ceph_decode_64_safe(&p, end, seq, out);
3153 ceph_decode_32_safe(&p, end, snap_count, out);
3154
3155 /*
3156 * Make sure the reported number of snapshot ids wouldn't go
3157 * beyond the end of our buffer. But before checking that,
3158 * make sure the computed size of the snapshot context we
3159 * allocate is representable in a size_t.
3160 */
3161 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3162 / sizeof (u64)) {
3163 ret = -EINVAL;
3164 goto out;
3165 }
3166 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3167 goto out;
3168
3169 size = sizeof (struct ceph_snap_context) +
3170 snap_count * sizeof (snapc->snaps[0]);
3171 snapc = kmalloc(size, GFP_KERNEL);
3172 if (!snapc) {
3173 ret = -ENOMEM;
3174 goto out;
3175 }
3176
3177 atomic_set(&snapc->nref, 1);
3178 snapc->seq = seq;
3179 snapc->num_snaps = snap_count;
3180 for (i = 0; i < snap_count; i++)
3181 snapc->snaps[i] = ceph_decode_64(&p);
3182
3183 rbd_dev->header.snapc = snapc;
3184
3185 dout(" snap context seq = %llu, snap_count = %u\n",
3186 (unsigned long long) seq, (unsigned int) snap_count);
3187
3188out:
3189 kfree(reply_buf);
3190
3191 return 0;
3192}
3193
/*
 * Look up the name of format 2 snapshot @which (an index into the
 * header's snapshot context) via the "get_snapshot_name" class
 * method.  Returns a newly-allocated name the caller must free, or
 * an ERR_PTR on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout("  snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
3236
/*
 * Format 2 snapshot info: fetch the size and features of snapshot
 * @which through the out parameters and return its (allocated) name,
 * or an ERR_PTR if any of the three queries fails.
 */
static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	u64 snap_id;
	u8 order;	/* Fetched but unused here */
	int ret;

	snap_id = rbd_dev->header.snapc->snaps[which];
	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
	if (ret)
		return ERR_PTR(ret);
	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
	if (ret)
		return ERR_PTR(ret);

	return rbd_dev_v2_snap_name(rbd_dev, which);
}
3254
3255static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3256 u64 *snap_size, u64 *snap_features)
3257{
3258 if (rbd_dev->image_format == 1)
3259 return rbd_dev_v1_snap_info(rbd_dev, which,
3260 snap_size, snap_features);
3261 if (rbd_dev->image_format == 2)
3262 return rbd_dev_v2_snap_info(rbd_dev, which,
3263 snap_size, snap_features);
3264 return ERR_PTR(-EINVAL);
3265}
3266
Alex Elder117973f2012-08-31 17:29:55 -05003267static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3268{
3269 int ret;
3270 __u8 obj_order;
3271
3272 down_write(&rbd_dev->header_rwsem);
3273
3274 /* Grab old order first, to see if it changes */
3275
3276 obj_order = rbd_dev->header.obj_order,
3277 ret = rbd_dev_v2_image_size(rbd_dev);
3278 if (ret)
3279 goto out;
3280 if (rbd_dev->header.obj_order != obj_order) {
3281 ret = -EIO;
3282 goto out;
3283 }
3284 rbd_update_mapping_size(rbd_dev);
3285
3286 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3287 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3288 if (ret)
3289 goto out;
3290 ret = rbd_dev_snaps_update(rbd_dev);
3291 dout("rbd_dev_snaps_update returned %d\n", ret);
3292 if (ret)
3293 goto out;
3294 ret = rbd_dev_snaps_register(rbd_dev);
3295 dout("rbd_dev_snaps_register returned %d\n", ret);
3296out:
3297 up_write(&rbd_dev->header_rwsem);
3298
3299 return ret;
3300}
3301
Alex Elder9d475de2012-07-03 16:01:19 -05003302/*
Alex Elder35938152012-08-02 11:29:46 -05003303 * Scan the rbd device's current snapshot list and compare it to the
3304 * newly-received snapshot context. Remove any existing snapshots
3305 * not present in the new snapshot context. Add a new snapshot for
3306 * any snaphots in the snapshot context not in the current list.
3307 * And verify there are no changes to snapshots we already know
3308 * about.
3309 *
3310 * Assumes the snapshots in the snapshot context are sorted by
3311 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3312 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003313 */
Alex Elder304f6802012-08-31 17:29:52 -05003314static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003315{
Alex Elder35938152012-08-02 11:29:46 -05003316 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3317 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05003318 struct list_head *head = &rbd_dev->snaps;
3319 struct list_head *links = head->next;
3320 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003321
Alex Elder9fcbb802012-08-23 23:48:49 -05003322 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05003323 while (index < snap_count || links != head) {
3324 u64 snap_id;
3325 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05003326 char *snap_name;
3327 u64 snap_size = 0;
3328 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003329
Alex Elder35938152012-08-02 11:29:46 -05003330 snap_id = index < snap_count ? snapc->snaps[index]
3331 : CEPH_NOSNAP;
3332 snap = links != head ? list_entry(links, struct rbd_snap, node)
3333 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05003334 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003335
Alex Elder35938152012-08-02 11:29:46 -05003336 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3337 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003338
Alex Elder35938152012-08-02 11:29:46 -05003339 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003340
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003341 if (rbd_dev->spec->snap_id == snap->id)
Alex Elderd78b6502012-11-09 08:43:15 -06003342 atomic_set(&rbd_dev->exists, 0);
Alex Elder41f38c22012-10-25 23:34:40 -05003343 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05003344 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003345 rbd_dev->spec->snap_id == snap->id ?
3346 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05003347 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003348
Alex Elder35938152012-08-02 11:29:46 -05003349 /* Done with this list entry; advance */
3350
3351 links = next;
3352 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003353 }
Alex Elder35938152012-08-02 11:29:46 -05003354
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003355 snap_name = rbd_dev_snap_info(rbd_dev, index,
3356 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05003357 if (IS_ERR(snap_name))
3358 return PTR_ERR(snap_name);
3359
Alex Elder9fcbb802012-08-23 23:48:49 -05003360 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3361 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05003362 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3363 struct rbd_snap *new_snap;
3364
3365 /* We haven't seen this snapshot before */
3366
Alex Elderc8d18422012-07-10 20:30:11 -05003367 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05003368 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05003369 if (IS_ERR(new_snap)) {
3370 int err = PTR_ERR(new_snap);
3371
3372 dout(" failed to add dev, error %d\n", err);
3373
3374 return err;
3375 }
Alex Elder35938152012-08-02 11:29:46 -05003376
3377 /* New goes before existing, or at end of list */
3378
Alex Elder9fcbb802012-08-23 23:48:49 -05003379 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05003380 if (snap)
3381 list_add_tail(&new_snap->node, &snap->node);
3382 else
Alex Elder523f3252012-08-30 00:16:37 -05003383 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05003384 } else {
3385 /* Already have this one */
3386
Alex Elder9fcbb802012-08-23 23:48:49 -05003387 dout(" already present\n");
3388
Alex Eldercd892122012-07-03 16:01:19 -05003389 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05003390 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05003391 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05003392
3393 /* Done with this list entry; advance */
3394
3395 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003396 }
Alex Elder35938152012-08-02 11:29:46 -05003397
3398 /* Advance to the next entry in the snapshot context */
3399
3400 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003401 }
Alex Elder9fcbb802012-08-23 23:48:49 -05003402 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003403
3404 return 0;
3405}
3406
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 *
 * Returns 0 on success, or the first negative error returned by
 * rbd_register_snap_dev() (registration stops at the first failure).
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/*
	 * Snapshot devices are registered as children of rbd_dev->dev,
	 * so that parent must itself already be registered.
	 */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	/* Register only snapshots not already known to the device core. */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
3431
/*
 * Initialize the struct device embedded in an rbd_device and register
 * it with the driver core under the rbd bus, named by its numeric
 * device id.  Performed under ctl_mutex.
 *
 * Returns the result of device_register().
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* invoked when last ref is dropped */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3451
/*
 * Undo rbd_bus_add_dev(): remove the rbd device from the driver core.
 * Final cleanup happens in the device's release callback.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3456
Alex Eldere2839302012-08-29 17:11:06 -05003457static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003458
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() guarantees a never-reused, positive id */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003473
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.  If the released id was the
 * current maximum, rescan the list to find the new maximum so
 * rbd_dev_id_max does not grow without bound.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);	/* ids handed out start at 1 */

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;	/* shadows the parameter */

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.  (cmpxchg only installs max_id if rbd_id is still
	 * the recorded maximum.)
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3524
/*
 * Advance *buf past any leading white space (the characters for
 * which isspace() is nonzero in the "C" and "POSIX" locales) and
 * report the length of the token -- the run of non-space characters
 * -- that begins there.  *buf must be NUL-terminated, and is left
 * pointing at the first character of the token (or its terminating
 * '\0' if none was found).
 */
static inline size_t next_token(const char **buf)
{
	static const char spaces[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, spaces);	/* skip leading spaces */
	*buf = start;

	return strcspn(start, spaces);		/* token length */
}
3543
/*
 * Find the next white-space-delimited token in *buf and, if the
 * provided buffer is big enough, copy it there as a NUL-terminated
 * string.  *buf must itself be NUL-terminated on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * A return of 0 means no token was found; a return >= token_size
 * means the token did not fit (and nothing was copied).
 *
 * *buf is advanced past the end of the token in every case, even
 * when the token was too large to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	static const char spaces[] = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);	/* start of token */
	len = strcspn(*buf, spaces);	/* its length */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3573
3574/*
Alex Elderea3352f2012-07-09 21:04:23 -05003575 * Finds the next token in *buf, dynamically allocates a buffer big
3576 * enough to hold a copy of it, and copies the token into the new
3577 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3578 * that a duplicate buffer is created even for a zero-length token.
3579 *
3580 * Returns a pointer to the newly-allocated duplicate, or a null
3581 * pointer if memory for the duplicate was not available. If
3582 * the lenp argument is a non-null pointer, the length of the token
3583 * (not including the '\0') is returned in *lenp.
3584 *
3585 * If successful, the *buf pointer will be updated to point beyond
3586 * the end of the found token.
3587 *
3588 * Note: uses GFP_KERNEL for allocation.
3589 */
3590static inline char *dup_token(const char **buf, size_t *lenp)
3591{
3592 char *dup;
3593 size_t len;
3594
3595 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003596 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003597 if (!dup)
3598 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003599 *(dup + len) = '\0';
3600 *buf += len;
3601
3602 if (lenp)
3603 *lenp = len;
3604
3605 return dup;
3606}
3607
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * Monitor addresses are not duplicated here; remember their
	 * position and size so ceph_parse_options() can consume them
	 * directly from the input buffer below.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;	/* default error for the "empty token" paths below */
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* len + 1 may copy a trailing space; it is overwritten with '\0' */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: transfer ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3751
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the '\0', covering the concatenation below */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" class on that object */
	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_obj_method_sync() can return positive */

	p = response;
	/* Decode the length-prefixed string into a fresh allocation */
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3827
/*
 * Probe the given rbd_dev as a format 1 (original-format) image:
 * record an empty image id and the "<image_name>.rbd" header object
 * name, then read the on-disk header into rbd_dev->header.
 *
 * Returns 0 on success; on error, returns a negative errno and
 * leaves image_id and header_name freed and NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	/* sizeof () includes the '\0' needed for the concatenation */
	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* Undo both allocations so the caller sees a clean rbd_dev */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3876
/*
 * Probe the given rbd_dev as a format 2 image.  The image id has
 * already been filled in by the caller; build the header object
 * name from it, then fetch the image's size/order, object prefix,
 * features, (optionally) parent info, and snapshot context.
 *
 * Returns 0 on success; on error, returns a negative errno with
 * all partially-initialized fields released and reset.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Release everything acquired above, in reverse order */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3949
/*
 * Complete the probe of an rbd_dev whose image metadata has been
 * read: update the snapshot list, fill in the mapping, allocate a
 * device id, set up the block device (major number, gendisk), hook
 * the device into sysfs, register its snapshots, start watching the
 * header object, and finally announce the disk.
 *
 * Returns 0 on success or a negative errno, unwinding whatever was
 * set up before the failure.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0 = dynamic major */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);	/* 1 = start watching */
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
4031
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 *
 * Returns 0 on success or a negative errno; on failure after the
 * format-specific probe succeeded, the header is freed again.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}
4063
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004064static ssize_t rbd_add(struct bus_type *bus,
4065 const char *buf,
4066 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004067{
Alex Eldercb8627c2012-07-09 21:04:23 -05004068 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004069 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004070 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004071 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004072 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004073 struct ceph_osd_client *osdc;
4074 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004075
4076 if (!try_module_get(THIS_MODULE))
4077 return -ENODEV;
4078
Alex Eldera725f65e2012-02-02 08:13:30 -06004079 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004080 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004081 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004082 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004083
Alex Elder9d3997f2012-10-25 23:34:42 -05004084 rbdc = rbd_get_client(ceph_opts);
4085 if (IS_ERR(rbdc)) {
4086 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004087 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004088 }
Alex Elderc53d5892012-10-25 23:34:42 -05004089 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004090
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004091 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004092 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004093 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004094 if (rc < 0)
4095 goto err_out_client;
Alex Elder859c31d2012-10-25 23:34:42 -05004096 spec->pool_id = (u64) rc;
4097
Alex Elder0903e872012-11-14 12:25:19 -06004098 /* The ceph file layout needs to fit pool id in 32 bits */
4099
4100 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4101 rc = -EIO;
4102 goto err_out_client;
4103 }
4104
Alex Elderc53d5892012-10-25 23:34:42 -05004105 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004106 if (!rbd_dev)
4107 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004108 rbdc = NULL; /* rbd_dev now owns this */
4109 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004110
Alex Elderbd4ba652012-10-25 23:34:42 -05004111 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004112 kfree(rbd_opts);
4113 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004114
Alex Eldera30b71b2012-07-10 20:30:11 -05004115 rc = rbd_dev_probe(rbd_dev);
4116 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004117 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004118
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004119 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05004120err_out_rbd_dev:
4121 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004122err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004123 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004124err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004125 if (ceph_opts)
4126 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004127 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004128 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004129err_out_module:
4130 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004131
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004132 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004133
4134 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004135}
4136
Alex Elderde71a292012-07-03 16:01:19 -05004137static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004138{
4139 struct list_head *tmp;
4140 struct rbd_device *rbd_dev;
4141
Alex Eldere124a822012-01-29 13:57:44 -06004142 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004143 list_for_each(tmp, &rbd_dev_list) {
4144 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004145 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06004146 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004147 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06004148 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004149 }
Alex Eldere124a822012-01-29 13:57:44 -06004150 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004151 return NULL;
4152}
4153
/*
 * Release callback for an rbd device's embedded struct device,
 * invoked by the driver core when its last reference is dropped
 * (e.g. via rbd_bus_del_dev() during "remove").  Undoes, in order,
 * the watch setup, the block device, the header, the device id,
 * and finally the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one is outstanding */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/*
	 * NOTE(review): second argument 0 presumably means "tear the
	 * header watch down" — confirm against rbd_dev_header_watch_sync().
	 */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
4182
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004183static ssize_t rbd_remove(struct bus_type *bus,
4184 const char *buf,
4185 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004186{
4187 struct rbd_device *rbd_dev = NULL;
4188 int target_id, rc;
4189 unsigned long ul;
4190 int ret = count;
4191
4192 rc = strict_strtoul(buf, 10, &ul);
4193 if (rc)
4194 return rc;
4195
4196 /* convert to int; abort if we lost anything in the conversion */
4197 target_id = (int) ul;
4198 if (target_id != ul)
4199 return -EINVAL;
4200
4201 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4202
4203 rbd_dev = __rbd_get_dev(target_id);
4204 if (!rbd_dev) {
4205 ret = -ENOENT;
4206 goto done;
4207 }
4208
Alex Elder42382b72012-11-16 09:29:16 -06004209 if (rbd_dev->open_count) {
4210 ret = -EBUSY;
4211 goto done;
4212 }
4213
Alex Elder41f38c22012-10-25 23:34:40 -05004214 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004215 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004216
4217done:
4218 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004219
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004220 return ret;
4221}
4222
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004223/*
4224 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004225 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004226 */
4227static int rbd_sysfs_init(void)
4228{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004229 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004230
Alex Elderfed4c142012-02-07 12:03:36 -06004231 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004232 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004233 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004234
Alex Elderfed4c142012-02-07 12:03:36 -06004235 ret = bus_register(&rbd_bus_type);
4236 if (ret < 0)
4237 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004238
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004239 return ret;
4240}
4241
/* Undo rbd_sysfs_init(): unregister the bus, then the root device. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4247
4248int __init rbd_init(void)
4249{
4250 int rc;
4251
4252 rc = rbd_sysfs_init();
4253 if (rc)
4254 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004255 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004256 return 0;
4257}
4258
/* Module exit: tear down the sysfs bus/device created at load time. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4263
4264module_init(rbd_init);
4265module_exit(rbd_exit);
4266
4267MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4268MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4269MODULE_DESCRIPTION("rados block device");
4270
4271/* following authorship retained from original osdblk.c */
4272MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4273
4274MODULE_LICENSE("GPL");