blob: 60b68512fa93417583bc54a810d5bd8396a76979 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere */
/* NOTE(review): local maximum-value helpers; kernels that grow generic
 * U8_MAX..U64_MAX definitions would make these redundant. */

#define	U8_MAX	((u8)	(~0U))
#define	U16_MAX	((u16)	(~0U))
#define	U32_MAX	((u32)	(~0U))
#define	U64_MAX	((u64)	(~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Snapshot sysfs names are "snap_<name>"; bound the name so the whole
 * device name still fits in NAME_MAX. */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

/* Name used for the (unsnapshotted) head of an image */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix of the image's data object names */
	u64 features;		/* RBD_FEATURE_* bits */
	__u8 obj_order;		/* presumably log2 of object size — TODO confirm */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* flat array of NUL-terminated names */
	u64 *snap_sizes;	/* per-snapshot image sizes, parallel to snapc */

	u64 obj_version;	/* version of the header object this came from */
};
117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;	/* may be NULL (see above) */

	u64		snap_id;	/* CEPH_NOSNAP for the head */
	char		*snap_name;

	struct kref	kref;		/* shared between parent and child devs */
};
155
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* one ref per rbd_device using this client */
	struct list_head	node;	/* entry on rbd_client_list */
};
164
/* Completion callback types for image and object requests */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/* Sentinel "which" index for an object request not yet in an image request */
#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* How an object request's data payload is carried (if at all) */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
/*
 * A request against a single rados object, normally one of several
 * making up an image (rbd_img_request) I/O.
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */

	struct rbd_img_request	*img_request;
	struct list_head	links;		/* img_request->obj_requests */
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {				/* payload, selected by type */
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	s32			result;		/* completion status */
	atomic_t		done;		/* nonzero once completed */

	rbd_obj_callback_t	callback;
	struct completion	completion;	/* for synchronous waiters */

	struct kref		kref;
};
207
/*
 * A single I/O against an rbd image, fanned out into one object
 * request per rados object the byte range touches.
 */
struct rbd_img_request {
	struct request		*rq;		/* originating block request */
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	bool			write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64		snap_id;		/* for reads */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

/* Iterate the object requests belonging to an image request */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &ireq->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
234
/* In-memory record of one snapshot of an image (with its sysfs device) */
struct rbd_snap {
	struct device		dev;	/* sysfs representation */
	const char		*name;
	u64			size;	/* image size at snapshot time */
	struct list_head	node;	/* entry on rbd_device->snaps */
	u64			id;
	u64			features;
};

/* What this device is currently mapped to (head or a snapshot) */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
249
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	atomic_t		exists;		/* nonzero once mapping is set up */
	struct rbd_spec		*spec;		/* identity of the mapped image */

	char			*header_name;	/* name of the header object */

	struct ceph_file_layout	layout;

	/* watch on the header object, for change notification */
	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request	*watch_request;

	/* layering: identity of the parent image, if any */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* under ctl_mutex */
};
294
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for functions defined later in this file */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* Devices are mapped/unmapped by writing to these bus attributes */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* No-op release; rbd_root_dev is a static singleton, never freed */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
/*
 * Log a warning, prefixed with the most specific identification
 * available for the device: disk name, then image name, then image
 * id, then the raw pointer.  @rbd_dev may be NULL.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
360
#ifdef RBD_DEBUG
/*
 * Verify an invariant; log the failing expression and BUG() if it
 * does not hold.  Wrapped in do { } while (0) so the macro expands
 * to a single statement — the previous bare-if form misparsed when
 * used as the body of an unbraced if/else.
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
					"\trbd_assert(%s);\n\n",	\
							__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800373
/* Header refresh, format-independent entry point and the format 2 variant */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376
/*
 * Block device open method.  Rejects writable opens of read-only
 * mappings, then (under ctl_mutex, so it cannot race with teardown)
 * pins the device and bumps its open count.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* dropped in rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
392
/*
 * Block device release method: undo rbd_open()'s open count bump and
 * device reference, under the same ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
405
/* Block device operations exposed to the block layer */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
411
412/*
413 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500414 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415 */
Alex Elderf8c38922012-08-10 13:12:07 -0700416static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700417{
418 struct rbd_client *rbdc;
419 int ret = -ENOMEM;
420
421 dout("rbd_client_create\n");
422 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
423 if (!rbdc)
424 goto out_opt;
425
426 kref_init(&rbdc->kref);
427 INIT_LIST_HEAD(&rbdc->node);
428
Alex Elderbc534d862012-01-29 13:57:44 -0600429 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
430
Alex Elder43ae4702012-07-03 16:01:18 -0500431 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600433 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500434 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700435
436 ret = ceph_open_session(rbdc->client);
437 if (ret < 0)
438 goto out_err;
439
Alex Elder432b8582012-01-29 13:57:44 -0600440 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600442 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443
Alex Elderbc534d862012-01-29 13:57:44 -0600444 mutex_unlock(&ctl_mutex);
445
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700446 dout("rbd_client_create created %p\n", rbdc);
447 return rbdc;
448
449out_err:
450 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600451out_mutex:
452 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 kfree(rbdc);
454out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500455 if (ceph_opts)
456 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400457 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458}
459
460/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700461 * Find a ceph client with specific addr and configuration. If
462 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700464static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465{
466 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700467 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elder43ae4702012-07-03 16:01:18 -0500469 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700470 return NULL;
471
Alex Elder1f7ba332012-08-10 13:12:07 -0700472 spin_lock(&rbd_client_list_lock);
473 list_for_each_entry(client_node, &rbd_client_list, node) {
474 if (!ceph_compare_options(ceph_opts, client_node->client)) {
475 kref_get(&client_node->kref);
476 found = true;
477 break;
478 }
479 }
480 spin_unlock(&rbd_client_list_lock);
481
482 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483}
484
/*
 * mount options
 *
 * The Opt_last_* entries are sentinels: parse_rbd_opts_token() uses
 * them to tell which argument type (int, string, Boolean) a matched
 * token takes.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Parsed per-map option values */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
/*
 * Parse one mount option token into @private (an rbd_options struct).
 * Called via ceph_parse_options() for each unrecognized option.
 * Returns 0 on success, -EINVAL for an unknown token, or a match_int()
 * error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Argument extraction is keyed off the Opt_last_* sentinel ranges */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned a token we don't handle — a bug */
		rbd_assert(false);
		break;
	}
	return 0;
}
556
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  In either case ownership of @ceph_opts is
 * taken from the caller.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* Reusing an existing client; the options are redundant */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
573
/*
 * Destroy ceph client
 *
 * kref release callback: unlink the client from rbd_client_list and
 * tear it down.  (The comment below about the caller holding
 * rbd_client_list_lock appears stale — the lock is taken here.)
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
591
592/*
593 * Drop reference to ceph client node. If it's not referenced anymore, release
594 * it.
595 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500596static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597{
Alex Elderc53d5892012-10-25 23:34:42 -0500598 if (rbdc)
599 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700600}
601
Alex Eldera30b71b2012-07-10 20:30:11 -0500602static bool rbd_image_format_valid(u32 image_format)
603{
604 return image_format == 1 || image_format == 2;
605}
606
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its fields: magic text, sane object order, and snapshot
 * counts/name lengths that cannot overflow size_t arithmetic in
 * rbd_header_from_disk().
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
645
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646/*
647 * Create a new header structure, translate header format from the on-disk
648 * header.
649 */
650static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500651 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652{
Alex Elderccece232012-07-10 20:30:10 -0500653 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500654 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500655 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500656 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657
Alex Elder6a523252012-07-19 17:12:59 -0500658 memset(header, 0, sizeof (*header));
659
Alex Elder103a1502012-08-02 11:29:45 -0500660 snap_count = le32_to_cpu(ondisk->snap_count);
661
Alex Elder58c17b02012-08-23 23:22:06 -0500662 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
663 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500664 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500666 memcpy(header->object_prefix, ondisk->object_prefix, len);
667 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600668
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500670 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
671
Alex Elder621901d2012-08-23 23:22:06 -0500672 /* Save a copy of the snapshot names */
673
Alex Elderf785cc12012-08-23 23:22:06 -0500674 if (snap_names_len > (u64) SIZE_MAX)
675 return -EIO;
676 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500678 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500679 /*
680 * Note that rbd_dev_v1_header_read() guarantees
681 * the ondisk buffer we're working with has
682 * snap_names_len bytes beyond the end of the
683 * snapshot id array, this memcpy() is safe.
684 */
685 memcpy(header->snap_names, &ondisk->snaps[snap_count],
686 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500687
Alex Elder621901d2012-08-23 23:22:06 -0500688 /* Record each snapshot's size */
689
Alex Elderd2bb24e2012-07-26 23:37:14 -0500690 size = snap_count * sizeof (*header->snap_sizes);
691 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500693 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500694 for (i = 0; i < snap_count; i++)
695 header->snap_sizes[i] =
696 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 } else {
Alex Elderccece232012-07-10 20:30:10 -0500698 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 header->snap_names = NULL;
700 header->snap_sizes = NULL;
701 }
Alex Elder849b4262012-07-09 21:04:24 -0500702
Alex Elder34b13182012-07-13 20:35:12 -0500703 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 header->obj_order = ondisk->options.order;
705 header->crypt_type = ondisk->options.crypt_type;
706 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500707
Alex Elder621901d2012-08-23 23:22:06 -0500708 /* Allocate and fill in the snapshot context */
709
Alex Elderf84344f2012-08-31 17:29:51 -0500710 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500711 size = sizeof (struct ceph_snap_context);
712 size += snap_count * sizeof (header->snapc->snaps[0]);
713 header->snapc = kzalloc(size, GFP_KERNEL);
714 if (!header->snapc)
715 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
717 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500718 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500720 for (i = 0; i < snap_count; i++)
721 header->snapc->snaps[i] =
722 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
724 return 0;
725
Alex Elder6a523252012-07-19 17:12:59 -0500726out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500727 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500728 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500730 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500731 kfree(header->object_prefix);
732 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500733
Alex Elder00f1f362012-02-07 12:03:36 -0600734 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735}
736
Alex Elder9e15b772012-10-30 19:40:33 -0500737static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
738{
739 struct rbd_snap *snap;
740
741 if (snap_id == CEPH_NOSNAP)
742 return RBD_SNAP_HEAD_NAME;
743
744 list_for_each_entry(snap, &rbd_dev->snaps, node)
745 if (snap_id == snap->id)
746 return snap->name;
747
748 return NULL;
749}
750
/*
 * Look up a snapshot by name and, on a match, point the device's
 * mapping (snap_id, size, features) at it.  Returns 0 on success,
 * -ENOENT if no snapshot with that name exists.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
768
/*
 * Initialize the device's mapping (snap id, size, features) from its
 * spec's snapshot name.  Mapping a snapshot (anything other than the
 * "head") is forced read-only.  Returns 0 on success or the error
 * from the snapshot lookup.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	/* Mapping the head uses the current image size/features */
	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		/* Snapshots are immutable, so map them read-only */
		rbd_dev->mapping.read_only = true;
	}
	/* Only marked as existing when the head/snapshot was found */
	atomic_set(&rbd_dev->exists, 1);
done:
	return ret;
}
789
/*
 * Release everything dynamically allocated within an in-core image
 * header.  Pointers are nulled afterward so a repeat call is
 * harmless; the snap context is reference-counted, not kfree'd.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* drops a reference */
	header->snapc = NULL;
}
801
Alex Elder98571b52013-01-20 14:44:42 -0600802static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803{
Alex Elder65ccfe22012-08-09 10:33:26 -0700804 char *name;
805 u64 segment;
806 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807
Alex Elder2fd82b92012-11-09 15:05:54 -0600808 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700809 if (!name)
810 return NULL;
811 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600812 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600814 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700815 pr_err("error formatting segment name for #%llu (%d)\n",
816 segment, ret);
817 kfree(name);
818 name = NULL;
819 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820
Alex Elder65ccfe22012-08-09 10:33:26 -0700821 return name;
822}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700823
Alex Elder65ccfe22012-08-09 10:33:26 -0700824static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
825{
826 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700827
Alex Elder65ccfe22012-08-09 10:33:26 -0700828 return offset & (segment_size - 1);
829}
830
831static u64 rbd_segment_length(struct rbd_device *rbd_dev,
832 u64 offset, u64 length)
833{
834 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
835
836 offset &= segment_size - 1;
837
Alex Elderaafb2302012-09-06 16:00:54 -0500838 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700839 if (offset + length > segment_size)
840 length = segment_size - offset;
841
842 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700843}
844
845/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700846 * returns the size of an object in the image
847 */
848static u64 rbd_obj_bytes(struct rbd_image_header *header)
849{
850 return 1 << header->obj_order;
851}
852
853/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 * bio helpers
855 */
856
857static void bio_chain_put(struct bio *chain)
858{
859 struct bio *tmp;
860
861 while (chain) {
862 tmp = chain;
863 chain = chain->bi_next;
864 bio_put(tmp);
865 }
866}
867
/*
 * Zero the data in a chain of bios from byte offset start_ofs (into
 * the chain as a whole) through to the end of the chain.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset of current segment within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero from start_ofs (or the segment start,
				 * whichever is later) to the segment end */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
894
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.  Returns the
 * clone, or a null pointer on bad arguments or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* offset into the first cloned segment */
	unsigned short end_idx;
	unsigned short vcnt;	/* number of bio_vec entries in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Reject empty or out-of-range requests */
	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.  At this point resid holds how
	 * much of len falls in the final segment.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone at most the rest of this bio, or what remains of len */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		/* Advance to the next source bio once this one is consumed */
		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1038
/* Take a reference on an object request */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	kref_get(&obj_request->kref);
}
1043
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference on an object request; destroyed at the last put */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1050
/* Take a reference on an image request */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1055
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference on an image request; destroyed at the last put */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1062
/*
 * Append an object request to an image request's list, recording its
 * position in "which" and its owning image request.  Takes a
 * reference on the object request, dropped by
 * rbd_img_obj_request_del().
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	obj_request->which = img_request->obj_request_count++;
	rbd_assert(obj_request->which != BAD_WHICH);
}
1072
/*
 * Remove an object request from its image request's list, clearing
 * its position and back-pointer, and drop the reference taken by
 * rbd_img_obj_request_add().
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);
	obj_request->which = BAD_WHICH;
	list_del(&obj_request->links);
	rbd_assert(obj_request->img_request == img_request);
	obj_request->callback = NULL;
	obj_request->img_request = NULL;
	rbd_obj_request_put(obj_request);
}
1084
1085static bool obj_request_type_valid(enum obj_request_type type)
1086{
1087 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001088 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001089 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001090 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001091 return true;
1092 default:
1093 return false;
1094 }
1095}
1096
/*
 * Allocate and fill in an osd request op.  The arguments that must
 * follow the opcode are opcode-specific; the supported calling forms
 * are shown in the per-case comments below.  Returns NULL on
 * allocation failure or for an unsupported opcode.  The result is
 * freed with rbd_osd_req_op_destroy().
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only writes carry data, so only they have a payload */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		/* The payload is class + method + input data */
		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		op->watch.ver = cpu_to_le64(op->watch.ver);
		/* A nonzero flag means establish the watch, not tear down */
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1159
/* Free an op built by rbd_osd_req_op_create() (NULL is allowed) */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1164
/*
 * Send ceph osd request.
 *
 * If rbd_cb is null the call is synchronous: we wait for the request
 * to complete, report the reasserted version through "ver" (if
 * non-null), and drop the request.  Otherwise the request completes
 * asynchronously via rbd_cb, which is responsible for the final
 * ceph_osdc_put_request().
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* Request holds its own reference on the bio chain */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = NULL;

	/*
	 * NOTE(review): strncpy() does not NUL-terminate when
	 * object_name fills r_oid exactly; presumably all object
	 * names are shorter than sizeof(r_oid) -- verify at callers.
	 */
	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout; /* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	/* A watch-establishing request must linger across reconnects */
	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* Drop the reference taken on the bio chain above */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	ceph_osdc_put_request(osd_req);

	return ret;
}
1244
/* Completion callback that just drops the osd request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1250
/*
 * Do a synchronous ceph osd operation.  For a read, the result (up
 * to inbound_size bytes) is copied into "inbound" if it is non-null.
 * On success the return value from the osd request is propagated
 * (used here as the number of bytes transferred for reads); on
 * failure a negative errno is returned.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	/* Result data lands in this temporary page vector */
	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* Null callback makes rbd_do_request() synchronous */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL,
			  ver);
	if (ret < 0)
		goto done;

	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1290
/* Hand an object request's prepared osd request to the osd client */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1296
1297static void rbd_img_request_complete(struct rbd_img_request *img_request)
1298{
1299 if (img_request->callback)
1300 img_request->callback(img_request);
1301 else
1302 rbd_img_request_put(img_request);
1303}
1304
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/*
 * Wait (interruptibly) for an object request to complete.  Returns 0
 * on completion, or a negative value if interrupted by a signal.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	return wait_for_completion_interruptible(&obj_request->completion);
}
1311
/* Per-op callback with nothing to decode: just mark the request done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
					struct ceph_osd_op *op)
{
	atomic_set(&obj_request->done, 1);
}
1317
Alex Elderbf0d5f502012-11-22 00:00:08 -06001318static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1319{
1320 if (obj_request->callback)
1321 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001322 else
1323 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001324}
1325
/*
 * Acknowledge a watch notification by sending a synchronous
 * NOTIFY_ACK op against the image header object.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		return -ENOMEM;

	/* Fire-and-forget: rbd_simple_req_cb just drops the request */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  op,
			  rbd_simple_req_cb, NULL);

	rbd_osd_req_op_destroy(op);

	return ret;
}
1351
1352static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1353{
Alex Elder0ce1a792012-07-03 16:01:18 -05001354 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001355 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001356 int rc;
1357
Alex Elder0ce1a792012-07-03 16:01:18 -05001358 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001359 return;
1360
Alex Elderbd919d42012-07-13 20:35:11 -05001361 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1362 rbd_dev->header_name, (unsigned long long) notify_id,
1363 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001364 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001365 if (rc)
Alex Elder06ecc6c2012-11-01 10:17:15 -05001366 rbd_warn(rbd_dev, "got notification but failed to "
1367 " update snaps: %d\n", rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001368
Alex Elder7f0a24d2012-07-25 09:32:40 -05001369 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001370}
1371
1372/*
Alex Elder907703d2012-11-13 21:11:15 -06001373 * Request sync osd watch/unwatch. The value of "start" determines
1374 * whether a watch request is being initiated or torn down.
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001375 */
Alex Elder907703d2012-11-13 21:11:15 -06001376static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001377{
Alex Elder5efea492012-11-19 22:55:21 -06001378 struct ceph_osd_req_op *op;
1379 int ret = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001380
Alex Elderc0430642013-01-18 12:31:09 -06001381 rbd_assert(start ^ !!rbd_dev->watch_event);
1382 rbd_assert(start ^ !!rbd_dev->watch_request);
1383
Alex Elder907703d2012-11-13 21:11:15 -06001384 if (start) {
1385 struct ceph_osd_client *osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001386
Alex Elder907703d2012-11-13 21:11:15 -06001387 osdc = &rbd_dev->rbd_client->client->osdc;
1388 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1389 &rbd_dev->watch_event);
1390 if (ret < 0)
Alex Elder5efea492012-11-19 22:55:21 -06001391 return ret;
Alex Elder907703d2012-11-13 21:11:15 -06001392 }
1393
Alex Elder5efea492012-11-19 22:55:21 -06001394 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1395 rbd_dev->watch_event->cookie,
1396 rbd_dev->header.obj_version, start);
1397 if (op)
1398 ret = rbd_req_sync_op(rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001399 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Alex Elder907703d2012-11-13 21:11:15 -06001400 op, rbd_dev->header_name,
Alex Elder8b84de72012-11-20 14:17:17 -06001401 0, 0, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001402
Alex Elder5efea492012-11-19 22:55:21 -06001403 /* Cancel the event if we're tearing down, or on error */
1404
1405 if (!start || !op || ret < 0) {
Alex Elder907703d2012-11-13 21:11:15 -06001406 ceph_osdc_cancel_event(rbd_dev->watch_event);
1407 rbd_dev->watch_event = NULL;
1408 }
Alex Elder5efea492012-11-19 22:55:21 -06001409 rbd_osd_req_op_destroy(op);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001410
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001411 return ret;
1412}
1413
/*
 * Synchronous osd object method call (CEPH_OSD_OP_CALL).  "outbound"
 * carries the method's input data; the method's result (up to
 * inbound_size bytes) is copied into "inbound".
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
			       object_name, 0, inbound_size, inbound,
			       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1452
/*
 * Completion handling for a READ op.  A nonexistent object (-ENOENT)
 * reads as all zeroes; a short but otherwise successful read has its
 * unread tail zero-filled and reports the full requested length.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	u64 xferred;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	xferred = le64_to_cpu(op->extent.length);
	rbd_assert(xferred < (u64) UINT_MAX);
	if (obj_request->result == (s32) -ENOENT) {
		/* Object doesn't exist: treat as a successful read of zeroes */
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
	} else if (xferred < obj_request->length && !obj_request->result) {
		/* Short read: zero everything past what was transferred */
		zero_bio_chain(obj_request->bio_list, xferred);
		xferred = obj_request->length;
	}
	obj_request->xferred = xferred;
	atomic_set(&obj_request->done, 1);
}
1474
/* Completion handling for a WRITE op: record bytes written, mark done */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
1481
/*
 * Common osd request completion callback.  Decodes the reply header,
 * dispatches to the per-opcode handler, and completes the object
 * request if the handler marked it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/* Requests within an image request have "which" set; others don't */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	/* Requests are built with exactly one op (see rbd_osd_req_create) */
	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1524
/*
 * Allocate and initialize a single-op OSD request for the given object
 * request.  For writes the image's snapshot context is used; for reads
 * the mapped snapshot id.  Returns NULL on allocation failure.
 *
 * The contents of *op are copied by ceph_osdc_build_request(), so the
 * caller may destroy the op after this returns.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	/* Attach the request's data, depending on how it is carried */
	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1601
/* Drop our reference to an OSD request (freed when the last ref goes) */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1606
1607/* object_name is assumed to be a non-null pointer and NUL-terminated */
1608
1609static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1610 u64 offset, u64 length,
1611 enum obj_request_type type)
1612{
1613 struct rbd_obj_request *obj_request;
1614 size_t size;
1615 char *name;
1616
1617 rbd_assert(obj_request_type_valid(type));
1618
1619 size = strlen(object_name) + 1;
1620 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1621 if (!obj_request)
1622 return NULL;
1623
1624 name = (char *)(obj_request + 1);
1625 obj_request->object_name = memcpy(name, object_name, size);
1626 obj_request->offset = offset;
1627 obj_request->length = length;
1628 obj_request->which = BAD_WHICH;
1629 obj_request->type = type;
1630 INIT_LIST_HEAD(&obj_request->links);
1631 atomic_set(&obj_request->done, 0);
Alex Elder788e2df2013-01-17 12:25:27 -06001632 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001633 kref_init(&obj_request->kref);
1634
1635 return obj_request;
1636}
1637
/*
 * kref release callback for an object request.  The request must
 * already have been removed from any image request.  Releases the
 * OSD request and any attached data (bio chain or page vector);
 * the name is freed with the structure (single allocation).
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
1667
1668/*
1669 * Caller is responsible for filling in the list of object requests
1670 * that comprises the image request, and the Linux request pointer
1671 * (if there is one).
1672 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	/* Writes need the snapshot context; grab a ref under the rwsem */
	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	/* kmalloc'ed, so every field used below must be set explicitly */
	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->write_request = write_request;
	if (write_request)
		img_request->snapc = snapc;
	else
		img_request->snap_id = rbd_dev->spec->snap_id;
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	return img_request;
}
1715
/*
 * kref release callback for an image request.  Drops every object
 * request still attached, releases the snapshot context (writes
 * only), and frees the structure.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1732
/*
 * Split an image request's bio chain into per-object requests, one
 * for each rbd object (segment) the byte range touches, and attach
 * them to the image request.  Returns 0 on success or -ENOMEM, in
 * which case any partially built object requests are torn down.
 */
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	unsigned int bio_offset;
	u64 image_offset;
	u64 resid;
	u16 opcode;

	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
					      : CEPH_OSD_OP_READ;
	bio_offset = 0;
	image_offset = img_request->offset;
	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	while (resid) {
		const char *object_name;
		unsigned int clone_size;
		struct ceph_osd_req_op *op;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, image_offset);
		if (!object_name)
			goto out_unwind;
		/* offset/length of this piece within its rbd object */
		offset = rbd_segment_offset(rbd_dev, image_offset);
		length = rbd_segment_length(rbd_dev, image_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		/*
		 * Build up the op to use in building the osd
		 * request.  Note that the contents of the op are
		 * copied by rbd_osd_req_create().
		 */
		op = rbd_osd_req_op_create(opcode, offset, length);
		if (!op)
			goto out_partial;
		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
						img_request->write_request,
						obj_request, op);
		rbd_osd_req_op_destroy(op);
		if (!obj_request->osd_req)
			goto out_partial;
		/* status and version are initially zero-filled */

		rbd_img_obj_request_add(img_request, obj_request);

		image_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	/* Current request was never added to the image request */
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
1809
/*
 * Per-object completion callback for an image request.  Object
 * requests may finish out of order; block-layer completion must be
 * reported in order, so nothing is handed to blk_end_request() until
 * all requests before this one ("next_completion") are done too.
 * Whichever completion arrives when a contiguous run is ready pushes
 * the whole run.  Completion state is serialized by completion_lock.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		/* more is false once the whole blk request is finished */
		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1856
/*
 * Submit every object request belonging to an image request to the
 * OSD client.  Returns 0, or the first submission error (requests
 * already submitted stay in flight).
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1880
Alex Elder9969ebc2013-01-18 12:31:10 -06001881/*
1882 * Request sync osd watch/unwatch. The value of "start" determines
1883 * whether a watch request is being initiated or torn down.
1884 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	/* Starting requires no existing watch; stopping requires one */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	/* Watch is a write-flagged request even though it carries no data */
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start) {
		/* Linger so the osd client re-sends the watch on map changes */
		rbd_dev->watch_request = obj_request->osd_req;
		ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request);
	}
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;

	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	if (start)
		goto done;	/* Done if setting up the watch request */
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
done:
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1946
/*
 * Block-layer request function.  Entered with q->queue_lock held;
 * the lock is dropped while each fetched request is turned into an
 * image request and submitted, and reacquired before fetching the
 * next one.  Errors detected here end the whole request immediately;
 * otherwise completion happens asynchronously via the OSD callbacks.
 */
static void rbd_request_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/* Quit early if the snapshot has disappeared */

		if (!atomic_read(&rbd_dev->exists)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		/* Guard against offset + length wrapping past U64_MAX */
		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
2016
/*
 * A queue callback.  Makes sure that we don't create a bio that spans
 * multiple osd objects.  One exception is a single-page bio, which we
 * handle later at bio_chain_clone_range().
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;		/* bytes of bvec allowed into this bio */

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2062
/*
 * Tear down the gendisk and its request queue for an rbd device.
 * Safe to call when no disk was ever set up.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
2076
Alex Elder788e2df2013-01-17 12:25:27 -06002077static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2078 const char *object_name,
2079 u64 offset, u64 length,
2080 char *buf, u64 *version)
2081
2082{
2083 struct ceph_osd_req_op *op;
2084 struct rbd_obj_request *obj_request;
2085 struct ceph_osd_client *osdc;
2086 struct page **pages = NULL;
2087 u32 page_count;
2088 int ret;
2089
2090 page_count = (u32) calc_pages_for(offset, length);
2091 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2092 if (IS_ERR(pages))
2093 ret = PTR_ERR(pages);
2094
2095 ret = -ENOMEM;
2096 obj_request = rbd_obj_request_create(object_name, offset, length,
2097 OBJ_REQUEST_PAGES);
2098 if (!obj_request)
2099 goto out;
2100
2101 obj_request->pages = pages;
2102 obj_request->page_count = page_count;
2103
2104 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2105 if (!op)
2106 goto out;
2107 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2108 obj_request, op);
2109 rbd_osd_req_op_destroy(op);
2110 if (!obj_request->osd_req)
2111 goto out;
2112
2113 osdc = &rbd_dev->rbd_client->client->osdc;
2114 ret = rbd_obj_request_submit(osdc, obj_request);
2115 if (ret)
2116 goto out;
2117 ret = rbd_obj_request_wait(obj_request);
2118 if (ret)
2119 goto out;
2120
2121 ret = obj_request->result;
2122 if (ret < 0)
2123 goto out;
2124 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2125 if (version)
2126 *version = obj_request->version;
2127out:
2128 if (obj_request)
2129 rbd_obj_request_put(obj_request);
2130 else
2131 ceph_release_page_vector(pages, page_count);
2132
2133 return ret;
2134}
2135
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002136/*
Alex Elder4156d992012-08-02 11:29:46 -05002137 * Read the complete header for the given rbd device.
2138 *
2139 * Returns a pointer to a dynamically-allocated buffer containing
2140 * the complete and validated header. Caller can pass the address
2141 * of a variable that will be filled in with the version of the
2142 * header object at the time it was read.
2143 *
2144 * Returns a pointer-coded errno if a failure occurs.
2145 */
2146static struct rbd_image_header_ondisk *
2147rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2148{
2149 struct rbd_image_header_ondisk *ondisk = NULL;
2150 u32 snap_count = 0;
2151 u64 names_size = 0;
2152 u32 want_count;
2153 int ret;
2154
2155 /*
2156 * The complete header will include an array of its 64-bit
2157 * snapshot ids, followed by the names of those snapshots as
2158 * a contiguous block of NUL-terminated strings. Note that
2159 * the number of snapshots could change by the time we read
2160 * it in, in which case we re-read it.
2161 */
2162 do {
2163 size_t size;
2164
2165 kfree(ondisk);
2166
2167 size = sizeof (*ondisk);
2168 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2169 size += names_size;
2170 ondisk = kmalloc(size, GFP_KERNEL);
2171 if (!ondisk)
2172 return ERR_PTR(-ENOMEM);
2173
Alex Elder788e2df2013-01-17 12:25:27 -06002174 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002175 0, size,
2176 (char *) ondisk, version);
2177
2178 if (ret < 0)
2179 goto out_err;
2180 if (WARN_ON((size_t) ret < size)) {
2181 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002182 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2183 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002184 goto out_err;
2185 }
2186 if (!rbd_dev_ondisk_valid(ondisk)) {
2187 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002188 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002189 goto out_err;
2190 }
2191
2192 names_size = le64_to_cpu(ondisk->snap_names_len);
2193 want_count = snap_count;
2194 snap_count = le32_to_cpu(ondisk->snap_count);
2195 } while (snap_count != want_count);
2196
2197 return ondisk;
2198
2199out_err:
2200 kfree(ondisk);
2201
2202 return ERR_PTR(ret);
2203}
2204
/*
 * Reload the on-disk header into the in-memory header structure.
 */
2208static int rbd_read_header(struct rbd_device *rbd_dev,
2209 struct rbd_image_header *header)
2210{
Alex Elder4156d992012-08-02 11:29:46 -05002211 struct rbd_image_header_ondisk *ondisk;
2212 u64 ver = 0;
2213 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002214
Alex Elder4156d992012-08-02 11:29:46 -05002215 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2216 if (IS_ERR(ondisk))
2217 return PTR_ERR(ondisk);
2218 ret = rbd_header_from_disk(header, ondisk);
2219 if (ret >= 0)
2220 header->obj_version = ver;
2221 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002222
Alex Elder4156d992012-08-02 11:29:46 -05002223 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002224}
2225
Alex Elder41f38c22012-10-25 23:34:40 -05002226static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002227{
2228 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002229 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002230
Alex Eldera0593292012-07-19 09:09:27 -05002231 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002232 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002233}
2234
Alex Elder94785542012-10-09 13:50:17 -07002235static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2236{
2237 sector_t size;
2238
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002239 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002240 return;
2241
2242 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2243 dout("setting size to %llu sectors", (unsigned long long) size);
2244 rbd_dev->mapping.size = (u64) size;
2245 set_capacity(rbd_dev->disk, size);
2246}
2247
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002248/*
2249 * only read the first part of the ondisk header, without the snaps info
2250 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* Swap the freshly read header in under the write lock */
	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
2291
Alex Elder117973f2012-08-31 17:29:55 -05002292static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002293{
2294 int ret;
2295
Alex Elder117973f2012-08-31 17:29:55 -05002296 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002297 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002298 if (rbd_dev->image_format == 1)
2299 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2300 else
2301 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002302 mutex_unlock(&ctl_mutex);
2303
2304 return ret;
2305}
2306
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device.  The disk is not added (add_disk) here.  Returns 0 or
 * -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning rbd objects (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2354
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002355/*
2356 sysfs
2357*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002358
/* Map a sysfs struct device back to its enclosing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2363
/* sysfs: show the mapped size in bytes (capacity sampled under rwsem) */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
2376
Alex Elder34b13182012-07-13 20:35:12 -05002377/*
2378 * Note this shows the features for whatever's mapped, which is not
2379 * necessarily the base image.
2380 */
2381static ssize_t rbd_features_show(struct device *dev,
2382 struct device_attribute *attr, char *buf)
2383{
2384 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2385
2386 return sprintf(buf, "0x%016llx\n",
2387 (unsigned long long) rbd_dev->mapping.features);
2388}
2389
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002390static ssize_t rbd_major_show(struct device *dev,
2391 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002392{
Alex Elder593a9e72012-02-07 12:03:37 -06002393 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002394
2395 return sprintf(buf, "%d\n", rbd_dev->major);
2396}
2397
2398static ssize_t rbd_client_id_show(struct device *dev,
2399 struct device_attribute *attr, char *buf)
2400{
Alex Elder593a9e72012-02-07 12:03:37 -06002401 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002402
Alex Elder1dbb4392012-01-24 10:08:37 -06002403 return sprintf(buf, "client%lld\n",
2404 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002405}
2406
2407static ssize_t rbd_pool_show(struct device *dev,
2408 struct device_attribute *attr, char *buf)
2409{
Alex Elder593a9e72012-02-07 12:03:37 -06002410 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002411
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002412 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002413}
2414
Alex Elder9bb2f332012-07-12 10:46:35 -05002415static ssize_t rbd_pool_id_show(struct device *dev,
2416 struct device_attribute *attr, char *buf)
2417{
2418 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2419
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002420 return sprintf(buf, "%llu\n",
2421 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002422}
2423
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002424static ssize_t rbd_name_show(struct device *dev,
2425 struct device_attribute *attr, char *buf)
2426{
Alex Elder593a9e72012-02-07 12:03:37 -06002427 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002428
Alex Eldera92ffdf2012-10-30 19:40:33 -05002429 if (rbd_dev->spec->image_name)
2430 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2431
2432 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002433}
2434
Alex Elder589d30e2012-07-10 20:30:11 -05002435static ssize_t rbd_image_id_show(struct device *dev,
2436 struct device_attribute *attr, char *buf)
2437{
2438 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2439
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002440 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002441}
2442
Alex Elder34b13182012-07-13 20:35:12 -05002443/*
2444 * Shows the name of the currently-mapped snapshot (or
2445 * RBD_SNAP_HEAD_NAME for the base image).
2446 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002447static ssize_t rbd_snap_show(struct device *dev,
2448 struct device_attribute *attr,
2449 char *buf)
2450{
Alex Elder593a9e72012-02-07 12:03:37 -06002451 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002452
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002453 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002454}
2455
Alex Elder86b00e02012-10-25 23:34:42 -05002456/*
2457 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2458 * for the parent image. If there is no parent, simply shows
2459 * "(no parent image)".
2460 */
2461static ssize_t rbd_parent_show(struct device *dev,
2462 struct device_attribute *attr,
2463 char *buf)
2464{
2465 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2466 struct rbd_spec *spec = rbd_dev->parent_spec;
2467 int count;
2468 char *bufp = buf;
2469
2470 if (!spec)
2471 return sprintf(buf, "(no parent image)\n");
2472
2473 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2474 (unsigned long long) spec->pool_id, spec->pool_name);
2475 if (count < 0)
2476 return count;
2477 bufp += count;
2478
2479 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2480 spec->image_name ? spec->image_name : "(unknown)");
2481 if (count < 0)
2482 return count;
2483 bufp += count;
2484
2485 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2486 (unsigned long long) spec->snap_id, spec->snap_name);
2487 if (count < 0)
2488 return count;
2489 bufp += count;
2490
2491 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2492 if (count < 0)
2493 return count;
2494 bufp += count;
2495
2496 return (ssize_t) (bufp - buf);
2497}
2498
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002499static ssize_t rbd_image_refresh(struct device *dev,
2500 struct device_attribute *attr,
2501 const char *buf,
2502 size_t size)
2503{
Alex Elder593a9e72012-02-07 12:03:37 -06002504 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002505 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002506
Alex Elder117973f2012-08-31 17:29:55 -05002507 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002508
2509 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002510}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002511
/*
 * Per-device sysfs attributes.  All are read-only except
 * "refresh", which is write-only and triggers a header re-read.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002523
/* NULL-terminated array of the attributes declared above */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list, installed via rbd_device_type below */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2547
/*
 * Intentionally-empty release.  NOTE(review): the rbd code appears
 * to manage the lifetime of the underlying object itself rather
 * than freeing it from the device core — confirm against the
 * device setup/teardown paths.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2557
2558
2559/*
2560 sysfs - snapshots
2561*/
2562
2563static ssize_t rbd_snap_size_show(struct device *dev,
2564 struct device_attribute *attr,
2565 char *buf)
2566{
2567 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2568
Josh Durgin35915382011-12-05 18:25:13 -08002569 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002570}
2571
2572static ssize_t rbd_snap_id_show(struct device *dev,
2573 struct device_attribute *attr,
2574 char *buf)
2575{
2576 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2577
Josh Durgin35915382011-12-05 18:25:13 -08002578 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002579}
2580
Alex Elder34b13182012-07-13 20:35:12 -05002581static ssize_t rbd_snap_features_show(struct device *dev,
2582 struct device_attribute *attr,
2583 char *buf)
2584{
2585 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2586
2587 return sprintf(buf, "0x%016llx\n",
2588 (unsigned long long) snap->features);
2589}
2590
/* Read-only sysfs attributes for each snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002594
/* NULL-terminated array of the snapshot attributes declared above */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2605
2606static void rbd_snap_dev_release(struct device *dev)
2607{
2608 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2609 kfree(snap->name);
2610 kfree(snap);
2611}
2612
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Snapshot device type; release frees the containing rbd_snap */
static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2622
Alex Elder8b8fb992012-10-26 17:25:24 -05002623static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2624{
2625 kref_get(&spec->kref);
2626
2627 return spec;
2628}
2629
2630static void rbd_spec_free(struct kref *kref);
2631static void rbd_spec_put(struct rbd_spec *spec)
2632{
2633 if (spec)
2634 kref_put(&spec->kref, rbd_spec_free);
2635}
2636
2637static struct rbd_spec *rbd_spec_alloc(void)
2638{
2639 struct rbd_spec *spec;
2640
2641 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2642 if (!spec)
2643 return NULL;
2644 kref_init(&spec->kref);
2645
2646 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2647
2648 return spec;
2649}
2650
2651static void rbd_spec_free(struct kref *kref)
2652{
2653 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2654
2655 kfree(spec->pool_name);
2656 kfree(spec->image_id);
2657 kfree(spec->image_name);
2658 kfree(spec->snap_name);
2659 kfree(spec);
2660}
2661
Alex Elderc53d5892012-10-25 23:34:42 -05002662struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2663 struct rbd_spec *spec)
2664{
2665 struct rbd_device *rbd_dev;
2666
2667 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2668 if (!rbd_dev)
2669 return NULL;
2670
2671 spin_lock_init(&rbd_dev->lock);
Alex Elderd78b6502012-11-09 08:43:15 -06002672 atomic_set(&rbd_dev->exists, 0);
Alex Elderc53d5892012-10-25 23:34:42 -05002673 INIT_LIST_HEAD(&rbd_dev->node);
2674 INIT_LIST_HEAD(&rbd_dev->snaps);
2675 init_rwsem(&rbd_dev->header_rwsem);
2676
2677 rbd_dev->spec = spec;
2678 rbd_dev->rbd_client = rbdc;
2679
Alex Elder0903e872012-11-14 12:25:19 -06002680 /* Initialize the layout used for all rbd requests */
2681
2682 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2683 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2684 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2685 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2686
Alex Elderc53d5892012-10-25 23:34:42 -05002687 return rbd_dev;
2688}
2689
2690static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2691{
Alex Elder86b00e02012-10-25 23:34:42 -05002692 rbd_spec_put(rbd_dev->parent_spec);
Alex Elderc53d5892012-10-25 23:34:42 -05002693 kfree(rbd_dev->header_name);
2694 rbd_put_client(rbd_dev->rbd_client);
2695 rbd_spec_put(rbd_dev->spec);
2696 kfree(rbd_dev);
2697}
2698
Alex Elder304f6802012-08-31 17:29:52 -05002699static bool rbd_snap_registered(struct rbd_snap *snap)
2700{
2701 bool ret = snap->dev.type == &rbd_snap_device_type;
2702 bool reg = device_is_registered(&snap->dev);
2703
2704 rbd_assert(!ret ^ reg);
2705
2706 return ret;
2707}
2708
Alex Elder41f38c22012-10-25 23:34:40 -05002709static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002710{
2711 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002712 if (device_is_registered(&snap->dev))
2713 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002714}
2715
Alex Elder14e70852012-07-19 09:09:27 -05002716static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002717 struct device *parent)
2718{
2719 struct device *dev = &snap->dev;
2720 int ret;
2721
2722 dev->type = &rbd_snap_device_type;
2723 dev->parent = parent;
2724 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002725 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002726 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2727
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002728 ret = device_register(dev);
2729
2730 return ret;
2731}
2732
Alex Elder4e891e02012-07-10 20:30:10 -05002733static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002734 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002735 u64 snap_id, u64 snap_size,
2736 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002737{
Alex Elder4e891e02012-07-10 20:30:10 -05002738 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002739 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002740
2741 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002742 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002743 return ERR_PTR(-ENOMEM);
2744
2745 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002746 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002747 if (!snap->name)
2748 goto err;
2749
Alex Elderc8d18422012-07-10 20:30:11 -05002750 snap->id = snap_id;
2751 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002752 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002753
2754 return snap;
2755
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002756err:
2757 kfree(snap->name);
2758 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002759
2760 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002761}
2762
Alex Eldercd892122012-07-03 16:01:19 -05002763static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2764 u64 *snap_size, u64 *snap_features)
2765{
2766 char *snap_name;
2767
2768 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2769
2770 *snap_size = rbd_dev->header.snap_sizes[which];
2771 *snap_features = 0; /* No features for v1 */
2772
2773 /* Skip over names until we find the one we are looking for */
2774
2775 snap_name = rbd_dev->header.snap_names;
2776 while (which--)
2777 snap_name += strlen(snap_name) + 1;
2778
2779 return snap_name;
2780}
2781
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues the "get_size" class method against the image header
 * object and decodes the reply.  Returns 0 on success or a
 * negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Reply layout on the wire: order byte followed by le64 size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2814
/* Fetch the size and object order of the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2821
/*
 * Fetch a v2 image's object name prefix via the
 * "get_object_prefix" class method and record it in
 * rbd_dev->header.object_prefix (an allocated string owned by the
 * header).  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string in the reply */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2858
/*
 * Get the feature bits for the given snapshot (or for the base
 * image if snap_id is CEPH_NOSNAP) via the "get_features" class
 * method.  Returns -ENXIO if the image advertises incompatible
 * feature bits outside RBD_FEATURES_ALL, since we cannot safely
 * use such an image.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images requiring features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2892
/* Fetch the feature bits of the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2898
/*
 * Query the "get_parent" class method to learn whether the mapped
 * v2 image is a clone.  On success, records the parent spec and
 * overlap in rbd_dev (ownership of parent_spec transfers to
 * rbd_dev); if there is no parent, leaves rbd_dev untouched and
 * returns 0.  Returns a negative errno on failure.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case reply size for the four encoded fields below */
	size = sizeof (__le64) +			/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +			/* snap_id */
		sizeof (__le64);			/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;	/* any decode failure below means a short reply */
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
2967
/*
 * Look up a v2 image's name given its image id, by asking the
 * RBD_DIRECTORY object's "dir_get_name" class method.  Returns a
 * newly-allocated name string the caller must free, or NULL on
 * any failure (callers treat a missing name as non-fatal).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the length-prefixed image id used as the request payload */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure is tolerated; return NULL */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
3016
3017/*
3018 * When a parent image gets probed, we only have the pool, image,
3019 * and snapshot ids but not the names of any of them. This call
3020 * is made later to fill in those names. It has to be done after
3021 * rbd_dev_snaps_update() has completed because some of the
3022 * information (in particular, snapshot name) is not available
3023 * until then.
3024 */
3025static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3026{
3027 struct ceph_osd_client *osdc;
3028 const char *name;
3029 void *reply_buf = NULL;
3030 int ret;
3031
3032 if (rbd_dev->spec->pool_name)
3033 return 0; /* Already have the names */
3034
3035 /* Look up the pool name */
3036
3037 osdc = &rbd_dev->rbd_client->client->osdc;
3038 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003039 if (!name) {
3040 rbd_warn(rbd_dev, "there is no pool with id %llu",
3041 rbd_dev->spec->pool_id); /* Really a BUG() */
3042 return -EIO;
3043 }
Alex Elder9e15b772012-10-30 19:40:33 -05003044
3045 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3046 if (!rbd_dev->spec->pool_name)
3047 return -ENOMEM;
3048
3049 /* Fetch the image name; tolerate failure here */
3050
3051 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003052 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003053 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003054 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003055 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003056
3057 /* Look up the snapshot name. */
3058
3059 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3060 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003061 rbd_warn(rbd_dev, "no snapshot with id %llu",
3062 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003063 ret = -EIO;
3064 goto out_err;
3065 }
3066 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3067 if(!rbd_dev->spec->snap_name)
3068 goto out_err;
3069
3070 return 0;
3071out_err:
3072 kfree(reply_buf);
3073 kfree(rbd_dev->spec->pool_name);
3074 rbd_dev->spec->pool_name = NULL;
3075
3076 return ret;
3077}
3078
Alex Elder6e14b1a2012-07-03 16:01:19 -05003079static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003080{
3081 size_t size;
3082 int ret;
3083 void *reply_buf;
3084 void *p;
3085 void *end;
3086 u64 seq;
3087 u32 snap_count;
3088 struct ceph_snap_context *snapc;
3089 u32 i;
3090
3091 /*
3092 * We'll need room for the seq value (maximum snapshot id),
3093 * snapshot count, and array of that many snapshot ids.
3094 * For now we have a fixed upper limit on the number we're
3095 * prepared to receive.
3096 */
3097 size = sizeof (__le64) + sizeof (__le32) +
3098 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3099 reply_buf = kzalloc(size, GFP_KERNEL);
3100 if (!reply_buf)
3101 return -ENOMEM;
3102
3103 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
3104 "rbd", "get_snapcontext",
3105 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003106 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05003107 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3108 if (ret < 0)
3109 goto out;
3110
3111 ret = -ERANGE;
3112 p = reply_buf;
3113 end = (char *) reply_buf + size;
3114 ceph_decode_64_safe(&p, end, seq, out);
3115 ceph_decode_32_safe(&p, end, snap_count, out);
3116
3117 /*
3118 * Make sure the reported number of snapshot ids wouldn't go
3119 * beyond the end of our buffer. But before checking that,
3120 * make sure the computed size of the snapshot context we
3121 * allocate is representable in a size_t.
3122 */
3123 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3124 / sizeof (u64)) {
3125 ret = -EINVAL;
3126 goto out;
3127 }
3128 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3129 goto out;
3130
3131 size = sizeof (struct ceph_snap_context) +
3132 snap_count * sizeof (snapc->snaps[0]);
3133 snapc = kmalloc(size, GFP_KERNEL);
3134 if (!snapc) {
3135 ret = -ENOMEM;
3136 goto out;
3137 }
3138
3139 atomic_set(&snapc->nref, 1);
3140 snapc->seq = seq;
3141 snapc->num_snaps = snap_count;
3142 for (i = 0; i < snap_count; i++)
3143 snapc->snaps[i] = ceph_decode_64(&p);
3144
3145 rbd_dev->header.snapc = snapc;
3146
3147 dout(" snap context seq = %llu, snap_count = %u\n",
3148 (unsigned long long) seq, (unsigned int) snap_count);
3149
3150out:
3151 kfree(reply_buf);
3152
3153 return 0;
3154}
3155
/*
 * Fetch the name of snapshot "which" (an index into the current
 * snapshot context) via the "get_snapshot_name" class method.
 * Returns a newly-allocated name the caller must free, or an
 * ERR_PTR on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	/* Map the index to a snapshot id, encoded little-endian */
	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
3198
3199static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3200 u64 *snap_size, u64 *snap_features)
3201{
Alex Eldere0b49862013-01-09 14:44:18 -06003202 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003203 u8 order;
3204 int ret;
3205
3206 snap_id = rbd_dev->header.snapc->snaps[which];
3207 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3208 if (ret)
3209 return ERR_PTR(ret);
3210 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3211 if (ret)
3212 return ERR_PTR(ret);
3213
3214 return rbd_dev_v2_snap_name(rbd_dev, which);
3215}
3216
3217static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3218 u64 *snap_size, u64 *snap_features)
3219{
3220 if (rbd_dev->image_format == 1)
3221 return rbd_dev_v1_snap_info(rbd_dev, which,
3222 snap_size, snap_features);
3223 if (rbd_dev->image_format == 2)
3224 return rbd_dev_v2_snap_info(rbd_dev, which,
3225 snap_size, snap_features);
3226 return ERR_PTR(-EINVAL);
3227}
3228
Alex Elder117973f2012-08-31 17:29:55 -05003229static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3230{
3231 int ret;
3232 __u8 obj_order;
3233
3234 down_write(&rbd_dev->header_rwsem);
3235
3236 /* Grab old order first, to see if it changes */
3237
3238 obj_order = rbd_dev->header.obj_order,
3239 ret = rbd_dev_v2_image_size(rbd_dev);
3240 if (ret)
3241 goto out;
3242 if (rbd_dev->header.obj_order != obj_order) {
3243 ret = -EIO;
3244 goto out;
3245 }
3246 rbd_update_mapping_size(rbd_dev);
3247
3248 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3249 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3250 if (ret)
3251 goto out;
3252 ret = rbd_dev_snaps_update(rbd_dev);
3253 dout("rbd_dev_snaps_update returned %d\n", ret);
3254 if (ret)
3255 goto out;
3256 ret = rbd_dev_snaps_register(rbd_dev);
3257 dout("rbd_dev_snaps_register returned %d\n", ret);
3258out:
3259 up_write(&rbd_dev->header_rwsem);
3260
3261 return ret;
3262}
3263
Alex Elder9d475de2012-07-03 16:01:19 -05003264/*
Alex Elder35938152012-08-02 11:29:46 -05003265 * Scan the rbd device's current snapshot list and compare it to the
3266 * newly-received snapshot context. Remove any existing snapshots
3267 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
3269 * And verify there are no changes to snapshots we already know
3270 * about.
3271 *
3272 * Assumes the snapshots in the snapshot context are sorted by
3273 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3274 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003275 */
Alex Elder304f6802012-08-31 17:29:52 -05003276static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003277{
Alex Elder35938152012-08-02 11:29:46 -05003278 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3279 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05003280 struct list_head *head = &rbd_dev->snaps;
3281 struct list_head *links = head->next;
3282 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003283
Alex Elder9fcbb802012-08-23 23:48:49 -05003284 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05003285 while (index < snap_count || links != head) {
3286 u64 snap_id;
3287 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05003288 char *snap_name;
3289 u64 snap_size = 0;
3290 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003291
Alex Elder35938152012-08-02 11:29:46 -05003292 snap_id = index < snap_count ? snapc->snaps[index]
3293 : CEPH_NOSNAP;
3294 snap = links != head ? list_entry(links, struct rbd_snap, node)
3295 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05003296 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003297
Alex Elder35938152012-08-02 11:29:46 -05003298 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3299 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003300
Alex Elder35938152012-08-02 11:29:46 -05003301 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003302
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003303 if (rbd_dev->spec->snap_id == snap->id)
Alex Elderd78b6502012-11-09 08:43:15 -06003304 atomic_set(&rbd_dev->exists, 0);
Alex Elder41f38c22012-10-25 23:34:40 -05003305 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05003306 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003307 rbd_dev->spec->snap_id == snap->id ?
3308 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05003309 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003310
Alex Elder35938152012-08-02 11:29:46 -05003311 /* Done with this list entry; advance */
3312
3313 links = next;
3314 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003315 }
Alex Elder35938152012-08-02 11:29:46 -05003316
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003317 snap_name = rbd_dev_snap_info(rbd_dev, index,
3318 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05003319 if (IS_ERR(snap_name))
3320 return PTR_ERR(snap_name);
3321
Alex Elder9fcbb802012-08-23 23:48:49 -05003322 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3323 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05003324 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3325 struct rbd_snap *new_snap;
3326
3327 /* We haven't seen this snapshot before */
3328
Alex Elderc8d18422012-07-10 20:30:11 -05003329 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05003330 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05003331 if (IS_ERR(new_snap)) {
3332 int err = PTR_ERR(new_snap);
3333
3334 dout(" failed to add dev, error %d\n", err);
3335
3336 return err;
3337 }
Alex Elder35938152012-08-02 11:29:46 -05003338
3339 /* New goes before existing, or at end of list */
3340
Alex Elder9fcbb802012-08-23 23:48:49 -05003341 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05003342 if (snap)
3343 list_add_tail(&new_snap->node, &snap->node);
3344 else
Alex Elder523f3252012-08-30 00:16:37 -05003345 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05003346 } else {
3347 /* Already have this one */
3348
Alex Elder9fcbb802012-08-23 23:48:49 -05003349 dout(" already present\n");
3350
Alex Eldercd892122012-07-03 16:01:19 -05003351 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05003352 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05003353 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05003354
3355 /* Done with this list entry; advance */
3356
3357 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003358 }
Alex Elder35938152012-08-02 11:29:46 -05003359
3360 /* Advance to the next entry in the snapshot context */
3361
3362 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003363 }
Alex Elder9fcbb802012-08-23 23:48:49 -05003364 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003365
3366 return 0;
3367}
3368
Alex Elder304f6802012-08-31 17:29:52 -05003369/*
3370 * Scan the list of snapshots and register the devices for any that
3371 * have not already been registered.
3372 */
3373static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3374{
3375 struct rbd_snap *snap;
3376 int ret = 0;
3377
3378 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003379 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3380 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003381
3382 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3383 if (!rbd_snap_registered(snap)) {
3384 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3385 if (ret < 0)
3386 break;
3387 }
3388 }
3389 dout("%s: returning %d\n", __func__, ret);
3390
3391 return ret;
3392}
3393
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003394static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3395{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003396 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05003397 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003398
3399 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003400
Alex Eldercd789ab2012-08-30 00:16:38 -05003401 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003402 dev->bus = &rbd_bus_type;
3403 dev->type = &rbd_device_type;
3404 dev->parent = &rbd_root_dev;
3405 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05003406 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003407 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003408
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003409 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05003410
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003411 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003412}
3413
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003414static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3415{
3416 device_unregister(&rbd_dev->dev);
3417}
3418
Alex Eldere2839302012-08-29 17:11:06 -05003419static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003420
3421/*
Alex Elder499afd52012-02-02 08:13:29 -06003422 * Get a unique rbd identifier for the given new rbd_dev, and add
3423 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003424 */
Alex Eldere2839302012-08-29 17:11:06 -05003425static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06003426{
Alex Eldere2839302012-08-29 17:11:06 -05003427 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06003428
3429 spin_lock(&rbd_dev_list_lock);
3430 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3431 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05003432 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3433 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06003434}
Alex Elderb7f23c32012-01-29 13:57:43 -06003435
Alex Elder1ddbe942012-01-29 13:57:44 -06003436/*
Alex Elder499afd52012-02-02 08:13:29 -06003437 * Remove an rbd_dev from the global list, and record that its
3438 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003439 */
Alex Eldere2839302012-08-29 17:11:06 -05003440static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06003441{
Alex Elderd184f6b2012-01-29 13:57:44 -06003442 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05003443 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003444 int max_id;
3445
Alex Elderaafb2302012-09-06 16:00:54 -05003446 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06003447
Alex Eldere2839302012-08-29 17:11:06 -05003448 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3449 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06003450 spin_lock(&rbd_dev_list_lock);
3451 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06003452
3453 /*
3454 * If the id being "put" is not the current maximum, there
3455 * is nothing special we need to do.
3456 */
Alex Eldere2839302012-08-29 17:11:06 -05003457 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06003458 spin_unlock(&rbd_dev_list_lock);
3459 return;
3460 }
3461
3462 /*
3463 * We need to update the current maximum id. Search the
3464 * list to find out what it is. We're more likely to find
3465 * the maximum at the end, so search the list backward.
3466 */
3467 max_id = 0;
3468 list_for_each_prev(tmp, &rbd_dev_list) {
3469 struct rbd_device *rbd_dev;
3470
3471 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07003472 if (rbd_dev->dev_id > max_id)
3473 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003474 }
Alex Elder499afd52012-02-02 08:13:29 -06003475 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06003476
Alex Elder1ddbe942012-01-29 13:57:44 -06003477 /*
Alex Eldere2839302012-08-29 17:11:06 -05003478 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06003479 * which case it now accurately reflects the new maximum.
3480 * Be careful not to overwrite the maximum value in that
3481 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06003482 */
Alex Eldere2839302012-08-29 17:11:06 -05003483 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3484 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06003485}
3486
Alex Eldera725f65e2012-02-02 08:13:30 -06003487/*
Alex Eldere28fff262012-02-02 08:13:30 -06003488 * Skips over white space at *buf, and updates *buf to point to the
3489 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003490 * the token (string of non-white space characters) found. Note
3491 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003492 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	size_t skipped;

	skipped = strspn(*buf, spaces);	/* leading white space */
	*buf += skipped;		/* *buf now at token start */

	return strcspn(*buf, spaces);	/* length of the token */
}
3505
3506/*
3507 * Finds the next token in *buf, and if the provided token buffer is
3508 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003509 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3510 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003511 *
3512 * Returns the length of the token found (not including the '\0').
3513 * Return value will be 0 if no token is found, and it will be >=
3514 * token_size if the token would not fit.
3515 *
Alex Elder593a9e72012-02-07 12:03:37 -06003516 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003517 * found token. Note that this occurs even if the token buffer is
3518 * too small to hold it.
3519 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy (and terminate) only if the caller's buffer is big enough */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
3535
3536/*
Alex Elderea3352f2012-07-09 21:04:23 -05003537 * Finds the next token in *buf, dynamically allocates a buffer big
3538 * enough to hold a copy of it, and copies the token into the new
3539 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3540 * that a duplicate buffer is created even for a zero-length token.
3541 *
3542 * Returns a pointer to the newly-allocated duplicate, or a null
3543 * pointer if memory for the duplicate was not available. If
3544 * the lenp argument is a non-null pointer, the length of the token
3545 * (not including the '\0') is returned in *lenp.
3546 *
3547 * If successful, the *buf pointer will be updated to point beyond
3548 * the end of the found token.
3549 *
3550 * Note: uses GFP_KERNEL for allocation.
3551 */
3552static inline char *dup_token(const char **buf, size_t *lenp)
3553{
3554 char *dup;
3555 size_t len;
3556
3557 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003558 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003559 if (!dup)
3560 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003561 *(dup + len) = '\0';
3562 *buf += len;
3563
3564 if (lenp)
3565 *lenp = len;
3566
3567 return dup;
3568}
3569
3570/*
Alex Elder859c31d2012-10-25 23:34:42 -05003571 * Parse the options provided for an "rbd add" (i.e., rbd image
3572 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3573 * and the data written is passed here via a NUL-terminated buffer.
3574 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003575 *
Alex Elder859c31d2012-10-25 23:34:42 -05003576 * The information extracted from these options is recorded in
3577 * the other parameters which return dynamically-allocated
3578 * structures:
3579 * ceph_opts
3580 * The address of a pointer that will refer to a ceph options
3581 * structure. Caller must release the returned pointer using
3582 * ceph_destroy_options() when it is no longer needed.
3583 * rbd_opts
3584 * Address of an rbd options pointer. Fully initialized by
3585 * this function; caller must release with kfree().
3586 * spec
3587 * Address of an rbd image specification pointer. Fully
3588 * initialized by this function based on parsed options.
3589 * Caller must release with rbd_spec_put().
3590 *
3591 * The options passed take this form:
3592 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3593 * where:
3594 * <mon_addrs>
3595 * A comma-separated list of one or more monitor addresses.
3596 * A monitor address is an ip address, optionally followed
3597 * by a port number (separated by a colon).
3598 * I.e.: ip1[:port1][,ip2[:port2]...]
3599 * <options>
3600 * A comma-separated list of ceph and/or rbd options.
3601 * <pool_name>
3602 * The name of the rados pool containing the rbd image.
3603 * <image_name>
3604 * The name of the image in that pool to map.
3605 * <snap_id>
3606 * An optional snapshot id. If provided, the mapping will
3607 * present data from the image at the time that snapshot was
3608 * created. The image head is used if no snapshot id is
3609 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003610 */
Alex Elder859c31d2012-10-25 23:34:42 -05003611static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05003612 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05003613 struct rbd_options **opts,
3614 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06003615{
Alex Elderd22f76e2012-07-12 10:46:35 -05003616 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05003617 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05003618 const char *mon_addrs;
3619 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05003620 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003621 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05003622 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05003623 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06003624
3625 /* The first four tokens are required */
3626
Alex Elder7ef32142012-02-02 08:13:30 -06003627 len = next_token(&buf);
Alex Elder4fb5d672012-11-01 10:17:15 -05003628 if (!len) {
3629 rbd_warn(NULL, "no monitor address(es) provided");
3630 return -EINVAL;
3631 }
Alex Elder0ddebc02012-10-25 23:34:41 -05003632 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05003633 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06003634 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06003635
Alex Elderdc79b112012-10-25 23:34:41 -05003636 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05003637 options = dup_token(&buf, NULL);
3638 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05003639 return -ENOMEM;
Alex Elder4fb5d672012-11-01 10:17:15 -05003640 if (!*options) {
3641 rbd_warn(NULL, "no options provided");
3642 goto out_err;
3643 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003644
Alex Elder859c31d2012-10-25 23:34:42 -05003645 spec = rbd_spec_alloc();
3646 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05003647 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003648
3649 spec->pool_name = dup_token(&buf, NULL);
3650 if (!spec->pool_name)
3651 goto out_mem;
Alex Elder4fb5d672012-11-01 10:17:15 -05003652 if (!*spec->pool_name) {
3653 rbd_warn(NULL, "no pool name provided");
3654 goto out_err;
3655 }
Alex Eldere28fff262012-02-02 08:13:30 -06003656
Alex Elder69e7a022012-11-01 08:39:26 -05003657 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05003658 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003659 goto out_mem;
Alex Elder4fb5d672012-11-01 10:17:15 -05003660 if (!*spec->image_name) {
3661 rbd_warn(NULL, "no image name provided");
3662 goto out_err;
3663 }
Alex Eldere28fff262012-02-02 08:13:30 -06003664
Alex Elderf28e5652012-10-25 23:34:41 -05003665 /*
3666 * Snapshot name is optional; default is to use "-"
3667 * (indicating the head/no snapshot).
3668 */
Alex Elder3feeb8942012-08-31 17:29:52 -05003669 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05003670 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05003671 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3672 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05003673 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05003674 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05003675 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05003676 }
Alex Elder4caf35f2012-11-01 08:39:27 -05003677 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
Alex Elder859c31d2012-10-25 23:34:42 -05003678 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003679 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003680 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05003681
Alex Elder0ddebc02012-10-25 23:34:41 -05003682 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06003683
Alex Elder4e9afeb2012-10-25 23:34:41 -05003684 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3685 if (!rbd_opts)
3686 goto out_mem;
3687
3688 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05003689
Alex Elder859c31d2012-10-25 23:34:42 -05003690 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05003691 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05003692 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05003693 if (IS_ERR(copts)) {
3694 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05003695 goto out_err;
3696 }
Alex Elder859c31d2012-10-25 23:34:42 -05003697 kfree(options);
3698
3699 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003700 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05003701 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05003702
Alex Elderdc79b112012-10-25 23:34:41 -05003703 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05003704out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05003705 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05003706out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05003707 kfree(rbd_opts);
3708 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05003709 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05003710
Alex Elderdc79b112012-10-25 23:34:41 -05003711 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06003712}
3713
Alex Elder589d30e2012-07-10 20:30:11 -05003714/*
3715 * An rbd format 2 image has a unique identifier, distinct from the
3716 * name given to it by the user. Internally, that identifier is
3717 * what's used to specify the names of objects related to the image.
3718 *
3719 * A special "rbd id" object is used to map an rbd image name to its
3720 * id. If that object doesn't exist, then there is no v2 rbd image
3721 * with the supplied name.
3722 *
3723 * This function will record the given rbd_dev's image_id field if
3724 * it can be determined, and in that case will return 0. If any
3725 * errors occur a negative errno will be returned and the rbd_dev's
3726 * image_id field will be unchanged (and should be NULL).
3727 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the '\0', so this covers the full name */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a freshly allocated copy */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
				p + RBD_IMAGE_ID_LEN_MAX,
				NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;	/* leave field unchanged on error */
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3789
Alex Eldera30b71b2012-07-10 20:30:11 -05003790static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3791{
3792 int ret;
3793 size_t size;
3794
3795 /* Version 1 images have no id; empty string is used */
3796
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003797 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3798 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003799 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05003800
3801 /* Record the header object name for this rbd image. */
3802
Alex Elder69e7a022012-11-01 08:39:26 -05003803 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003804 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3805 if (!rbd_dev->header_name) {
3806 ret = -ENOMEM;
3807 goto out_err;
3808 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003809 sprintf(rbd_dev->header_name, "%s%s",
3810 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003811
3812 /* Populate rbd image metadata */
3813
3814 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3815 if (ret < 0)
3816 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05003817
3818 /* Version 1 images have no parent (no layering) */
3819
3820 rbd_dev->parent_spec = NULL;
3821 rbd_dev->parent_overlap = 0;
3822
Alex Eldera30b71b2012-07-10 20:30:11 -05003823 rbd_dev->image_format = 1;
3824
3825 dout("discovered version 1 image, header name is %s\n",
3826 rbd_dev->header_name);
3827
3828 return 0;
3829
3830out_err:
3831 kfree(rbd_dev->header_name);
3832 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003833 kfree(rbd_dev->spec->image_id);
3834 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003835
3836 return ret;
3837}
3838
/*
 * Finish probing an rbd format 2 image: build the header object name
 * from the (already-known) image id, then fetch the image metadata --
 * size/order, object prefix, features, parent info (if layered), and
 * the snapshot context.  Returns 0 on success or a negative errno;
 * on error all fields set here are released and reset.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Release everything acquired above, in reverse order */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3911
Alex Elder83a06262012-10-30 15:47:17 -05003912static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3913{
3914 int ret;
3915
3916 /* no need to lock here, as rbd_dev is not registered yet */
3917 ret = rbd_dev_snaps_update(rbd_dev);
3918 if (ret)
3919 return ret;
3920
Alex Elder9e15b772012-10-30 19:40:33 -05003921 ret = rbd_dev_probe_update_spec(rbd_dev);
3922 if (ret)
3923 goto err_out_snaps;
3924
Alex Elder83a06262012-10-30 15:47:17 -05003925 ret = rbd_dev_set_mapping(rbd_dev);
3926 if (ret)
3927 goto err_out_snaps;
3928
3929 /* generate unique id: find highest unique id, add one */
3930 rbd_dev_id_get(rbd_dev);
3931
3932 /* Fill in the device name, now that we have its id. */
3933 BUILD_BUG_ON(DEV_NAME_LEN
3934 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3935 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3936
3937 /* Get our block major device number. */
3938
3939 ret = register_blkdev(0, rbd_dev->name);
3940 if (ret < 0)
3941 goto err_out_id;
3942 rbd_dev->major = ret;
3943
3944 /* Set up the blkdev mapping. */
3945
3946 ret = rbd_init_disk(rbd_dev);
3947 if (ret)
3948 goto err_out_blkdev;
3949
3950 ret = rbd_bus_add_dev(rbd_dev);
3951 if (ret)
3952 goto err_out_disk;
3953
3954 /*
3955 * At this point cleanup in the event of an error is the job
3956 * of the sysfs code (initiated by rbd_bus_del_dev()).
3957 */
3958 down_write(&rbd_dev->header_rwsem);
3959 ret = rbd_dev_snaps_register(rbd_dev);
3960 up_write(&rbd_dev->header_rwsem);
3961 if (ret)
3962 goto err_out_bus;
3963
Alex Elder9969ebc2013-01-18 12:31:10 -06003964 (void) rbd_req_sync_watch; /* avoid a warning */
3965 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
Alex Elder83a06262012-10-30 15:47:17 -05003966 if (ret)
3967 goto err_out_bus;
3968
3969 /* Everything's ready. Announce the disk to the world. */
3970
3971 add_disk(rbd_dev->disk);
3972
3973 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3974 (unsigned long long) rbd_dev->mapping.size);
3975
3976 return ret;
3977err_out_bus:
3978 /* this will also clean up rest of rbd_dev stuff */
3979
3980 rbd_bus_del_dev(rbd_dev);
3981
3982 return ret;
3983err_out_disk:
3984 rbd_free_disk(rbd_dev);
3985err_out_blkdev:
3986 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3987err_out_id:
3988 rbd_dev_id_put(rbd_dev);
3989err_out_snaps:
3990 rbd_remove_all_snaps(rbd_dev);
3991
3992 return ret;
3993}
3994
Alex Eldera30b71b2012-07-10 20:30:11 -05003995/*
3996 * Probe for the existence of the header object for the given rbd
3997 * device. For format 2 images this includes determining the image
3998 * id.
3999 */
4000static int rbd_dev_probe(struct rbd_device *rbd_dev)
4001{
4002 int ret;
4003
4004 /*
4005 * Get the id from the image id object. If it's not a
4006 * format 2 image, we'll get ENOENT back, and we'll assume
4007 * it's a format 1 image.
4008 */
4009 ret = rbd_dev_image_id(rbd_dev);
4010 if (ret)
4011 ret = rbd_dev_v1_probe(rbd_dev);
4012 else
4013 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004014 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05004015 dout("probe failed, returning %d\n", ret);
4016
Alex Elder83a06262012-10-30 15:47:17 -05004017 return ret;
4018 }
4019
4020 ret = rbd_dev_probe_finish(rbd_dev);
4021 if (ret)
4022 rbd_header_free(&rbd_dev->header);
4023
Alex Eldera30b71b2012-07-10 20:30:11 -05004024 return ret;
4025}
4026
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004027static ssize_t rbd_add(struct bus_type *bus,
4028 const char *buf,
4029 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004030{
Alex Eldercb8627c2012-07-09 21:04:23 -05004031 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004032 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004033 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004034 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004035 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004036 struct ceph_osd_client *osdc;
4037 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004038
4039 if (!try_module_get(THIS_MODULE))
4040 return -ENODEV;
4041
Alex Eldera725f65e2012-02-02 08:13:30 -06004042 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004043 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004044 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004045 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004046
Alex Elder9d3997f2012-10-25 23:34:42 -05004047 rbdc = rbd_get_client(ceph_opts);
4048 if (IS_ERR(rbdc)) {
4049 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004050 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004051 }
Alex Elderc53d5892012-10-25 23:34:42 -05004052 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004053
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004054 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004055 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004056 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004057 if (rc < 0)
4058 goto err_out_client;
Alex Elder859c31d2012-10-25 23:34:42 -05004059 spec->pool_id = (u64) rc;
4060
Alex Elder0903e872012-11-14 12:25:19 -06004061 /* The ceph file layout needs to fit pool id in 32 bits */
4062
4063 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4064 rc = -EIO;
4065 goto err_out_client;
4066 }
4067
Alex Elderc53d5892012-10-25 23:34:42 -05004068 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004069 if (!rbd_dev)
4070 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004071 rbdc = NULL; /* rbd_dev now owns this */
4072 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004073
Alex Elderbd4ba652012-10-25 23:34:42 -05004074 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004075 kfree(rbd_opts);
4076 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004077
Alex Eldera30b71b2012-07-10 20:30:11 -05004078 rc = rbd_dev_probe(rbd_dev);
4079 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004080 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004081
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004082 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05004083err_out_rbd_dev:
4084 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004085err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004086 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004087err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004088 if (ceph_opts)
4089 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004090 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004091 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004092err_out_module:
4093 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004094
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004095 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004096
4097 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004098}
4099
Alex Elderde71a292012-07-03 16:01:19 -05004100static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004101{
4102 struct list_head *tmp;
4103 struct rbd_device *rbd_dev;
4104
Alex Eldere124a822012-01-29 13:57:44 -06004105 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004106 list_for_each(tmp, &rbd_dev_list) {
4107 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004108 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06004109 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004110 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06004111 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004112 }
Alex Eldere124a822012-01-29 13:57:44 -06004113 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004114 return NULL;
4115}
4116
/*
 * Device-model release callback for an rbd device.
 *
 * Runs when the embedded struct device's last reference is dropped
 * (via rbd_bus_del_dev() on the remove path).  Teardown order matters:
 * the header watch is stopped first so no notify callbacks arrive while
 * the disk and header are being freed.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request registered with the OSDs */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* 0 arg presumably means "tear down" the header watch — see
	 * rbd_dev_header_watch_sync() for the flag's meaning */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref (taken in rbd_add()) */
	module_put(THIS_MODULE);
}
4145
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004146static ssize_t rbd_remove(struct bus_type *bus,
4147 const char *buf,
4148 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004149{
4150 struct rbd_device *rbd_dev = NULL;
4151 int target_id, rc;
4152 unsigned long ul;
4153 int ret = count;
4154
4155 rc = strict_strtoul(buf, 10, &ul);
4156 if (rc)
4157 return rc;
4158
4159 /* convert to int; abort if we lost anything in the conversion */
4160 target_id = (int) ul;
4161 if (target_id != ul)
4162 return -EINVAL;
4163
4164 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4165
4166 rbd_dev = __rbd_get_dev(target_id);
4167 if (!rbd_dev) {
4168 ret = -ENOENT;
4169 goto done;
4170 }
4171
Alex Elder42382b72012-11-16 09:29:16 -06004172 if (rbd_dev->open_count) {
4173 ret = -EBUSY;
4174 goto done;
4175 }
4176
Alex Elder41f38c22012-10-25 23:34:40 -05004177 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004178 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004179
4180done:
4181 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004182
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004183 return ret;
4184}
4185
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004186/*
4187 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004188 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004189 */
4190static int rbd_sysfs_init(void)
4191{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004192 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004193
Alex Elderfed4c142012-02-07 12:03:36 -06004194 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004195 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004196 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004197
Alex Elderfed4c142012-02-07 12:03:36 -06004198 ret = bus_register(&rbd_bus_type);
4199 if (ret < 0)
4200 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004201
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004202 return ret;
4203}
4204
/*
 * Undo rbd_sysfs_init(): unregister the bus first, then the root
 * device it hangs off (reverse of registration order).
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4210
4211int __init rbd_init(void)
4212{
4213 int rc;
4214
4215 rc = rbd_sysfs_init();
4216 if (rc)
4217 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004218 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004219 return 0;
4220}
4221
/*
 * Module exit: remove the sysfs control files.  Per-device state is
 * torn down via rbd_dev_release() when devices are unregistered.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4226
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");