blob: daa0f18f70894bb489d567a11395fcd8f2ce52b9 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elder2647ba32012-11-19 22:55:21 -060055/* It might be useful to have these defined elsewhere */
Alex Elderdf111be2012-08-09 10:33:26 -070056
Alex Elder2647ba32012-11-19 22:55:21 -060057#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
Alex Elderf0f8cef2012-01-29 13:57:44 -060062#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
Alex Elderd4b125e2012-07-03 16:01:19 -050067#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
Alex Elder35d489f2012-07-03 16:01:19 -050071#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
73#define RBD_SNAP_HEAD_NAME "-"
74
Alex Elder9e15b772012-10-30 19:40:33 -050075/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050078
Alex Elder1e130192012-07-03 16:01:19 -050079#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050080
Alex Elderd8891402012-10-09 13:50:17 -070081/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
Alex Elder81a89792012-02-02 08:13:30 -060089/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060096#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for per-object names */
	u64 features;		/* RBD_FEATURE_* bit mask */
	__u8 obj_order;		/* log2 of the object size */
	__u8 crypt_type;	/* v1 on-disk crypto type */
	__u8 comp_type;		/* v1 on-disk compression type */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot context */
	char *snap_names;	/* NUL-separated snapshot names */
	u64 *snap_sizes;	/* sizes, parallel to snapc->snaps[] */

	u64 obj_version;	/* header object version from the OSD */
};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
/*
 * Identity of an image: (pool_id, image_id, snap_id), plus the
 * looked-up names for each id.  Reference-counted via kref; see the
 * block comment above for sharing between parent and child devices.
 */
struct rbd_spec {
	u64 pool_id;		/* id of the pool holding the image */
	char *pool_name;	/* name corresponding to pool_id */

	char *image_id;		/* unique image id */
	char *image_name;	/* user-visible name (may be NULL) */

	u64 snap_id;		/* CEPH_NOSNAP for the base image */
	char *snap_name;	/* name of the mapped snapshot */

	struct kref kref;	/* reference count */
};
155
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph client */
	struct kref kref;		/* reference count */
	struct list_head node;		/* entry on rbd_client_list */
};
164
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once the request has completed */
	s32 rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};
173
174/*
175 * a collection of requests
176 */
177struct rbd_req_coll {
178 int total;
179 int num_done;
180 struct kref kref;
181 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700182};
183
Alex Elderbf0d5f502012-11-22 00:00:08 -0600184struct rbd_img_request;
185typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
186
187#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
188
189struct rbd_obj_request;
190typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
191
192enum obj_request_type { OBJ_REQUEST_BIO }; /* More types to come */
193
/*
 * State for a request against a single RADOS object, possibly one
 * of several making up an image request.
 */
struct rbd_obj_request {
	const char *object_name;	/* name of the target object */
	u64 offset;		/* object start byte */
	u64 length;		/* bytes from offset */

	struct rbd_img_request *img_request;	/* owning image request */
	struct list_head links;	/* img_request->obj_requests */
	u32 which;		/* posn image request list */

	enum obj_request_type type;	/* how the data is carried */
	struct bio *bio_list;	/* data pages for OBJ_REQUEST_BIO */

	struct ceph_osd_request *osd_req;	/* underlying OSD request */

	u64 xferred;		/* bytes transferred */
	u64 version;		/* object version from the reply */
	s32 result;		/* completion result code */
	atomic_t done;		/* nonzero once completed */

	rbd_obj_callback_t callback;	/* invoked on completion */

	struct kref kref;	/* reference count */
};
217
/*
 * An I/O request against a byte range of the image, fanned out into
 * one or more rbd_obj_request structures (one per affected object).
 */
struct rbd_img_request {
	struct request *rq;		/* originating block-layer request */
	struct rbd_device *rbd_dev;	/* device the request targets */
	u64 offset;	/* starting image byte offset */
	u64 length;	/* byte count from offset */
	bool write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64 snap_id;		/* for reads */
	};
	spinlock_t completion_lock;/* protects next_completion */
	u32 next_completion;	/* index of next object to complete */
	rbd_img_callback_t callback;	/* invoked when request completes */

	u32 obj_request_count;	/* number of entries on obj_requests */
	struct list_head obj_requests;	/* rbd_obj_request structs */

	struct kref kref;	/* reference count */
};
237
238#define for_each_obj_request(ireq, oreq) \
239 list_for_each_entry(oreq, &ireq->obj_requests, links)
240#define for_each_obj_request_from(ireq, oreq) \
241 list_for_each_entry_from(oreq, &ireq->obj_requests, links)
242#define for_each_obj_request_safe(ireq, oreq, n) \
243 list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
244
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* request length, in bytes */
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this belongs to */
};
256
/* In-memory record of one image snapshot, exposed via sysfs */
struct rbd_snap {
	struct device dev;	/* sysfs device for this snapshot */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at snapshot time */
	struct list_head node;	/* entry on rbd_dev->snaps */
	u64 id;			/* snapshot id */
	u64 features;		/* feature mask at snapshot time */
};
265
/* Properties of the currently mapped image or snapshot */
struct rbd_mapping {
	u64 size;	/* size of the mapped view, in bytes */
	u64 features;	/* feature mask of the mapped view */
	bool read_only;	/* whether writes are refused */
};
271
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* ceph client (possibly shared) */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* image metadata */
	/* NOTE(review): presumably tracks whether the mapped image/snap
	 * still exists on the cluster — confirm against users */
	atomic_t exists;
	struct rbd_spec *spec;	/* identity of the mapped image */

	char *header_name;	/* name of the image's header object */

	struct ceph_file_layout layout;	/* object layout for I/O */

	struct ceph_osd_event *watch_event;	/* header watch event */
	struct ceph_osd_request *watch_request;

	struct rbd_spec *parent_spec;	/* non-NULL for layered images */
	u64 parent_overlap;	/* bytes overlapping the parent image */

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* size/features/ro of the mapping */

	struct list_head node;	/* entry on the global rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* opens; protected by ctl_mutex */
};
316
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600318
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700319static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600320static DEFINE_SPINLOCK(rbd_dev_list_lock);
321
Alex Elder432b8582012-01-29 13:57:44 -0600322static LIST_HEAD(rbd_client_list); /* clients */
323static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700324
Alex Elder304f6802012-08-31 17:29:52 -0500325static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
326static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
327
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800328static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500329static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800330
Alex Elderf0f8cef2012-01-29 13:57:44 -0600331static ssize_t rbd_add(struct bus_type *bus, const char *buf,
332 size_t count);
333static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
334 size_t count);
335
/* sysfs bus files /sys/bus/rbd/{add,remove}; writable by root only */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The pseudo bus all rbd devices are registered on */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
346
/* Release callback for rbd_root_dev; nothing to free (static object) */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Root device all rbd devices hang off of in sysfs */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
355
/*
 * rbd_warn() - emit a warning, identifying the device involved as
 * precisely as possible.
 *
 * @rbd_dev may be NULL.  The best available identifier is chosen in
 * order of preference: block device name, image name, image id; if
 * none is known the raw pointer is printed.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
382
#ifdef RBD_DEBUG
/*
 * Kernel-space assertion: log the failing expression and BUG().
 * Wrapped in do { } while (0) so the macro expands to a single
 * statement; the previous bare-if form was unsafe in an unbraced
 * if/else body (dangling-else hazard).
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
					"at line %d:\n\n"		\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800395
Alex Elder117973f2012-08-31 17:29:55 -0500396static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
397static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700398
/*
 * rbd_open() - block device open callback.
 *
 * Refuses write opens of a read-only mapping.  Takes a reference on
 * the rbd device and bumps open_count under ctl_mutex so a
 * concurrent removal cannot tear the device down while it is open.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	/* nested: callers may already hold ctl_mutex */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
414
/*
 * rbd_release() - block device release callback.
 *
 * Drops the open count and the device reference taken in rbd_open(),
 * under the same ctl_mutex protection.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
427
/* Block device operations for /dev/rbd<N> */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
433
434/*
435 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500436 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437 */
Alex Elderf8c38922012-08-10 13:12:07 -0700438static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439{
440 struct rbd_client *rbdc;
441 int ret = -ENOMEM;
442
443 dout("rbd_client_create\n");
444 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
445 if (!rbdc)
446 goto out_opt;
447
448 kref_init(&rbdc->kref);
449 INIT_LIST_HEAD(&rbdc->node);
450
Alex Elderbc534d862012-01-29 13:57:44 -0600451 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
452
Alex Elder43ae4702012-07-03 16:01:18 -0500453 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700454 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600455 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500456 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457
458 ret = ceph_open_session(rbdc->client);
459 if (ret < 0)
460 goto out_err;
461
Alex Elder432b8582012-01-29 13:57:44 -0600462 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600464 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465
Alex Elderbc534d862012-01-29 13:57:44 -0600466 mutex_unlock(&ctl_mutex);
467
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468 dout("rbd_client_create created %p\n", rbdc);
469 return rbdc;
470
471out_err:
472 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600473out_mutex:
474 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700475 kfree(rbdc);
476out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500477 if (ceph_opts)
478 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400479 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700480}
481
482/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700483 * Find a ceph client with specific addr and configuration. If
484 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700485 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700486static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700487{
488 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700489 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700490
Alex Elder43ae4702012-07-03 16:01:18 -0500491 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700492 return NULL;
493
Alex Elder1f7ba332012-08-10 13:12:07 -0700494 spin_lock(&rbd_client_list_lock);
495 list_for_each_entry(client_node, &rbd_client_list, node) {
496 if (!ceph_compare_options(ceph_opts, client_node->client)) {
497 kref_get(&client_node->kref);
498 found = true;
499 break;
500 }
501 }
502 spin_unlock(&rbd_client_list_lock);
503
504 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700505}
506
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument, those
 * between Opt_last_int and Opt_last_string take a string, and those
 * between Opt_last_string and Opt_last_bool are Boolean flags; see
 * parse_rbd_opts_token().
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Options parsed from a mapping request written to /sys/bus/rbd/add */
struct rbd_options {
	bool read_only;		/* map the image read-only */
};

#define RBD_READ_ONLY_DEFAULT	false
537
/*
 * Parse one mount-option token.  @private points to the
 * struct rbd_options being filled in.
 *
 * Returns 0 on success, -EINVAL for an unrecognized token, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* token ranges correspond to the groups in rbd_opts_tokens */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() only returns tokens from the table */
		rbd_assert(false);
		break;
	}
	return 0;
}
578
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, *ceph_opts is consumed: handed to
 * a new client or destroyed when an existing client is reused.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* using an existing client; options no longer needed */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
595
/*
 * Destroy ceph client
 *
 * kref release callback: unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself) and tears it down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
613
614/*
615 * Drop reference to ceph client node. If it's not referenced anymore, release
616 * it.
617 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500618static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619{
Alex Elderc53d5892012-10-25 23:34:42 -0500620 if (rbdc)
621 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622}
623
/*
 * Destroy requests collection
 *
 * kref release callback for struct rbd_req_coll.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700635
Alex Eldera30b71b2012-07-10 20:30:11 -0500636static bool rbd_image_format_valid(u32 image_format)
637{
638 return image_format == 1 || image_format == 2;
639}
640
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * its contents.  Returns false if the header cannot be valid.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
679
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680/*
681 * Create a new header structure, translate header format from the on-disk
682 * header.
683 */
684static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500685 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700686{
Alex Elderccece232012-07-10 20:30:10 -0500687 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500688 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500689 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500690 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700691
Alex Elder6a523252012-07-19 17:12:59 -0500692 memset(header, 0, sizeof (*header));
693
Alex Elder103a1502012-08-02 11:29:45 -0500694 snap_count = le32_to_cpu(ondisk->snap_count);
695
Alex Elder58c17b02012-08-23 23:22:06 -0500696 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
697 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500698 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500700 memcpy(header->object_prefix, ondisk->object_prefix, len);
701 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600702
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500704 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
705
Alex Elder621901d2012-08-23 23:22:06 -0500706 /* Save a copy of the snapshot names */
707
Alex Elderf785cc12012-08-23 23:22:06 -0500708 if (snap_names_len > (u64) SIZE_MAX)
709 return -EIO;
710 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700711 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500712 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500713 /*
714 * Note that rbd_dev_v1_header_read() guarantees
715 * the ondisk buffer we're working with has
716 * snap_names_len bytes beyond the end of the
717 * snapshot id array, this memcpy() is safe.
718 */
719 memcpy(header->snap_names, &ondisk->snaps[snap_count],
720 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500721
Alex Elder621901d2012-08-23 23:22:06 -0500722 /* Record each snapshot's size */
723
Alex Elderd2bb24e2012-07-26 23:37:14 -0500724 size = snap_count * sizeof (*header->snap_sizes);
725 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500727 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500728 for (i = 0; i < snap_count; i++)
729 header->snap_sizes[i] =
730 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700731 } else {
Alex Elderccece232012-07-10 20:30:10 -0500732 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700733 header->snap_names = NULL;
734 header->snap_sizes = NULL;
735 }
Alex Elder849b4262012-07-09 21:04:24 -0500736
Alex Elder34b13182012-07-13 20:35:12 -0500737 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700738 header->obj_order = ondisk->options.order;
739 header->crypt_type = ondisk->options.crypt_type;
740 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500741
Alex Elder621901d2012-08-23 23:22:06 -0500742 /* Allocate and fill in the snapshot context */
743
Alex Elderf84344f2012-08-31 17:29:51 -0500744 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500745 size = sizeof (struct ceph_snap_context);
746 size += snap_count * sizeof (header->snapc->snaps[0]);
747 header->snapc = kzalloc(size, GFP_KERNEL);
748 if (!header->snapc)
749 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700750
751 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500752 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500754 for (i = 0; i < snap_count; i++)
755 header->snapc->snaps[i] =
756 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700757
758 return 0;
759
Alex Elder6a523252012-07-19 17:12:59 -0500760out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500761 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500762 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700763 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500764 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500765 kfree(header->object_prefix);
766 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500767
Alex Elder00f1f362012-02-07 12:03:36 -0600768 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700769}
770
Alex Elder9e15b772012-10-30 19:40:33 -0500771static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
772{
773 struct rbd_snap *snap;
774
775 if (snap_id == CEPH_NOSNAP)
776 return RBD_SNAP_HEAD_NAME;
777
778 list_for_each_entry(snap, &rbd_dev->snaps, node)
779 if (snap_id == snap->id)
780 return snap->name;
781
782 return NULL;
783}
784
Alex Elder8836b992012-08-30 14:42:15 -0500785static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787
Alex Eldere86924a2012-07-10 20:30:11 -0500788 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600789
Alex Eldere86924a2012-07-10 20:30:11 -0500790 list_for_each_entry(snap, &rbd_dev->snaps, node) {
791 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500792 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500793 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500794 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600795
Alex Eldere86924a2012-07-10 20:30:11 -0500796 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600797 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700798 }
Alex Eldere86924a2012-07-10 20:30:11 -0500799
Alex Elder00f1f362012-02-07 12:03:36 -0600800 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700801}
802
Alex Elder819d52b2012-10-25 23:34:41 -0500803static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700804{
Alex Elder78dc4472012-07-19 08:49:18 -0500805 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700806
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500807 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800808 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500809 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500810 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500811 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500812 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700813 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500814 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700815 if (ret < 0)
816 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500817 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700818 }
Alex Elderd78b6502012-11-09 08:43:15 -0600819 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700821 return ret;
822}
823
824static void rbd_header_free(struct rbd_image_header *header)
825{
Alex Elder849b4262012-07-09 21:04:24 -0500826 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500827 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700828 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500829 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500830 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500831 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800832 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500833 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700834}
835
Alex Elder98571b52013-01-20 14:44:42 -0600836static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700837{
Alex Elder65ccfe22012-08-09 10:33:26 -0700838 char *name;
839 u64 segment;
840 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700841
Alex Elder2fd82b92012-11-09 15:05:54 -0600842 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700843 if (!name)
844 return NULL;
845 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600846 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700847 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600848 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700849 pr_err("error formatting segment name for #%llu (%d)\n",
850 segment, ret);
851 kfree(name);
852 name = NULL;
853 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854
Alex Elder65ccfe22012-08-09 10:33:26 -0700855 return name;
856}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700857
Alex Elder65ccfe22012-08-09 10:33:26 -0700858static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
859{
860 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700861
Alex Elder65ccfe22012-08-09 10:33:26 -0700862 return offset & (segment_size - 1);
863}
864
865static u64 rbd_segment_length(struct rbd_device *rbd_dev,
866 u64 offset, u64 length)
867{
868 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
869
870 offset &= segment_size - 1;
871
Alex Elderaafb2302012-09-06 16:00:54 -0500872 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700873 if (offset + length > segment_size)
874 length = segment_size - offset;
875
876 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700877}
878
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700879static int rbd_get_num_segments(struct rbd_image_header *header,
880 u64 ofs, u64 len)
881{
Alex Elderdf111be2012-08-09 10:33:26 -0700882 u64 start_seg;
883 u64 end_seg;
Alex Elder38901e02013-01-10 12:56:58 -0600884 u64 result;
Alex Elderdf111be2012-08-09 10:33:26 -0700885
886 if (!len)
887 return 0;
888 if (len - 1 > U64_MAX - ofs)
889 return -ERANGE;
890
891 start_seg = ofs >> header->obj_order;
892 end_seg = (ofs + len - 1) >> header->obj_order;
893
Alex Elder38901e02013-01-10 12:56:58 -0600894 result = end_seg - start_seg + 1;
895 if (result > (u64) INT_MAX)
896 return -ERANGE;
897
898 return (int) result;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700899}
900
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700901/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700902 * returns the size of an object in the image
903 */
904static u64 rbd_obj_bytes(struct rbd_image_header *header)
905{
906 return 1 << header->obj_order;
907}
908
909/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700910 * bio helpers
911 */
912
913static void bio_chain_put(struct bio *chain)
914{
915 struct bio *tmp;
916
917 while (chain) {
918 tmp = chain;
919 chain = chain->bi_next;
920 bio_put(tmp);
921 }
922}
923
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain and clears to zero
 * the data at or beyond byte "start_ofs", counted from the start of
 * the chain.  Used to zero-fill the tail of a short or failed read.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * Zero from start_ofs within this segment,
				 * or from its beginning if already past it.
				 */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
950
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio, or NULL on allocation failure or if the
 * requested range is empty or extends past the source bio.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset of the range within segment idx */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the number of bytes used in the last segment */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: the range lies entirely within it. */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001031
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* tail link of the chain being built */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Source chain ended before len was satisfied. */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone as much of this bio as the remaining len allows. */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio; advance to the next. */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Release any clones built so far. */
	bio_chain_put(chain);

	return NULL;
}
1094
/* Take a reference on an object request. */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	kref_get(&obj_request->kref);
}
1099
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference on an object request; frees it on the last put. */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1106
/* Take a reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1111
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference on an image request; frees it on the last put. */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1118
/*
 * Attach an object request to an image request.  The image request
 * takes its own reference on the object request, appends it to the
 * image's list, and records its ordinal position in "which".
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	obj_request->which = img_request->obj_request_count++;
	rbd_assert(obj_request->which != BAD_WHICH);
}
1128
/*
 * Detach an object request from its image request: clear its
 * position and back-pointers, unlink it, and drop the reference the
 * image request held (which may free the object request).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);
	obj_request->which = BAD_WHICH;
	list_del(&obj_request->links);
	rbd_assert(obj_request->img_request == img_request);
	obj_request->callback = NULL;
	obj_request->img_request = NULL;
	rbd_obj_request_put(obj_request);
}
1140
1141static bool obj_request_type_valid(enum obj_request_type type)
1142{
1143 switch (type) {
1144 case OBJ_REQUEST_BIO:
1145 return true;
1146 default:
1147 return false;
1148 }
1149}
1150
/*
 * Allocate and initialize an osd request op.  The variadic arguments
 * expected depend on the opcode; the supported calling forms are
 * shown by the per-case comments below.  Returns NULL on allocation
 * failure or for an unsupported opcode.  Free the result with
 * rbd_osd_req_op_destroy().
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only a write carries a data payload. */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);
		op->cls.indata_len = (u32) size;
		/* Payload is class name + method name + input data. */
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		op->watch.ver = cpu_to_le64(op->watch.ver);
		/* WATCH takes a fourth argument: nonzero to register. */
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1213
/* Free an op allocated by rbd_osd_req_op_create(). */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1218
/*
 * Record completion status for entry "index" of a request collection,
 * then report to the block layer, in order, the maximal run of
 * finished entries starting at the first not-yet-reported one.  With
 * no collection the whole block request is completed at once.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   s32 ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, (int)ret, (unsigned long long)len);

	if (!rq)
		return;

	if (!coll) {
		/* Uncollected request: complete it in one shot. */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock protects the collection's status array. */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Extend the run of consecutive completed entries. */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, (int)coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* Each reported entry drops one collection reference. */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1256
/* Complete the collection entry associated with this rbd request. */
static void rbd_coll_end_req(struct rbd_request *rbd_req,
			     s32 ret, u64 len)
{
	rbd_coll_end_req_index(rbd_req->rq,
				rbd_req->coll, rbd_req->coll_index,
				ret, len);
}
1264
/*
 * Send ceph osd request
 *
 * Builds and submits an osd request for "object_name" covering
 * [ofs, ofs + len), carrying either a bio chain or a page vector.
 * When "coll" is non-NULL a struct rbd_request is allocated and
 * attached as r_priv so the callback can report collection progress.
 * With a NULL "rbd_cb" the call is synchronous: it waits for the
 * request, optionally returns the reassert version via "ver", and
 * drops the osd request itself.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct rbd_request *rbd_req = NULL;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* The osd request holds its own reference on the bio. */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	if (coll) {
		/* Track this request so its completion can be collected. */
		ret = -ENOMEM;
		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
		if (!rbd_req)
			goto done_osd_req;

		rbd_req->rq = rq;
		rbd_req->bio = bio;
		rbd_req->pages = pages;
		rbd_req->len = len;
		rbd_req->coll = coll;
		rbd_req->coll_index = coll_index;
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = rbd_req;

	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout; /* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		/* A registering watch request must linger (be resent
		 * across osd map changes); remember it on the device. */
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* No callback: run synchronously. */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	/* Undo the bio_get() taken above. */
	if (bio)
		bio_chain_put(osd_req->r_bio);
	kfree(rbd_req);
done_osd_req:
	ceph_osdc_put_request(osd_req);

	return ret;
}
1363
/*
 * Ceph osd op callback
 *
 * Runs when an asynchronous osd request completes.  Translates the
 * reply into a block-layer completion: a read that hit a hole
 * (-ENOENT) or came up short is zero-filled and treated as success
 * for the missing bytes, then the collection entry is completed and
 * all request resources are released.
 */
static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
{
	struct rbd_request *rbd_req = osd_req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = (s32)le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == (s32)-ENOENT && read_op) {
		/* Object doesn't exist: a read of it is all zeroes. */
		zero_bio_chain(rbd_req->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
		/* Short read: zero the tail and claim the full length. */
		zero_bio_chain(rbd_req->bio, bytes);
		bytes = rbd_req->len;
	}

	rbd_coll_end_req(rbd_req, rc, bytes);

	if (rbd_req->bio)
		bio_chain_put(rbd_req->bio);

	ceph_osdc_put_request(osd_req);
	kfree(rbd_req);
}
1403
/*
 * Minimal completion callback: just drop the osd request reference.
 * Used for requests whose result is not examined (e.g. notify acks).
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1409
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001410/*
1411 * Do a synchronous ceph osd operation
1412 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001413static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001414 int flags,
Alex Elder30573d62012-11-13 21:11:15 -06001415 struct ceph_osd_req_op *op,
Alex Elderaded07e2012-07-03 16:01:18 -05001416 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001417 u64 ofs, u64 inbound_size,
1418 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001419 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001420{
1421 int ret;
1422 struct page **pages;
1423 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001424
Alex Elder30573d62012-11-13 21:11:15 -06001425 rbd_assert(op != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001426
Alex Elderf8d4de62012-07-03 16:01:19 -05001427 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001428 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001429 if (IS_ERR(pages))
1430 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001431
Alex Elder25704ac2012-11-09 08:43:16 -06001432 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elderf8d4de62012-07-03 16:01:19 -05001433 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001434 pages, num_pages,
1435 flags,
Alex Elder30573d62012-11-13 21:11:15 -06001436 op,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001437 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001438 NULL,
Alex Elder8b84de72012-11-20 14:17:17 -06001439 ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001440 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001441 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001442
Alex Elderf8d4de62012-07-03 16:01:19 -05001443 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1444 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001445
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001446done:
1447 ceph_release_page_vector(pages, num_pages);
1448 return ret;
1449}
1450
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues a read or write (chosen from the block request's data
 * direction) for the portion of [ofs, ofs + len) that falls in one
 * object.  On submission failure the collection entry is completed
 * with the error so the block request still makes progress.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	const char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *op;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* Reads carry no snapshot context; they read at snap_id. */
		rbd_assert(!snapc);
		snapid = rbd_dev->spec->snap_id;
	}

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
	if (!op)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     op,
			     coll, coll_index,
			     rbd_req_cb, NULL);
	if (ret < 0)
		/* Submission failed: report this entry's error now. */
		rbd_coll_end_req_index(rq, coll, coll_index,
					(s32)ret, seg_len);
	rbd_osd_req_op_destroy(op);
done:
	kfree(seg_name);
	return ret;
}
1514
/* Hand an object request's osd request to the osd client. */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1520
1521static void rbd_img_request_complete(struct rbd_img_request *img_request)
1522{
1523 if (img_request->callback)
1524 img_request->callback(img_request);
1525 else
1526 rbd_img_request_put(img_request);
1527}
1528
/* Invoke an object request's completion callback, if any. */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	if (obj_request->callback)
		obj_request->callback(obj_request);
}
1534
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001536 * Request sync osd read
1537 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001538static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001539 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001540 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001541 char *buf,
1542 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001543{
Alex Elder139b4312012-11-13 21:11:15 -06001544 struct ceph_osd_req_op *op;
Alex Elder913d2fd2012-06-26 12:57:03 -07001545 int ret;
1546
Alex Elder8d23bf22012-11-19 22:55:21 -06001547 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
Alex Elder139b4312012-11-13 21:11:15 -06001548 if (!op)
Alex Elder913d2fd2012-06-26 12:57:03 -07001549 return -ENOMEM;
1550
Alex Elder25704ac2012-11-09 08:43:16 -06001551 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
Alex Elder8b84de72012-11-20 14:17:17 -06001552 op, object_name, ofs, len, buf, ver);
Alex Elder8d23bf22012-11-19 22:55:21 -06001553 rbd_osd_req_op_destroy(op);
Alex Elder913d2fd2012-06-26 12:57:03 -07001554
1555 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001556}
1557
1558/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001559 * Request sync osd watch
1560 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001561static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001562 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001563 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001564{
Alex Elder139b4312012-11-13 21:11:15 -06001565 struct ceph_osd_req_op *op;
Sage Weil11f77002011-05-12 16:13:54 -07001566 int ret;
1567
Alex Elder5efea492012-11-19 22:55:21 -06001568 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
Alex Elder139b4312012-11-13 21:11:15 -06001569 if (!op)
Alex Elder57cfc102012-06-26 12:57:03 -07001570 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001571
Alex Elder0ce1a792012-07-03 16:01:18 -05001572 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001573 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001574 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001575 CEPH_OSD_FLAG_READ,
Alex Elder30573d62012-11-13 21:11:15 -06001576 op,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001577 NULL, 0,
Alex Elder8b84de72012-11-20 14:17:17 -06001578 rbd_simple_req_cb, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001579
Alex Elder5efea492012-11-19 22:55:21 -06001580 rbd_osd_req_op_destroy(op);
1581
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001582 return ret;
1583}
1584
/*
 * Callback invoked by the osd client when a watch notification
 * arrives for the image header object.  Refreshes the device's
 * cached header, then acknowledges the notification.
 *
 * NOTE(review): if rbd_dev_refresh() fails, hver may be passed to
 * the notify ack below without being set here -- confirm that
 * rbd_dev_refresh() assigns it on all paths.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
        struct rbd_device *rbd_dev = (struct rbd_device *)data;
        u64 hver;
        int rc;

        if (!rbd_dev)
                return;

        dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
                rbd_dev->header_name, (unsigned long long) notify_id,
                (unsigned int) opcode);
        /* Re-read the header; hver receives its object version */
        rc = rbd_dev_refresh(rbd_dev, &hver);
        if (rc)
                rbd_warn(rbd_dev, "got notification but failed to "
                           " update snaps: %d\n", rc);

        /* Ack even after a failed refresh so the osd stops resending */
        rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1604
1605/*
Alex Elder907703d2012-11-13 21:11:15 -06001606 * Request sync osd watch/unwatch. The value of "start" determines
1607 * whether a watch request is being initiated or torn down.
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001608 */
Alex Elder907703d2012-11-13 21:11:15 -06001609static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001610{
Alex Elder5efea492012-11-19 22:55:21 -06001611 struct ceph_osd_req_op *op;
1612 int ret = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001613
Alex Elderc0430642013-01-18 12:31:09 -06001614 rbd_assert(start ^ !!rbd_dev->watch_event);
1615 rbd_assert(start ^ !!rbd_dev->watch_request);
1616
Alex Elder907703d2012-11-13 21:11:15 -06001617 if (start) {
1618 struct ceph_osd_client *osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001619
Alex Elder907703d2012-11-13 21:11:15 -06001620 osdc = &rbd_dev->rbd_client->client->osdc;
1621 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1622 &rbd_dev->watch_event);
1623 if (ret < 0)
Alex Elder5efea492012-11-19 22:55:21 -06001624 return ret;
Alex Elder907703d2012-11-13 21:11:15 -06001625 }
1626
Alex Elder5efea492012-11-19 22:55:21 -06001627 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1628 rbd_dev->watch_event->cookie,
1629 rbd_dev->header.obj_version, start);
1630 if (op)
1631 ret = rbd_req_sync_op(rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001632 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Alex Elder907703d2012-11-13 21:11:15 -06001633 op, rbd_dev->header_name,
Alex Elder8b84de72012-11-20 14:17:17 -06001634 0, 0, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001635
Alex Elder5efea492012-11-19 22:55:21 -06001636 /* Cancel the event if we're tearing down, or on error */
1637
1638 if (!start || !op || ret < 0) {
Alex Elder907703d2012-11-13 21:11:15 -06001639 ceph_osdc_cancel_event(rbd_dev->watch_event);
1640 rbd_dev->watch_event = NULL;
1641 }
Alex Elder5efea492012-11-19 22:55:21 -06001642 rbd_osd_req_op_destroy(op);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001643
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001644 return ret;
1645}
1646
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001647/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001648 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001649 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001650static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001651 const char *object_name,
1652 const char *class_name,
1653 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001654 const char *outbound,
1655 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001656 char *inbound,
1657 size_t inbound_size,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001658 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001659{
Alex Elder139b4312012-11-13 21:11:15 -06001660 struct ceph_osd_req_op *op;
Alex Elder57cfc102012-06-26 12:57:03 -07001661 int ret;
1662
Alex Elder3cb4a682012-06-26 12:57:03 -07001663 /*
1664 * Any input parameters required by the method we're calling
1665 * will be sent along with the class and method names as
1666 * part of the message payload. That data and its size are
1667 * supplied via the indata and indata_len fields (named from
1668 * the perspective of the server side) in the OSD request
1669 * operation.
1670 */
Alex Elder2647ba32012-11-19 22:55:21 -06001671 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1672 method_name, outbound, outbound_size);
Alex Elder139b4312012-11-13 21:11:15 -06001673 if (!op)
Alex Elder57cfc102012-06-26 12:57:03 -07001674 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001675
Alex Elder30573d62012-11-13 21:11:15 -06001676 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
Alex Elderf8d4de62012-07-03 16:01:19 -05001677 object_name, 0, inbound_size, inbound,
Alex Elder8b84de72012-11-20 14:17:17 -06001678 ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001679
Alex Elder2647ba32012-11-19 22:55:21 -06001680 rbd_osd_req_op_destroy(op);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001681
1682 dout("cls_exec returned %d\n", ret);
1683 return ret;
1684}
1685
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001686static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1687{
1688 struct rbd_req_coll *coll =
1689 kzalloc(sizeof(struct rbd_req_coll) +
1690 sizeof(struct rbd_req_status) * num_reqs,
1691 GFP_ATOMIC);
1692
1693 if (!coll)
1694 return NULL;
1695 coll->total = num_reqs;
1696 kref_init(&coll->kref);
1697 return coll;
1698}
1699
/*
 * Split a block request into per-object-segment osd requests and
 * submit them.  A shared rbd_req_coll tracks per-segment status so
 * the block request completes only when every segment has.
 *
 * Returns 0 once all segments have been dispatched (individual
 * segment failures are reported through the collection), or a
 * negative errno if nothing could be submitted at all.
 */
static int rbd_dev_do_request(struct request *rq,
                                struct rbd_device *rbd_dev,
                                struct ceph_snap_context *snapc,
                                u64 ofs, unsigned int size,
                                struct bio *bio_chain)
{
        int num_segs;
        struct rbd_req_coll *coll;
        unsigned int bio_offset;
        int cur_seg = 0;

        dout("%s 0x%x bytes at 0x%llx\n",
                rq_data_dir(rq) == WRITE ? "write" : "read",
                size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

        num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
        if (num_segs <= 0)
                return num_segs;

        coll = rbd_alloc_coll(num_segs);
        if (!coll)
                return -ENOMEM;

        bio_offset = 0;
        do {
                /* Bytes from ofs to the end of the enclosing object */
                u64 limit = rbd_segment_length(rbd_dev, ofs, size);
                unsigned int clone_size;
                struct bio *bio_clone;

                BUG_ON(limit > (u64)UINT_MAX);
                clone_size = (unsigned int)limit;
                dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);

                /* Each in-flight segment holds a reference on the coll */
                kref_get(&coll->kref);

                /* Pass a cloned bio chain via an osd request */

                bio_clone = bio_chain_clone_range(&bio_chain,
                                        &bio_offset, clone_size,
                                        GFP_ATOMIC);
                if (bio_clone)
                        (void)rbd_do_op(rq, rbd_dev, snapc,
                                        ofs, clone_size,
                                        bio_clone, coll, cur_seg);
                else
                        /* Record the failure but keep going on the rest */
                        rbd_coll_end_req_index(rq, coll, cur_seg,
                                               (s32)-ENOMEM,
                                               clone_size);
                size -= clone_size;
                ofs += clone_size;

                cur_seg++;
        } while (size > 0);
        /* Drop the allocation reference; segments hold the rest */
        kref_put(&coll->kref, rbd_coll_release);

        return 0;
}
1757
/*
 * Completion handling for an osd READ op.  A nonexistent object
 * (-ENOENT) reads back as all zeroes; a short read has its tail
 * zero-filled, and in both cases the request is treated as a full
 * successful transfer so the block layer sees complete data.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
                                struct ceph_osd_op *op)
{
        u64 xferred;

        /*
         * We support a 64-bit length, but ultimately it has to be
         * passed to blk_end_request(), which takes an unsigned int.
         */
        xferred = le64_to_cpu(op->extent.length);
        rbd_assert(xferred < (u64) UINT_MAX);
        if (obj_request->result == (s32) -ENOENT) {
                /* Object doesn't exist: read as zeroes, report success */
                zero_bio_chain(obj_request->bio_list, 0);
                obj_request->result = 0;
        } else if (xferred < obj_request->length && !obj_request->result) {
                /* Short read: zero the unread tail, claim full length */
                zero_bio_chain(obj_request->bio_list, xferred);
                xferred = obj_request->length;
        }
        obj_request->xferred = xferred;
        atomic_set(&obj_request->done, 1);
}
1779
1780static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
1781 struct ceph_osd_op *op)
1782{
1783 obj_request->xferred = le64_to_cpu(op->extent.length);
1784 atomic_set(&obj_request->done, 1);
1785}
1786
/*
 * osd client completion callback for rbd object requests.  Decodes
 * the reply header out of @msg, records result/version/xferred on
 * the object request, dispatches to the per-opcode handler, and --
 * if the handler marked the request done -- completes it.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
                                struct ceph_msg *msg)
{
        struct rbd_obj_request *obj_request = osd_req->r_priv;
        struct ceph_osd_reply_head *reply_head;
        struct ceph_osd_op *op;
        u32 num_ops;
        u16 opcode;

        rbd_assert(osd_req == obj_request->osd_req);
        /* Part of an image request iff it has a valid "which" slot */
        rbd_assert(!!obj_request->img_request ^
                                (obj_request->which == BAD_WHICH));

        obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
        reply_head = msg->front.iov_base;
        obj_request->result = (s32) le32_to_cpu(reply_head->result);
        obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

        num_ops = le32_to_cpu(reply_head->num_ops);
        WARN_ON(num_ops != 1);  /* For now */

        op = &reply_head->ops[0];
        opcode = le16_to_cpu(op->op);
        switch (opcode) {
        case CEPH_OSD_OP_READ:
                rbd_osd_read_callback(obj_request, op);
                break;
        case CEPH_OSD_OP_WRITE:
                rbd_osd_write_callback(obj_request, op);
                break;
        default:
                rbd_warn(NULL, "%s: unsupported op %hu\n",
                        obj_request->object_name, (unsigned short) opcode);
                break;
        }

        /* Handlers set "done"; unsupported ops never complete here */
        if (atomic_read(&obj_request->done))
                rbd_obj_request_complete(obj_request);
}
1826
/*
 * Allocate and initialize a single-op ceph osd request for the
 * given object request.  For writes, the image request's snapshot
 * context and the current time (mtime) are attached; for reads,
 * the snapshot id is used and offset/length are zeroed since the
 * osd client doesn't use them for reads.
 *
 * Returns NULL on allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_device *rbd_dev,
                                        bool write_request,
                                        struct rbd_obj_request *obj_request,
                                        struct ceph_osd_req_op *op)
{
        struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_snap_context *snapc = NULL;
        struct ceph_osd_client *osdc;
        struct ceph_osd_request *osd_req;
        struct timespec now;
        struct timespec *mtime;
        u64 snap_id = CEPH_NOSNAP;
        u64 offset = obj_request->offset;
        u64 length = obj_request->length;

        if (img_request) {
                rbd_assert(img_request->write_request == write_request);
                if (img_request->write_request)
                        snapc = img_request->snapc;
                else
                        snap_id = img_request->snap_id;
        }

        /* Allocate and initialize the request, for the single op */

        osdc = &rbd_dev->rbd_client->client->osdc;
        osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
        if (!osd_req)
                return NULL;    /* ENOMEM */

        rbd_assert(obj_request_type_valid(obj_request->type));
        switch (obj_request->type) {
        case OBJ_REQUEST_BIO:
                rbd_assert(obj_request->bio_list != NULL);
                osd_req->r_bio = obj_request->bio_list;
                bio_get(osd_req->r_bio);
                /* osd client requires "num pages" even for bio */
                osd_req->r_num_pages = calc_pages_for(offset, length);
                break;
        }

        if (write_request) {
                osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
                now = CURRENT_TIME;
                mtime = &now;
        } else {
                osd_req->r_flags = CEPH_OSD_FLAG_READ;
                mtime = NULL;   /* not needed for reads */
                offset = 0;     /* These are not used... */
                length = 0;     /* ...for osd read requests */
        }

        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;

        osd_req->r_oid_len = strlen(obj_request->object_name);
        rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
        memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

        osd_req->r_file_layout = rbd_dev->layout; /* struct */

        /* osd_req will get its own reference to snapc (if non-null) */

        ceph_osdc_build_request(osd_req, offset, length, 1, op,
                                snapc, snap_id, mtime);

        return osd_req;
}
1896
/* Drop our reference to an osd request (freed when the last goes) */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
        ceph_osdc_put_request(osd_req);
}
1901
1902/* object_name is assumed to be a non-null pointer and NUL-terminated */
1903
1904static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1905 u64 offset, u64 length,
1906 enum obj_request_type type)
1907{
1908 struct rbd_obj_request *obj_request;
1909 size_t size;
1910 char *name;
1911
1912 rbd_assert(obj_request_type_valid(type));
1913
1914 size = strlen(object_name) + 1;
1915 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1916 if (!obj_request)
1917 return NULL;
1918
1919 name = (char *)(obj_request + 1);
1920 obj_request->object_name = memcpy(name, object_name, size);
1921 obj_request->offset = offset;
1922 obj_request->length = length;
1923 obj_request->which = BAD_WHICH;
1924 obj_request->type = type;
1925 INIT_LIST_HEAD(&obj_request->links);
1926 atomic_set(&obj_request->done, 0);
1927 kref_init(&obj_request->kref);
1928
1929 return obj_request;
1930}
1931
1932static void rbd_obj_request_destroy(struct kref *kref)
1933{
1934 struct rbd_obj_request *obj_request;
1935
1936 obj_request = container_of(kref, struct rbd_obj_request, kref);
1937
1938 rbd_assert(obj_request->img_request == NULL);
1939 rbd_assert(obj_request->which == BAD_WHICH);
1940
1941 if (obj_request->osd_req)
1942 rbd_osd_req_destroy(obj_request->osd_req);
1943
1944 rbd_assert(obj_request_type_valid(obj_request->type));
1945 switch (obj_request->type) {
1946 case OBJ_REQUEST_BIO:
1947 if (obj_request->bio_list)
1948 bio_chain_put(obj_request->bio_list);
1949 break;
1950 }
1951
1952 kfree(obj_request);
1953}
1954
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 *
 * For writes, a reference to the device's current snapshot context
 * is taken under header_rwsem and held by the image request; for
 * reads, only the mapped snapshot id is recorded.  Returns NULL on
 * allocation failure.  The caller holds the initial kref.
 */
struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
                                        u64 offset, u64 length,
                                        bool write_request)
{
        struct rbd_img_request *img_request;
        struct ceph_snap_context *snapc = NULL;

        img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
        if (!img_request)
                return NULL;

        if (write_request) {
                down_read(&rbd_dev->header_rwsem);
                snapc = ceph_get_snap_context(rbd_dev->header.snapc);
                up_read(&rbd_dev->header_rwsem);
                if (WARN_ON(!snapc)) {
                        kfree(img_request);
                        return NULL;    /* Shouldn't happen */
                }
        }

        img_request->rq = NULL;
        img_request->rbd_dev = rbd_dev;
        img_request->offset = offset;
        img_request->length = length;
        img_request->write_request = write_request;
        /* snapc and snap_id presumably share a union -- only one is set */
        if (write_request)
                img_request->snapc = snapc;
        else
                img_request->snap_id = rbd_dev->spec->snap_id;
        spin_lock_init(&img_request->completion_lock);
        img_request->next_completion = 0;
        img_request->callback = NULL;
        img_request->obj_request_count = 0;
        INIT_LIST_HEAD(&img_request->obj_requests);
        kref_init(&img_request->kref);

        rbd_img_request_get(img_request);       /* Avoid a warning */
        rbd_img_request_put(img_request);       /* TEMPORARY */

        return img_request;
}
2002
/*
 * Final kref release for an image request: unlink (and drop the
 * image's reference on) every remaining object request, release
 * the snapshot context taken for writes, and free the structure.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
        struct rbd_img_request *img_request;
        struct rbd_obj_request *obj_request;
        struct rbd_obj_request *next_obj_request;

        img_request = container_of(kref, struct rbd_img_request, kref);

        for_each_obj_request_safe(img_request, obj_request, next_obj_request)
                rbd_img_obj_request_del(img_request, obj_request);

        if (img_request->write_request)
                ceph_put_snap_context(img_request->snapc);

        kfree(img_request);
}
2019
2020static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
2021 struct bio *bio_list)
2022{
2023 struct rbd_device *rbd_dev = img_request->rbd_dev;
2024 struct rbd_obj_request *obj_request = NULL;
2025 struct rbd_obj_request *next_obj_request;
2026 unsigned int bio_offset;
2027 u64 image_offset;
2028 u64 resid;
2029 u16 opcode;
2030
2031 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
2032 : CEPH_OSD_OP_READ;
2033 bio_offset = 0;
2034 image_offset = img_request->offset;
2035 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
2036 resid = img_request->length;
2037 while (resid) {
2038 const char *object_name;
2039 unsigned int clone_size;
2040 struct ceph_osd_req_op *op;
2041 u64 offset;
2042 u64 length;
2043
2044 object_name = rbd_segment_name(rbd_dev, image_offset);
2045 if (!object_name)
2046 goto out_unwind;
2047 offset = rbd_segment_offset(rbd_dev, image_offset);
2048 length = rbd_segment_length(rbd_dev, image_offset, resid);
2049 obj_request = rbd_obj_request_create(object_name,
2050 offset, length,
2051 OBJ_REQUEST_BIO);
2052 kfree(object_name); /* object request has its own copy */
2053 if (!obj_request)
2054 goto out_unwind;
2055
2056 rbd_assert(length <= (u64) UINT_MAX);
2057 clone_size = (unsigned int) length;
2058 obj_request->bio_list = bio_chain_clone_range(&bio_list,
2059 &bio_offset, clone_size,
2060 GFP_ATOMIC);
2061 if (!obj_request->bio_list)
2062 goto out_partial;
2063
2064 /*
2065 * Build up the op to use in building the osd
2066 * request. Note that the contents of the op are
2067 * copied by rbd_osd_req_create().
2068 */
2069 op = rbd_osd_req_op_create(opcode, offset, length);
2070 if (!op)
2071 goto out_partial;
2072 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
2073 img_request->write_request,
2074 obj_request, op);
2075 rbd_osd_req_op_destroy(op);
2076 if (!obj_request->osd_req)
2077 goto out_partial;
2078 /* status and version are initially zero-filled */
2079
2080 rbd_img_obj_request_add(img_request, obj_request);
2081
2082 image_offset += length;
2083 resid -= length;
2084 }
2085
2086 return 0;
2087
2088out_partial:
2089 rbd_obj_request_put(obj_request);
2090out_unwind:
2091 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2092 rbd_obj_request_put(obj_request);
2093
2094 return -ENOMEM;
2095}
2096
/*
 * Per-object completion callback for object requests belonging to
 * an image request.  Object requests may complete out of order,
 * but the block request must be ended in order, so under the
 * completion lock this walks forward from next_completion, ending
 * a chunk of the block request for each consecutive request that
 * is done, and stops at the first that isn't.  When the last chunk
 * is ended, the whole image request is completed.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
        struct rbd_img_request *img_request;
        u32 which = obj_request->which;
        bool more = true;

        img_request = obj_request->img_request;
        rbd_assert(img_request != NULL);
        rbd_assert(img_request->rq != NULL);
        rbd_assert(which != BAD_WHICH);
        rbd_assert(which < img_request->obj_request_count);
        rbd_assert(which >= img_request->next_completion);

        spin_lock_irq(&img_request->completion_lock);
        /* Out-of-order completion: leave it for a predecessor to reap */
        if (which != img_request->next_completion)
                goto out;

        for_each_obj_request_from(img_request, obj_request) {
                unsigned int xferred;
                int result;

                rbd_assert(more);
                rbd_assert(which < img_request->obj_request_count);

                /* Stop at the first request that hasn't finished */
                if (!atomic_read(&obj_request->done))
                        break;

                rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
                xferred = (unsigned int) obj_request->xferred;
                result = (int) obj_request->result;
                if (result)
                        rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
                                img_request->write_request ? "write" : "read",
                                result, xferred);

                /* blk_end_request() returns false once the rq is done */
                more = blk_end_request(img_request->rq, result, xferred);
                which++;
        }
        rbd_assert(more ^ (which == img_request->obj_request_count));
        img_request->next_completion = which;
out:
        spin_unlock_irq(&img_request->completion_lock);

        if (!more)
                rbd_img_request_complete(img_request);
}
2143
2144static int rbd_img_request_submit(struct rbd_img_request *img_request)
2145{
2146 struct rbd_device *rbd_dev = img_request->rbd_dev;
2147 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2148 struct rbd_obj_request *obj_request;
2149
2150 for_each_obj_request(img_request, obj_request) {
2151 int ret;
2152
2153 obj_request->callback = rbd_img_obj_callback;
2154 ret = rbd_obj_request_submit(osdc, obj_request);
2155 if (ret)
2156 return ret;
2157 /*
2158 * The image request has its own reference to each
2159 * of its object requests, so we can safely drop the
2160 * initial one here.
2161 */
2162 rbd_obj_request_put(obj_request);
2163 }
2164
2165 return 0;
2166}
2167
/*
 * Block request queue callback (new image-request path).  Entered
 * with q->queue_lock held; the lock is dropped while each request
 * is built and submitted, and reacquired before fetching the next
 * request or ending a failed one.
 */
static void rbd_request_fn(struct request_queue *q)
{
        struct rbd_device *rbd_dev = q->queuedata;
        bool read_only = rbd_dev->mapping.read_only;
        struct request *rq;
        int result;

        while ((rq = blk_fetch_request(q))) {
                bool write_request = rq_data_dir(rq) == WRITE;
                struct rbd_img_request *img_request;
                u64 offset;
                u64 length;

                /* Ignore any non-FS requests that filter through. */

                if (rq->cmd_type != REQ_TYPE_FS) {
                        __blk_end_request_all(rq, 0);
                        continue;
                }

                spin_unlock_irq(q->queue_lock);

                /* Disallow writes to a read-only device */

                if (write_request) {
                        result = -EROFS;
                        if (read_only)
                                goto end_request;
                        /* Writes only make sense against the head revision */
                        rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
                }

                /* Quit early if the snapshot has disappeared */

                if (!atomic_read(&rbd_dev->exists)) {
                        dout("request for non-existent snapshot");
                        rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
                        result = -ENXIO;
                        goto end_request;
                }

                offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
                length = (u64) blk_rq_bytes(rq);

                result = -EINVAL;
                if (WARN_ON(offset && length > U64_MAX - offset + 1))
                        goto end_request;       /* Shouldn't happen */

                result = -ENOMEM;
                img_request = rbd_img_request_create(rbd_dev, offset, length,
                                                        write_request);
                if (!img_request)
                        goto end_request;

                img_request->rq = rq;

                result = rbd_img_request_fill_bio(img_request, rq->bio);
                if (!result)
                        result = rbd_img_request_submit(img_request);
                if (result)
                        rbd_img_request_put(img_request);
end_request:
                spin_lock_irq(q->queue_lock);
                if (result < 0) {
                        rbd_warn(rbd_dev, "obj_request %s result %d\n",
                                write_request ? "write" : "read", result);
                        __blk_end_request_all(rq, result);
                }
        }
}
2237
/*
 * block device queue callback
 *
 * Legacy request path.  Entered with q->queue_lock held; the lock
 * is dropped while the request is processed and reacquired before
 * the next fetch.  Writes pin the current snapshot context so it
 * stays valid while per-segment osd requests take their own refs.
 */
static void rbd_rq_fn(struct request_queue *q)
{
        struct rbd_device *rbd_dev = q->queuedata;
        bool read_only = rbd_dev->mapping.read_only;
        struct request *rq;

        while ((rq = blk_fetch_request(q))) {
                struct ceph_snap_context *snapc = NULL;
                unsigned int size = 0;
                int result;

                dout("fetched request\n");

                /* Filter out block requests we don't understand */

                if ((rq->cmd_type != REQ_TYPE_FS)) {
                        __blk_end_request_all(rq, 0);
                        continue;
                }
                spin_unlock_irq(q->queue_lock);

                /* Write requests need a reference to the snapshot context */

                if (rq_data_dir(rq) == WRITE) {
                        result = -EROFS;
                        if (read_only) /* Can't write to a read-only device */
                                goto out_end_request;

                        /*
                         * Note that each osd request will take its
                         * own reference to the snapshot context
                         * supplied.  The reference we take here
                         * just guarantees the one we provide stays
                         * valid.
                         */
                        down_read(&rbd_dev->header_rwsem);
                        snapc = ceph_get_snap_context(rbd_dev->header.snapc);
                        up_read(&rbd_dev->header_rwsem);
                        rbd_assert(snapc != NULL);
                } else if (!atomic_read(&rbd_dev->exists)) {
                        /* Mapped snapshot was deleted out from under us */
                        rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
                        dout("request for non-existent snapshot");
                        result = -ENXIO;
                        goto out_end_request;
                }

                size = blk_rq_bytes(rq);
                result = rbd_dev_do_request(rq, rbd_dev, snapc,
                                blk_rq_pos(rq) * SECTOR_SIZE,
                                size, rq->bio);
out_end_request:
                if (snapc)
                        ceph_put_snap_context(snapc);
                spin_lock_irq(q->queue_lock);
                /* On success the segments end the request as they finish */
                if (!size || result < 0)
                        __blk_end_request_all(rq, result);
        }
}
2299
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be added to the
 * bio described by @bmd without crossing an object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
                          struct bio_vec *bvec)
{
        struct rbd_device *rbd_dev = q->queuedata;
        sector_t sector_offset;
        sector_t sectors_per_obj;
        sector_t obj_sector_offset;
        int ret;

        /*
         * Find how far into its rbd object the partition-relative
         * bio start sector is to offset relative to the enclosing
         * device.
         */
        sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
        sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
        obj_sector_offset = sector_offset & (sectors_per_obj - 1);

        /*
         * Compute the number of bytes from that offset to the end
         * of the object.  Account for what's already used by the bio.
         */
        ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
        if (ret > bmd->bi_size)
                ret -= bmd->bi_size;
        else
                ret = 0;

        /*
         * Don't send back more than was asked for.  And if the bio
         * was empty, let the whole thing through because:  "Note
         * that a block device *must* allow a single page to be
         * added to an empty bio."
         */
        rbd_assert(bvec->bv_len <= PAGE_SIZE);
        if (ret > (int) bvec->bv_len || !bmd->bi_size)
                ret = (int) bvec->bv_len;

        return ret;
}
2345
2346static void rbd_free_disk(struct rbd_device *rbd_dev)
2347{
2348 struct gendisk *disk = rbd_dev->disk;
2349
2350 if (!disk)
2351 return;
2352
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002353 if (disk->flags & GENHD_FL_UP)
2354 del_gendisk(disk);
2355 if (disk->queue)
2356 blk_cleanup_queue(disk->queue);
2357 put_disk(disk);
2358}
2359
2360/*
Alex Elder4156d992012-08-02 11:29:46 -05002361 * Read the complete header for the given rbd device.
2362 *
2363 * Returns a pointer to a dynamically-allocated buffer containing
2364 * the complete and validated header. Caller can pass the address
2365 * of a variable that will be filled in with the version of the
2366 * header object at the time it was read.
2367 *
2368 * Returns a pointer-coded errno if a failure occurs.
2369 */
2370static struct rbd_image_header_ondisk *
2371rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2372{
2373 struct rbd_image_header_ondisk *ondisk = NULL;
2374 u32 snap_count = 0;
2375 u64 names_size = 0;
2376 u32 want_count;
2377 int ret;
2378
2379 /*
2380 * The complete header will include an array of its 64-bit
2381 * snapshot ids, followed by the names of those snapshots as
2382 * a contiguous block of NUL-terminated strings. Note that
2383 * the number of snapshots could change by the time we read
2384 * it in, in which case we re-read it.
2385 */
2386 do {
2387 size_t size;
2388
2389 kfree(ondisk);
2390
2391 size = sizeof (*ondisk);
2392 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2393 size += names_size;
2394 ondisk = kmalloc(size, GFP_KERNEL);
2395 if (!ondisk)
2396 return ERR_PTR(-ENOMEM);
2397
Alex Elder47756182012-11-09 08:43:15 -06002398 ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002399 0, size,
2400 (char *) ondisk, version);
2401
2402 if (ret < 0)
2403 goto out_err;
2404 if (WARN_ON((size_t) ret < size)) {
2405 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002406 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2407 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002408 goto out_err;
2409 }
2410 if (!rbd_dev_ondisk_valid(ondisk)) {
2411 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002412 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002413 goto out_err;
2414 }
2415
2416 names_size = le64_to_cpu(ondisk->snap_names_len);
2417 want_count = snap_count;
2418 snap_count = le32_to_cpu(ondisk->snap_count);
2419 } while (snap_count != want_count);
2420
2421 return ondisk;
2422
2423out_err:
2424 kfree(ondisk);
2425
2426 return ERR_PTR(ret);
2427}
2428
/*
 * Re-read the complete on-disk (format 1) image header and fill in
 * the given in-memory header from it.  On success the header's
 * object version is recorded as well.  Returns 0 or a negative errno.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	/* Fetch the raw on-disk header (including snapshot data) */
	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	/* The raw copy is no longer needed once converted */
	kfree(ondisk);

	return ret;
}
2449
Alex Elder41f38c22012-10-25 23:34:40 -05002450static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002451{
2452 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002453 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002454
Alex Eldera0593292012-07-19 09:09:27 -05002455 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002456 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002457}
2458
Alex Elder94785542012-10-09 13:50:17 -07002459static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2460{
2461 sector_t size;
2462
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002463 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002464 return;
2465
2466 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2467 dout("setting size to %llu sectors", (unsigned long long) size);
2468 rbd_dev->mapping.size = (u64) size;
2469 set_capacity(rbd_dev->disk, size);
2470}
2471
/*
 * Refresh a format 1 image: re-read the on-disk header and swap the
 * freshly-read snapshot context, names, and sizes into the device's
 * in-memory header under the header rwsem.  Ownership of the arrays
 * in @h transfers to rbd_dev->header; the old ones are freed here.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	/* NOTE(review): image_size was already copied above; this is redundant */
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot list (and sysfs devices) with the new context */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
2515
Alex Elder117973f2012-08-31 17:29:55 -05002516static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002517{
2518 int ret;
2519
Alex Elder117973f2012-08-31 17:29:55 -05002520 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002521 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002522 if (rbd_dev->image_format == 1)
2523 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2524 else
2525 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002526 mutex_unlock(&ctl_mutex);
2527
2528 return ret;
2529}
2530
/*
 * Allocate and configure the gendisk and request queue backing the
 * mapped image.  I/O limits are sized to the rbd object size so a
 * single request never spans more than one backing object.
 * Returns 0 on success, -ENOMEM on any allocation failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* rbd_rq_fn is unused during the request-fn transition; reference
	 * it so the compiler does not warn about a defined-but-unused symbol */
	(void) rbd_rq_fn;	/* avoid a warning */
	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from straddling object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2579
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002580/*
2581 sysfs
2582*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002583
/* Map a generic struct device back to the rbd_device embedding it */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2588
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002589static ssize_t rbd_size_show(struct device *dev,
2590 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002591{
Alex Elder593a9e72012-02-07 12:03:37 -06002592 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002593 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002594
Josh Durgina51aa0c2011-12-05 10:35:04 -08002595 down_read(&rbd_dev->header_rwsem);
2596 size = get_capacity(rbd_dev->disk);
2597 up_read(&rbd_dev->header_rwsem);
2598
2599 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002600}
2601
Alex Elder34b13182012-07-13 20:35:12 -05002602/*
2603 * Note this shows the features for whatever's mapped, which is not
2604 * necessarily the base image.
2605 */
2606static ssize_t rbd_features_show(struct device *dev,
2607 struct device_attribute *attr, char *buf)
2608{
2609 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2610
2611 return sprintf(buf, "0x%016llx\n",
2612 (unsigned long long) rbd_dev->mapping.features);
2613}
2614
/* sysfs "major": block device major number assigned at map time */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
2622
/* sysfs "client_id": the ceph client instance id for this mapping */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
2631
/* sysfs "pool": name of the pool holding the image */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}
2639
/* sysfs "pool_id": numeric id of the pool holding the image */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long) rbd_dev->spec->pool_id);
}
2648
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002649static ssize_t rbd_name_show(struct device *dev,
2650 struct device_attribute *attr, char *buf)
2651{
Alex Elder593a9e72012-02-07 12:03:37 -06002652 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002653
Alex Eldera92ffdf2012-10-30 19:40:33 -05002654 if (rbd_dev->spec->image_name)
2655 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2656
2657 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002658}
2659
/* sysfs "image_id": the image's unique rbd id string */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
2667
/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2680
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".  Output is accumulated one sprintf at a
 * time; the total length (bufp - buf) is returned.
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	/* image_name may be unavailable for parents looked up by id */
	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
2723
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002724static ssize_t rbd_image_refresh(struct device *dev,
2725 struct device_attribute *attr,
2726 const char *buf,
2727 size_t size)
2728{
Alex Elder593a9e72012-02-07 12:03:37 -06002729 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002730 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002731
Alex Elder117973f2012-08-31 17:29:55 -05002732 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002733
2734 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002735}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002736
/* Per-device sysfs attributes (see Documentation/ABI/testing/sysfs-bus-rbd) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
/* "refresh" is write-only: writing anything re-reads the image header */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing to free here; rbd_device teardown is handled elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2782
2783
2784/*
2785 sysfs - snapshots
2786*/
2787
/* sysfs "snap_size": image size (bytes) as of this snapshot */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2796
/* sysfs "snap_id": the snapshot's numeric id */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2805
Alex Elder34b13182012-07-13 20:35:12 -05002806static ssize_t rbd_snap_features_show(struct device *dev,
2807 struct device_attribute *attr,
2808 char *buf)
2809{
2810 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2811
2812 return sprintf(buf, "0x%016llx\n",
2813 (unsigned long long) snap->features);
2814}
2815
/* Per-snapshot sysfs attributes, exposed under the parent rbd device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device-core release callback: frees the snapshot once its last
 * sysfs reference is dropped */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2847
/* Take an additional reference on an rbd_spec; returns the spec */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
/* Drop a reference; the spec is freed when the last one goes away.
 * NULL is tolerated (no-op). */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2861
/*
 * Allocate a zeroed rbd_spec with one reference held by the caller.
 * Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* NOTE(review): get/put pair looks like scaffolding to exercise the
	 * refcounting paths; marked TEMPORARY by the original author */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2875
/* kref release callback: free the spec and all of its name strings */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2886
/*
 * Allocate and initialize an rbd_device.  Takes over the caller's
 * references on @rbdc and @spec (rbd_dev_destroy() drops them).
 * Returns NULL on allocation failure.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	/* pool id was validated to fit in 32 bits before this point */
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2914
/* Release everything an rbd_device owns, then the device itself */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2923
/*
 * A snapshot is "registered" once its device type has been set and
 * it has been registered with the device core.  The two conditions
 * must agree; the XOR assertion catches a half-registered snapshot.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2933
/*
 * Unlink a snapshot from its device's list and unregister its
 * sysfs device (the device core's release callback frees it).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2940
Alex Elder14e70852012-07-19 09:09:27 -05002941static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002942 struct device *parent)
2943{
2944 struct device *dev = &snap->dev;
2945 int ret;
2946
2947 dev->type = &rbd_snap_device_type;
2948 dev->parent = parent;
2949 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002950 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002951 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2952
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002953 ret = device_register(dev);
2954
2955 return ret;
2956}
2957
/*
 * Allocate and fill in a new rbd_snap from the given name, id, size,
 * and feature bits.  The name is duplicated.  Returns the new snap
 * or ERR_PTR(-ENOMEM); nothing is registered with sysfs here.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
						const char *snap_name,
						u64 snap_id, u64 snap_size,
						u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	/* snap->name is NULL here; kfree(NULL) is a no-op */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2987
Alex Eldercd892122012-07-03 16:01:19 -05002988static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2989 u64 *snap_size, u64 *snap_features)
2990{
2991 char *snap_name;
2992
2993 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2994
2995 *snap_size = rbd_dev->header.snap_sizes[which];
2996 *snap_features = 0; /* No features for v1 */
2997
2998 /* Skip over names until we find the one we are looking for */
2999
3000 snap_name = rbd_dev->header.snap_names;
3001 while (which--)
3002 snap_name += strlen(snap_name) + 1;
3003
3004 return snap_name;
3005}
3006
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.  Queries the "get_size" method of the rbd class on the
 * image's header object.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the get_size reply: order byte then size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}

/* Convenience wrapper: fetch size/order for the base ("head") image */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
3046
/*
 * Fetch the object name prefix for a format 2 image via the
 * "get_object_prefix" class method and store it (newly allocated)
 * in rbd_dev->header.object_prefix.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed prefix string from the reply */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
3083
/*
 * Fetch the feature bits for an image snapshot (or the base image
 * when snap_id is CEPH_NOSNAP) via the "get_features" class method.
 * Fails with -ENXIO if the image uses any incompatible feature this
 * client does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image needing features we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}

/* Convenience wrapper: fetch feature bits for the base ("head") image */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
3123
/*
 * Query the "get_parent" class method to learn whether this (v2)
 * image is a clone.  On success, a parent spec (pool/image/snap ids
 * only — names are filled in later) and the overlap are recorded in
 * rbd_dev; an image with no parent succeeds with nothing recorded.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case encoded reply: pool id, image id string, snap id, overlap */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	/* Drops our reference; a no-op once ownership moved to rbd_dev */
	rbd_spec_put(parent_spec);

	return ret;
}
3192
/*
 * Look up an image's name given its id, by calling "dir_get_name"
 * on the rbd directory object.  Returns a newly-allocated name
 * string, or NULL on any failure (lookup failure is tolerable;
 * callers fall back to showing "(unknown)").
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the length-prefixed image id that the method expects */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
3241
3242/*
3243 * When a parent image gets probed, we only have the pool, image,
3244 * and snapshot ids but not the names of any of them. This call
3245 * is made later to fill in those names. It has to be done after
3246 * rbd_dev_snaps_update() has completed because some of the
3247 * information (in particular, snapshot name) is not available
3248 * until then.
3249 */
3250static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3251{
3252 struct ceph_osd_client *osdc;
3253 const char *name;
3254 void *reply_buf = NULL;
3255 int ret;
3256
3257 if (rbd_dev->spec->pool_name)
3258 return 0; /* Already have the names */
3259
3260 /* Look up the pool name */
3261
3262 osdc = &rbd_dev->rbd_client->client->osdc;
3263 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003264 if (!name) {
3265 rbd_warn(rbd_dev, "there is no pool with id %llu",
3266 rbd_dev->spec->pool_id); /* Really a BUG() */
3267 return -EIO;
3268 }
Alex Elder9e15b772012-10-30 19:40:33 -05003269
3270 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3271 if (!rbd_dev->spec->pool_name)
3272 return -ENOMEM;
3273
3274 /* Fetch the image name; tolerate failure here */
3275
3276 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003277 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003278 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003279 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003280 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003281
3282 /* Look up the snapshot name. */
3283
3284 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3285 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003286 rbd_warn(rbd_dev, "no snapshot with id %llu",
3287 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003288 ret = -EIO;
3289 goto out_err;
3290 }
3291 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3292 if(!rbd_dev->spec->snap_name)
3293 goto out_err;
3294
3295 return 0;
3296out_err:
3297 kfree(reply_buf);
3298 kfree(rbd_dev->spec->pool_name);
3299 rbd_dev->spec->pool_name = NULL;
3300
3301 return ret;
3302}
3303
Alex Elder6e14b1a2012-07-03 16:01:19 -05003304static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003305{
3306 size_t size;
3307 int ret;
3308 void *reply_buf;
3309 void *p;
3310 void *end;
3311 u64 seq;
3312 u32 snap_count;
3313 struct ceph_snap_context *snapc;
3314 u32 i;
3315
3316 /*
3317 * We'll need room for the seq value (maximum snapshot id),
3318 * snapshot count, and array of that many snapshot ids.
3319 * For now we have a fixed upper limit on the number we're
3320 * prepared to receive.
3321 */
3322 size = sizeof (__le64) + sizeof (__le32) +
3323 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3324 reply_buf = kzalloc(size, GFP_KERNEL);
3325 if (!reply_buf)
3326 return -ENOMEM;
3327
3328 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
3329 "rbd", "get_snapcontext",
3330 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003331 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05003332 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3333 if (ret < 0)
3334 goto out;
3335
3336 ret = -ERANGE;
3337 p = reply_buf;
3338 end = (char *) reply_buf + size;
3339 ceph_decode_64_safe(&p, end, seq, out);
3340 ceph_decode_32_safe(&p, end, snap_count, out);
3341
3342 /*
3343 * Make sure the reported number of snapshot ids wouldn't go
3344 * beyond the end of our buffer. But before checking that,
3345 * make sure the computed size of the snapshot context we
3346 * allocate is representable in a size_t.
3347 */
3348 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3349 / sizeof (u64)) {
3350 ret = -EINVAL;
3351 goto out;
3352 }
3353 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3354 goto out;
3355
3356 size = sizeof (struct ceph_snap_context) +
3357 snap_count * sizeof (snapc->snaps[0]);
3358 snapc = kmalloc(size, GFP_KERNEL);
3359 if (!snapc) {
3360 ret = -ENOMEM;
3361 goto out;
3362 }
3363
3364 atomic_set(&snapc->nref, 1);
3365 snapc->seq = seq;
3366 snapc->num_snaps = snap_count;
3367 for (i = 0; i < snap_count; i++)
3368 snapc->snaps[i] = ceph_decode_64(&p);
3369
3370 rbd_dev->header.snapc = snapc;
3371
3372 dout(" snap context seq = %llu, snap_count = %u\n",
3373 (unsigned long long) seq, (unsigned int) snap_count);
3374
3375out:
3376 kfree(reply_buf);
3377
3378 return 0;
3379}
3380
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003381static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3382{
3383 size_t size;
3384 void *reply_buf;
3385 __le64 snap_id;
3386 int ret;
3387 void *p;
3388 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003389 char *snap_name;
3390
3391 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3392 reply_buf = kmalloc(size, GFP_KERNEL);
3393 if (!reply_buf)
3394 return ERR_PTR(-ENOMEM);
3395
3396 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
3397 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
3398 "rbd", "get_snapshot_name",
3399 (char *) &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06003400 reply_buf, size, NULL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003401 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3402 if (ret < 0)
3403 goto out;
3404
3405 p = reply_buf;
3406 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05003407 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003408 if (IS_ERR(snap_name)) {
3409 ret = PTR_ERR(snap_name);
3410 goto out;
3411 } else {
3412 dout(" snap_id 0x%016llx snap_name = %s\n",
3413 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3414 }
3415 kfree(reply_buf);
3416
3417 return snap_name;
3418out:
3419 kfree(reply_buf);
3420
3421 return ERR_PTR(ret);
3422}
3423
3424static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3425 u64 *snap_size, u64 *snap_features)
3426{
Alex Eldere0b49862013-01-09 14:44:18 -06003427 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003428 u8 order;
3429 int ret;
3430
3431 snap_id = rbd_dev->header.snapc->snaps[which];
3432 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3433 if (ret)
3434 return ERR_PTR(ret);
3435 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3436 if (ret)
3437 return ERR_PTR(ret);
3438
3439 return rbd_dev_v2_snap_name(rbd_dev, which);
3440}
3441
3442static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3443 u64 *snap_size, u64 *snap_features)
3444{
3445 if (rbd_dev->image_format == 1)
3446 return rbd_dev_v1_snap_info(rbd_dev, which,
3447 snap_size, snap_features);
3448 if (rbd_dev->image_format == 2)
3449 return rbd_dev_v2_snap_info(rbd_dev, which,
3450 snap_size, snap_features);
3451 return ERR_PTR(-EINVAL);
3452}
3453
Alex Elder117973f2012-08-31 17:29:55 -05003454static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3455{
3456 int ret;
3457 __u8 obj_order;
3458
3459 down_write(&rbd_dev->header_rwsem);
3460
3461 /* Grab old order first, to see if it changes */
3462
3463 obj_order = rbd_dev->header.obj_order,
3464 ret = rbd_dev_v2_image_size(rbd_dev);
3465 if (ret)
3466 goto out;
3467 if (rbd_dev->header.obj_order != obj_order) {
3468 ret = -EIO;
3469 goto out;
3470 }
3471 rbd_update_mapping_size(rbd_dev);
3472
3473 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3474 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3475 if (ret)
3476 goto out;
3477 ret = rbd_dev_snaps_update(rbd_dev);
3478 dout("rbd_dev_snaps_update returned %d\n", ret);
3479 if (ret)
3480 goto out;
3481 ret = rbd_dev_snaps_register(rbd_dev);
3482 dout("rbd_dev_snaps_register returned %d\n", ret);
3483out:
3484 up_write(&rbd_dev->header_rwsem);
3485
3486 return ret;
3487}
3488
Alex Elder9d475de2012-07-03 16:01:19 -05003489/*
Alex Elder35938152012-08-02 11:29:46 -05003490 * Scan the rbd device's current snapshot list and compare it to the
3491 * newly-received snapshot context. Remove any existing snapshots
3492 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
3494 * And verify there are no changes to snapshots we already know
3495 * about.
3496 *
3497 * Assumes the snapshots in the snapshot context are sorted by
3498 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3499 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003500 */
Alex Elder304f6802012-08-31 17:29:52 -05003501static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003502{
Alex Elder35938152012-08-02 11:29:46 -05003503 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3504 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05003505 struct list_head *head = &rbd_dev->snaps;
3506 struct list_head *links = head->next;
3507 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003508
Alex Elder9fcbb802012-08-23 23:48:49 -05003509 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05003510 while (index < snap_count || links != head) {
3511 u64 snap_id;
3512 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05003513 char *snap_name;
3514 u64 snap_size = 0;
3515 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003516
Alex Elder35938152012-08-02 11:29:46 -05003517 snap_id = index < snap_count ? snapc->snaps[index]
3518 : CEPH_NOSNAP;
3519 snap = links != head ? list_entry(links, struct rbd_snap, node)
3520 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05003521 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003522
Alex Elder35938152012-08-02 11:29:46 -05003523 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3524 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003525
Alex Elder35938152012-08-02 11:29:46 -05003526 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003527
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003528 if (rbd_dev->spec->snap_id == snap->id)
Alex Elderd78b6502012-11-09 08:43:15 -06003529 atomic_set(&rbd_dev->exists, 0);
Alex Elder41f38c22012-10-25 23:34:40 -05003530 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05003531 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003532 rbd_dev->spec->snap_id == snap->id ?
3533 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05003534 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003535
Alex Elder35938152012-08-02 11:29:46 -05003536 /* Done with this list entry; advance */
3537
3538 links = next;
3539 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003540 }
Alex Elder35938152012-08-02 11:29:46 -05003541
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003542 snap_name = rbd_dev_snap_info(rbd_dev, index,
3543 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05003544 if (IS_ERR(snap_name))
3545 return PTR_ERR(snap_name);
3546
Alex Elder9fcbb802012-08-23 23:48:49 -05003547 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3548 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05003549 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3550 struct rbd_snap *new_snap;
3551
3552 /* We haven't seen this snapshot before */
3553
Alex Elderc8d18422012-07-10 20:30:11 -05003554 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05003555 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05003556 if (IS_ERR(new_snap)) {
3557 int err = PTR_ERR(new_snap);
3558
3559 dout(" failed to add dev, error %d\n", err);
3560
3561 return err;
3562 }
Alex Elder35938152012-08-02 11:29:46 -05003563
3564 /* New goes before existing, or at end of list */
3565
Alex Elder9fcbb802012-08-23 23:48:49 -05003566 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05003567 if (snap)
3568 list_add_tail(&new_snap->node, &snap->node);
3569 else
Alex Elder523f3252012-08-30 00:16:37 -05003570 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05003571 } else {
3572 /* Already have this one */
3573
Alex Elder9fcbb802012-08-23 23:48:49 -05003574 dout(" already present\n");
3575
Alex Eldercd892122012-07-03 16:01:19 -05003576 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05003577 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05003578 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05003579
3580 /* Done with this list entry; advance */
3581
3582 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003583 }
Alex Elder35938152012-08-02 11:29:46 -05003584
3585 /* Advance to the next entry in the snapshot context */
3586
3587 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003588 }
Alex Elder9fcbb802012-08-23 23:48:49 -05003589 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003590
3591 return 0;
3592}
3593
Alex Elder304f6802012-08-31 17:29:52 -05003594/*
3595 * Scan the list of snapshots and register the devices for any that
3596 * have not already been registered.
3597 */
3598static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3599{
3600 struct rbd_snap *snap;
3601 int ret = 0;
3602
3603 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003604 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3605 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003606
3607 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3608 if (!rbd_snap_registered(snap)) {
3609 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3610 if (ret < 0)
3611 break;
3612 }
3613 }
3614 dout("%s: returning %d\n", __func__, ret);
3615
3616 return ret;
3617}
3618
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003619static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3620{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003621 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05003622 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003623
3624 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003625
Alex Eldercd789ab2012-08-30 00:16:38 -05003626 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003627 dev->bus = &rbd_bus_type;
3628 dev->type = &rbd_device_type;
3629 dev->parent = &rbd_root_dev;
3630 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05003631 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003632 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003633
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003634 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05003635
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003636 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003637}
3638
/* Tear down the sysfs presence created by rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3643
Alex Eldere2839302012-08-29 17:11:06 -05003644static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003645
3646/*
Alex Elder499afd52012-02-02 08:13:29 -06003647 * Get a unique rbd identifier for the given new rbd_dev, and add
3648 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003649 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() makes ids unique without locking */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* The list itself is protected by rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003660
Alex Elder1ddbe942012-01-29 13:57:44 -06003661/*
Alex Elder499afd52012-02-02 08:13:29 -06003662 * Remove an rbd_dev from the global list, and record that its
3663 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003664 */
Alex Eldere2839302012-08-29 17:11:06 -05003665static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06003666{
Alex Elderd184f6b2012-01-29 13:57:44 -06003667 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05003668 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003669 int max_id;
3670
Alex Elderaafb2302012-09-06 16:00:54 -05003671 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06003672
Alex Eldere2839302012-08-29 17:11:06 -05003673 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3674 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06003675 spin_lock(&rbd_dev_list_lock);
3676 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06003677
3678 /*
3679 * If the id being "put" is not the current maximum, there
3680 * is nothing special we need to do.
3681 */
Alex Eldere2839302012-08-29 17:11:06 -05003682 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06003683 spin_unlock(&rbd_dev_list_lock);
3684 return;
3685 }
3686
3687 /*
3688 * We need to update the current maximum id. Search the
3689 * list to find out what it is. We're more likely to find
3690 * the maximum at the end, so search the list backward.
3691 */
3692 max_id = 0;
3693 list_for_each_prev(tmp, &rbd_dev_list) {
3694 struct rbd_device *rbd_dev;
3695
3696 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07003697 if (rbd_dev->dev_id > max_id)
3698 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003699 }
Alex Elder499afd52012-02-02 08:13:29 -06003700 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06003701
Alex Elder1ddbe942012-01-29 13:57:44 -06003702 /*
Alex Eldere2839302012-08-29 17:11:06 -05003703 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06003704 * which case it now accurately reflects the new maximum.
3705 * Be careful not to overwrite the maximum value in that
3706 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06003707 */
Alex Eldere2839302012-08-29 17:11:06 -05003708 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3709 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06003710}
3711
Alex Eldera725f65e2012-02-02 08:13:30 -06003712/*
Alex Eldere28fff262012-02-02 08:13:30 -06003713 * Skips over white space at *buf, and updates *buf to point to the
3714 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003715 * the token (string of non-white space characters) found. Note
3716 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003717 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip to start of token */

	return strcspn(*buf, spaces);	/* length of that token */
}
3730
3731/*
3732 * Finds the next token in *buf, and if the provided token buffer is
3733 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003734 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3735 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003736 *
3737 * Returns the length of the token found (not including the '\0').
3738 * Return value will be 0 if no token is found, and it will be >=
3739 * token_size if the token would not fit.
3740 *
Alex Elder593a9e72012-02-07 12:03:37 -06003741 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003742 * found token. Note that this occurs even if the token buffer is
3743 * too small to hold it.
3744 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only when the caller's buffer can hold it plus '\0' */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
3760
3761/*
Alex Elderea3352f2012-07-09 21:04:23 -05003762 * Finds the next token in *buf, dynamically allocates a buffer big
3763 * enough to hold a copy of it, and copies the token into the new
3764 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3765 * that a duplicate buffer is created even for a zero-length token.
3766 *
3767 * Returns a pointer to the newly-allocated duplicate, or a null
3768 * pointer if memory for the duplicate was not available. If
3769 * the lenp argument is a non-null pointer, the length of the token
3770 * (not including the '\0') is returned in *lenp.
3771 *
3772 * If successful, the *buf pointer will be updated to point beyond
3773 * the end of the found token.
3774 *
3775 * Note: uses GFP_KERNEL for allocation.
3776 */
3777static inline char *dup_token(const char **buf, size_t *lenp)
3778{
3779 char *dup;
3780 size_t len;
3781
3782 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003783 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003784 if (!dup)
3785 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003786 *(dup + len) = '\0';
3787 *buf += len;
3788
3789 if (lenp)
3790 *lenp = len;
3791
3792 return dup;
3793}
3794
3795/*
Alex Elder859c31d2012-10-25 23:34:42 -05003796 * Parse the options provided for an "rbd add" (i.e., rbd image
3797 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3798 * and the data written is passed here via a NUL-terminated buffer.
3799 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003800 *
Alex Elder859c31d2012-10-25 23:34:42 -05003801 * The information extracted from these options is recorded in
3802 * the other parameters which return dynamically-allocated
3803 * structures:
3804 * ceph_opts
3805 * The address of a pointer that will refer to a ceph options
3806 * structure. Caller must release the returned pointer using
3807 * ceph_destroy_options() when it is no longer needed.
3808 * rbd_opts
3809 * Address of an rbd options pointer. Fully initialized by
3810 * this function; caller must release with kfree().
3811 * spec
3812 * Address of an rbd image specification pointer. Fully
3813 * initialized by this function based on parsed options.
3814 * Caller must release with rbd_spec_put().
3815 *
3816 * The options passed take this form:
 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
3818 * where:
3819 * <mon_addrs>
3820 * A comma-separated list of one or more monitor addresses.
3821 * A monitor address is an ip address, optionally followed
3822 * by a port number (separated by a colon).
3823 * I.e.: ip1[:port1][,ip2[:port2]...]
3824 * <options>
3825 * A comma-separated list of ceph and/or rbd options.
3826 * <pool_name>
3827 * The name of the rados pool containing the rbd image.
3828 * <image_name>
3829 * The name of the image in that pool to map.
 * <snap_name>
 *	The name of an optional snapshot.  If provided, the mapping
 *	will present data from the image at the time that snapshot
 *	was created.  The image head is used if no snapshot name is
 *	provided.  Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003835 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* mon_addrs is not NUL-terminated; it is (mon_addrs_size - 1)
	 * characters long, delimited below by an explicit end pointer. */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the "token present but empty" checks below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* len + 1 reserves space for the '\0' stored just below */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific option tokens are picked out by the callback;
	 * the remainder are consumed as ceph options */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Ownership of copts, rbd_opts, and spec passes to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
	/* fall through -- out_err frees everything allocated so far */
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3938
Alex Elder589d30e2012-07-10 20:30:11 -05003939/*
3940 * An rbd format 2 image has a unique identifier, distinct from the
3941 * name given to it by the user. Internally, that identifier is
3942 * what's used to specify the names of objects related to the image.
3943 *
3944 * A special "rbd id" object is used to map an rbd image name to its
3945 * id. If that object doesn't exist, then there is no v2 rbd image
3946 * with the supplied name.
3947 *
3948 * This function will record the given rbd_dev's image_id field if
3949 * it can be determined, and in that case will return 0. If any
3950 * errors occur a negative errno will be returned and the rbd_dev's
3951 * image_id field will be unchanged (and should be NULL).
3952 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof (RBD_ID_PREFIX) includes the '\0', so size covers
	 * prefix + image name + terminator exactly */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the id string; on success ownership stays with spec */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* Leave image_id NULL so callers see "unchanged" on error */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
4014
Alex Eldera30b71b2012-07-10 20:30:11 -05004015static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4016{
4017 int ret;
4018 size_t size;
4019
4020 /* Version 1 images have no id; empty string is used */
4021
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004022 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
4023 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05004024 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05004025
4026 /* Record the header object name for this rbd image. */
4027
Alex Elder69e7a022012-11-01 08:39:26 -05004028 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05004029 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4030 if (!rbd_dev->header_name) {
4031 ret = -ENOMEM;
4032 goto out_err;
4033 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004034 sprintf(rbd_dev->header_name, "%s%s",
4035 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05004036
4037 /* Populate rbd image metadata */
4038
4039 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4040 if (ret < 0)
4041 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004042
4043 /* Version 1 images have no parent (no layering) */
4044
4045 rbd_dev->parent_spec = NULL;
4046 rbd_dev->parent_overlap = 0;
4047
Alex Eldera30b71b2012-07-10 20:30:11 -05004048 rbd_dev->image_format = 1;
4049
4050 dout("discovered version 1 image, header name is %s\n",
4051 rbd_dev->header_name);
4052
4053 return 0;
4054
4055out_err:
4056 kfree(rbd_dev->header_name);
4057 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004058 kfree(rbd_dev->spec->image_id);
4059 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004060
4061 return ret;
4062}
4063
/*
 * Probe for a format 2 ("v2") image whose image id has already been
 * determined by rbd_dev_image_id().  Builds the header object name
 * from the image id and pulls the image's metadata (size/order,
 * object prefix, features, optional parent info, snapshot context)
 * from the osds.  Returns 0 on success or a negative errno; on error
 * everything populated here is freed/cleared again before returning.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything set up above (safe even if partially set up) */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
4136
/*
 * Finish probing an rbd device whose image metadata (format 1 or 2)
 * has already been read: refresh the snapshot list, fill in the
 * mapping, allocate a device id and block major number, set up the
 * gendisk, register with sysfs, register the snapshots, start the
 * header watch, and finally announce the disk.  Returns 0 on success
 * or a negative errno; on failure everything done here is undone.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	/* Start watching the header object for changes (1 == start) */
	ret = rbd_req_sync_watch(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
4218
Alex Eldera30b71b2012-07-10 20:30:11 -05004219/*
4220 * Probe for the existence of the header object for the given rbd
4221 * device. For format 2 images this includes determining the image
4222 * id.
4223 */
4224static int rbd_dev_probe(struct rbd_device *rbd_dev)
4225{
4226 int ret;
4227
4228 /*
4229 * Get the id from the image id object. If it's not a
4230 * format 2 image, we'll get ENOENT back, and we'll assume
4231 * it's a format 1 image.
4232 */
4233 ret = rbd_dev_image_id(rbd_dev);
4234 if (ret)
4235 ret = rbd_dev_v1_probe(rbd_dev);
4236 else
4237 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004238 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05004239 dout("probe failed, returning %d\n", ret);
4240
Alex Elder83a06262012-10-30 15:47:17 -05004241 return ret;
4242 }
4243
4244 ret = rbd_dev_probe_finish(rbd_dev);
4245 if (ret)
4246 rbd_header_free(&rbd_dev->header);
4247
Alex Eldera30b71b2012-07-10 20:30:11 -05004248 return ret;
4249}
4250
/*
 * sysfs "add" handler (/sys/bus/rbd/add): parse the user-supplied
 * mapping specification in buf, connect to the ceph cluster, and
 * create and probe the new rbd device.  Returns count on success or
 * a negative errno.
 *
 * Ownership note: ceph_opts, rbdc and spec are each handed off to
 * the rbd_dev as setup proceeds; the local pointer is NULLed at the
 * hand-off so the shared error path below frees only what this
 * function still owns.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	/* Find or create a client for this set of cluster options */
	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4323
Alex Elderde71a292012-07-03 16:01:19 -05004324static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004325{
4326 struct list_head *tmp;
4327 struct rbd_device *rbd_dev;
4328
Alex Eldere124a822012-01-29 13:57:44 -06004329 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004330 list_for_each(tmp, &rbd_dev_list) {
4331 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004332 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06004333 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004334 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06004335 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004336 }
Alex Eldere124a822012-01-29 13:57:44 -06004337 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004338 return NULL;
4339}
4340
/*
 * Release callback for an rbd device's embedded struct device,
 * invoked by the driver core once the last reference is dropped
 * (teardown is initiated via rbd_bus_del_dev()).  Stops the header
 * watch, tears down the disk and block device registration, frees
 * the header data and the device id, destroys the rbd_dev, and
 * finally drops the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one is outstanding */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* Tear down the header watch (0 == stop watching) */
	if (rbd_dev->watch_event)
		rbd_req_sync_watch(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
4369
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004370static ssize_t rbd_remove(struct bus_type *bus,
4371 const char *buf,
4372 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004373{
4374 struct rbd_device *rbd_dev = NULL;
4375 int target_id, rc;
4376 unsigned long ul;
4377 int ret = count;
4378
4379 rc = strict_strtoul(buf, 10, &ul);
4380 if (rc)
4381 return rc;
4382
4383 /* convert to int; abort if we lost anything in the conversion */
4384 target_id = (int) ul;
4385 if (target_id != ul)
4386 return -EINVAL;
4387
4388 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4389
4390 rbd_dev = __rbd_get_dev(target_id);
4391 if (!rbd_dev) {
4392 ret = -ENOENT;
4393 goto done;
4394 }
4395
Alex Elder42382b72012-11-16 09:29:16 -06004396 if (rbd_dev->open_count) {
4397 ret = -EBUSY;
4398 goto done;
4399 }
4400
Alex Elder41f38c22012-10-25 23:34:40 -05004401 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004402 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004403
4404done:
4405 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004406
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004407 return ret;
4408}
4409
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004410/*
4411 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004412 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004413 */
4414static int rbd_sysfs_init(void)
4415{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004416 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004417
Alex Elderfed4c142012-02-07 12:03:36 -06004418 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004419 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004420 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004421
Alex Elderfed4c142012-02-07 12:03:36 -06004422 ret = bus_register(&rbd_bus_type);
4423 if (ret < 0)
4424 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004425
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004426 return ret;
4427}
4428
/*
 * Remove the sysfs control files created by rbd_sysfs_init():
 * unregister the bus first (it was registered last), then the root
 * device it hangs beneath.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4434
4435int __init rbd_init(void)
4436{
4437 int rc;
4438
4439 rc = rbd_sysfs_init();
4440 if (rc)
4441 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004442 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004443 return 0;
4444}
4445
/* Module exit: remove the sysfs control files created by rbd_init() */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4450
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");