blob: 86206a75017dc63f194428780de86c9442e323f5 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to build data object names */
	u64 features;		/* feature bits; always 0 for format 1 images */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* copy of the on-disk snapshot name blob */
	u64 *snap_sizes;	/* one size entry per snapshot */

	u64 obj_version;	/* version of the header object last read */
};
114
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	size_t image_id_len;
	char *image_name;
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when the base image is mapped */
	char *snap_name;	/* RBD_SNAP_HEAD_NAME ("-") for the base image */

	struct kref kref;	/* specs are shared via reference counting */
};
135
/* User-supplied mapping options, filled in by parse_rbd_opts_token(). */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
139
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;		/* shared-client reference count */
	struct list_head node;		/* entry in rbd_client_list */
};
148
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once this request has completed */
	int rc;			/* completion result code */
	u64 bytes;		/* byte count associated with the completion */
};
157
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of requests in the collection */
	int num_done;			/* how many have completed so far */
	struct kref kref;		/* freed by rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per request */
};
167
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request in bytes */
	int coll_index;			/* this request's slot in its collection */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
179
/* In-memory representation of one image snapshot. */
struct rbd_snap {
	struct device dev;		/* sysfs device for this snapshot */
	const char *name;
	u64 size;			/* image size at snapshot time */
	struct list_head node;		/* entry in rbd_device->snaps */
	u64 id;
	u64 features;
};
188
/* Size, features and access mode of whatever is currently mapped
 * (either the base image or one of its snapshots). */
struct rbd_mapping {
	u64 size;
	u64 features;
	bool read_only;
};
194
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	bool exists;		/* set by rbd_dev_set_mapping() */
	struct rbd_spec *spec;	/* pool/image/snapshot identity */

	char *header_name;	/* presumably the header object's name —
				 * allocated elsewhere, not shown here */

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* currently mapped image/snapshot */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
233
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700234static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600235
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700236static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600237static DEFINE_SPINLOCK(rbd_dev_list_lock);
238
Alex Elder432b8582012-01-29 13:57:44 -0600239static LIST_HEAD(rbd_client_list); /* clients */
240static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700241
Alex Elder304f6802012-08-31 17:29:52 -0500242static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
243static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
244
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800245static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500246static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800247
Alex Elderf0f8cef2012-01-29 13:57:44 -0600248static ssize_t rbd_add(struct bus_type *bus, const char *buf,
249 size_t count);
250static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
251 size_t count);
252
/* Write-only "add" and "remove" bus attributes, dispatched to
 * rbd_add() and rbd_remove() respectively. */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
263
/*
 * Release callback for rbd_root_dev.  The root device is statically
 * allocated, so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
267
/* Common ancestor ("rbd") of every device this driver registers. */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
272
Alex Elderaafb2302012-09-06 16:00:54 -0500273#ifdef RBD_DEBUG
274#define rbd_assert(expr) \
275 if (unlikely(!(expr))) { \
276 printk(KERN_ERR "\nAssertion failure in %s() " \
277 "at line %d:\n\n" \
278 "\trbd_assert(%s);\n\n", \
279 __func__, __LINE__, #expr); \
280 BUG(); \
281 }
282#else /* !RBD_DEBUG */
283# define rbd_assert(expr) ((void) 0)
284#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800285
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800286static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
287{
288 return get_device(&rbd_dev->dev);
289}
290
291static void rbd_put_dev(struct rbd_device *rbd_dev)
292{
293 put_device(&rbd_dev->dev);
294}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295
Alex Elder117973f2012-08-31 17:29:55 -0500296static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
297static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700298
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700299static int rbd_open(struct block_device *bdev, fmode_t mode)
300{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600301 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302
Alex Elderf84344f2012-08-31 17:29:51 -0500303 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700304 return -EROFS;
305
Alex Elder340c7a22012-08-10 13:12:07 -0700306 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500307 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700308
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700309 return 0;
310}
311
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800312static int rbd_release(struct gendisk *disk, fmode_t mode)
313{
314 struct rbd_device *rbd_dev = disk->private_data;
315
316 rbd_put_dev(rbd_dev);
317
318 return 0;
319}
320
/* Block device operations table: only open/release are provided. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
326
327/*
328 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500329 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330 */
Alex Elderf8c38922012-08-10 13:12:07 -0700331static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332{
333 struct rbd_client *rbdc;
334 int ret = -ENOMEM;
335
336 dout("rbd_client_create\n");
337 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
338 if (!rbdc)
339 goto out_opt;
340
341 kref_init(&rbdc->kref);
342 INIT_LIST_HEAD(&rbdc->node);
343
Alex Elderbc534d862012-01-29 13:57:44 -0600344 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
345
Alex Elder43ae4702012-07-03 16:01:18 -0500346 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600348 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500349 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700350
351 ret = ceph_open_session(rbdc->client);
352 if (ret < 0)
353 goto out_err;
354
Alex Elder432b8582012-01-29 13:57:44 -0600355 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600357 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700358
Alex Elderbc534d862012-01-29 13:57:44 -0600359 mutex_unlock(&ctl_mutex);
360
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700361 dout("rbd_client_create created %p\n", rbdc);
362 return rbdc;
363
364out_err:
365 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600366out_mutex:
367 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700368 kfree(rbdc);
369out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500370 if (ceph_opts)
371 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400372 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700373}
374
375/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700376 * Find a ceph client with specific addr and configuration. If
377 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700378 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700379static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700380{
381 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700382 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700383
Alex Elder43ae4702012-07-03 16:01:18 -0500384 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700385 return NULL;
386
Alex Elder1f7ba332012-08-10 13:12:07 -0700387 spin_lock(&rbd_client_list_lock);
388 list_for_each_entry(client_node, &rbd_client_list, node) {
389 if (!ceph_compare_options(ceph_opts, client_node->client)) {
390 kref_get(&client_node->kref);
391 found = true;
392 break;
393 }
394 }
395 spin_unlock(&rbd_client_list_lock);
396
397 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700398}
399
/*
 * mount options
 *
 * The enum partitions the token space into int, string, and Boolean
 * ranges; parse_rbd_opts_token() keys off these range markers.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

/* Token table consumed by match_token() in parse_rbd_opts_token(). */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
424
/*
 * Parse a single mount-option token into *private (a struct
 * rbd_options).  Only the Boolean read_only/read_write options (and
 * their "ro"/"rw" aliases) are currently defined; the int/string
 * branches exist for future option classes.  Returns 0 on success or
 * a negative errno for an unrecognized or malformed option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The enum range a token falls in tells us its argument type */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this cannot happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
465
466/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700467 * Get a ceph client with specific addr and configuration, if one does
468 * not exist create it.
469 */
Alex Elder78cea762012-10-25 23:34:41 -0500470static int rbd_get_client(struct rbd_device *rbd_dev,
471 struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700472{
Alex Elderf8c38922012-08-10 13:12:07 -0700473 struct rbd_client *rbdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700474
Alex Elder1f7ba332012-08-10 13:12:07 -0700475 rbdc = rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700476 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600477 /* using an existing client */
Alex Elder43ae4702012-07-03 16:01:18 -0500478 ceph_destroy_options(ceph_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700479 } else {
480 rbdc = rbd_client_create(ceph_opts);
481 if (IS_ERR(rbdc))
482 return PTR_ERR(rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483 }
Alex Elderf8c38922012-08-10 13:12:07 -0700484 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700485
Alex Elderf8c38922012-08-10 13:12:07 -0700486 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700487}
488
/*
 * Destroy ceph client (kref release callback).
 *
 * NOTE(review): this function takes rbd_client_list_lock itself when
 * unlinking the client, so the caller must NOT already hold it; the
 * previous comment saying the caller must hold the lock was stale.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
506
507/*
508 * Drop reference to ceph client node. If it's not referenced anymore, release
509 * it.
510 */
511static void rbd_put_client(struct rbd_device *rbd_dev)
512{
513 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
514 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700515}
516
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700517/*
518 * Destroy requests collection
519 */
520static void rbd_coll_release(struct kref *kref)
521{
522 struct rbd_req_coll *coll =
523 container_of(kref, struct rbd_req_coll, kref);
524
525 dout("rbd_coll_release %p\n", coll);
526 kfree(coll);
527}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700528
Alex Eldera30b71b2012-07-10 20:30:11 -0500529static bool rbd_image_format_valid(u32 image_format)
530{
531 return image_format == 1 || image_format == 2;
532}
533
/*
 * Sanity-check an on-disk (format 1) image header: magic text, an
 * object order within [SECTOR_SHIFT, 31], and snapshot count/name
 * lengths small enough that the in-memory header arithmetic in
 * rbd_header_from_disk() cannot overflow a size_t.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
572
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700573/*
574 * Create a new header structure, translate header format from the on-disk
575 * header.
576 */
577static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500578 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700579{
Alex Elderccece232012-07-10 20:30:10 -0500580 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500581 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500582 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500583 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700584
Alex Elder6a523252012-07-19 17:12:59 -0500585 memset(header, 0, sizeof (*header));
586
Alex Elder103a1502012-08-02 11:29:45 -0500587 snap_count = le32_to_cpu(ondisk->snap_count);
588
Alex Elder58c17b02012-08-23 23:22:06 -0500589 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
590 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500591 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500593 memcpy(header->object_prefix, ondisk->object_prefix, len);
594 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600595
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700596 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500597 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
598
Alex Elder621901d2012-08-23 23:22:06 -0500599 /* Save a copy of the snapshot names */
600
Alex Elderf785cc12012-08-23 23:22:06 -0500601 if (snap_names_len > (u64) SIZE_MAX)
602 return -EIO;
603 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700604 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500605 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500606 /*
607 * Note that rbd_dev_v1_header_read() guarantees
608 * the ondisk buffer we're working with has
609 * snap_names_len bytes beyond the end of the
610 * snapshot id array, this memcpy() is safe.
611 */
612 memcpy(header->snap_names, &ondisk->snaps[snap_count],
613 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500614
Alex Elder621901d2012-08-23 23:22:06 -0500615 /* Record each snapshot's size */
616
Alex Elderd2bb24e2012-07-26 23:37:14 -0500617 size = snap_count * sizeof (*header->snap_sizes);
618 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500620 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500621 for (i = 0; i < snap_count; i++)
622 header->snap_sizes[i] =
623 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624 } else {
Alex Elderccece232012-07-10 20:30:10 -0500625 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700626 header->snap_names = NULL;
627 header->snap_sizes = NULL;
628 }
Alex Elder849b4262012-07-09 21:04:24 -0500629
Alex Elder34b13182012-07-13 20:35:12 -0500630 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631 header->obj_order = ondisk->options.order;
632 header->crypt_type = ondisk->options.crypt_type;
633 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500634
Alex Elder621901d2012-08-23 23:22:06 -0500635 /* Allocate and fill in the snapshot context */
636
Alex Elderf84344f2012-08-31 17:29:51 -0500637 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500638 size = sizeof (struct ceph_snap_context);
639 size += snap_count * sizeof (header->snapc->snaps[0]);
640 header->snapc = kzalloc(size, GFP_KERNEL);
641 if (!header->snapc)
642 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643
644 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500645 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500647 for (i = 0; i < snap_count; i++)
648 header->snapc->snaps[i] =
649 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650
651 return 0;
652
Alex Elder6a523252012-07-19 17:12:59 -0500653out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500654 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500655 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700656 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500657 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500658 kfree(header->object_prefix);
659 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500660
Alex Elder00f1f362012-02-07 12:03:36 -0600661 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700662}
663
Alex Elder8836b992012-08-30 14:42:15 -0500664static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666
Alex Eldere86924a2012-07-10 20:30:11 -0500667 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600668
Alex Eldere86924a2012-07-10 20:30:11 -0500669 list_for_each_entry(snap, &rbd_dev->snaps, node) {
670 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500671 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500672 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500673 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600674
Alex Eldere86924a2012-07-10 20:30:11 -0500675 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600676 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 }
Alex Eldere86924a2012-07-10 20:30:11 -0500678
Alex Elder00f1f362012-02-07 12:03:36 -0600679 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680}
681
Alex Elder819d52b2012-10-25 23:34:41 -0500682static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700683{
Alex Elder78dc4472012-07-19 08:49:18 -0500684 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500686 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800687 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500688 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500689 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500690 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500691 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500693 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700694 if (ret < 0)
695 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500696 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 }
Alex Elderdaba5fd2012-10-26 17:25:23 -0500698 rbd_dev->exists = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700700 return ret;
701}
702
/*
 * Free everything allocated by rbd_header_from_disk(), NULLing each
 * field so a repeated call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	/* snapc is reference counted, not directly freed */
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
714
Alex Elder65ccfe22012-08-09 10:33:26 -0700715static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716{
Alex Elder65ccfe22012-08-09 10:33:26 -0700717 char *name;
718 u64 segment;
719 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700720
Alex Elder65ccfe22012-08-09 10:33:26 -0700721 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
722 if (!name)
723 return NULL;
724 segment = offset >> rbd_dev->header.obj_order;
725 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
726 rbd_dev->header.object_prefix, segment);
727 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
728 pr_err("error formatting segment name for #%llu (%d)\n",
729 segment, ret);
730 kfree(name);
731 name = NULL;
732 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700733
Alex Elder65ccfe22012-08-09 10:33:26 -0700734 return name;
735}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700736
Alex Elder65ccfe22012-08-09 10:33:26 -0700737static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
738{
739 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700740
Alex Elder65ccfe22012-08-09 10:33:26 -0700741 return offset & (segment_size - 1);
742}
743
744static u64 rbd_segment_length(struct rbd_device *rbd_dev,
745 u64 offset, u64 length)
746{
747 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
748
749 offset &= segment_size - 1;
750
Alex Elderaafb2302012-09-06 16:00:54 -0500751 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700752 if (offset + length > segment_size)
753 length = segment_size - offset;
754
755 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700756}
757
/*
 * Number of segments (objects) spanned by the byte range
 * [ofs, ofs + len): 0 for an empty range, or -ERANGE if
 * ofs + len would overflow a u64.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
774
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700775/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700776 * returns the size of an object in the image
777 */
778static u64 rbd_obj_bytes(struct rbd_image_header *header)
779{
780 return 1 << header->obj_order;
781}
782
783/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700784 * bio helpers
785 */
786
787static void bio_chain_put(struct bio *chain)
788{
789 struct bio *tmp;
790
791 while (chain) {
792 tmp = chain;
793 chain = chain->bi_next;
794 bio_put(tmp);
795 }
796}
797
798/*
799 * zeros a bio chain, starting at specific offset
800 */
801static void zero_bio_chain(struct bio *chain, int start_ofs)
802{
803 struct bio_vec *bv;
804 unsigned long flags;
805 void *buf;
806 int i;
807 int pos = 0;
808
809 while (chain) {
810 bio_for_each_segment(bv, chain, i) {
811 if (pos + bv->bv_len > start_ofs) {
812 int remainder = max(start_ofs - pos, 0);
813 buf = bvec_kmap_irq(bv, &flags);
814 memset(buf + remainder, 0,
815 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200816 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700817 }
818 pos += bv->bv_len;
819 }
820
821 chain = chain->bi_next;
822 }
823}
824
825/*
Alex Elderf7760da2012-10-20 22:17:27 -0500826 * Clone a portion of a bio, starting at the given byte offset
827 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700828 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;		/* offset into the first cloned bvec */
	unsigned short end_idx;
	unsigned short vcnt;		/* number of bvecs in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Range must be non-empty and lie entirely within the source bio */

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* After the loop, resid is the used length of the final bvec */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	/* offset is in bytes; bi_sector is in 512-byte sectors */
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* First and last bvec are the same; it just holds len bytes */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700905
Alex Elderf7760da2012-10-20 22:17:27 -0500906/*
907 * Clone a portion of a bio chain, starting at the given byte offset
908 * into the first bio in the source chain and continuing for the
909 * number of bytes indicated. The result is another bio chain of
910 * exactly the given length, or a null pointer on error.
911 *
912 * The bio_src and offset parameters are both in-out. On entry they
913 * refer to the first source bio and the offset into that bio where
914 * the start of data to be cloned is located.
915 *
916 * On return, bio_src is updated to refer to the bio in the source
917 * chain that contains first un-cloned byte, and *offset will
918 * contain the offset of that byte within that bio.
919 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;	/* current source bio */
	unsigned int off = *offset;	/* byte offset within bi */
	struct bio *chain = NULL;	/* head of the resulting clone chain */
	struct bio **end;		/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone at most the rest of this bio, at most len bytes */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		/* Append the clone to the chain we're building */
		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio; advance to the next one */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* Report back where the first un-cloned byte lives */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Undo any clones already made before the failure */
	bio_chain_put(chain);

	return NULL;
}
966
967/*
968 * helpers for osd request op vectors.
969 */
Alex Elder57cfc102012-06-26 12:57:03 -0700970static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
971 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700972{
Alex Elder57cfc102012-06-26 12:57:03 -0700973 struct ceph_osd_req_op *ops;
974
975 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
976 if (!ops)
977 return NULL;
978
979 ops[0].op = opcode;
980
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700981 /*
982 * op extent offset and length will be set later on
983 * in calc_raw_layout()
984 */
Alex Elder57cfc102012-06-26 12:57:03 -0700985 ops[0].payload_len = payload_len;
986
987 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700988}
989
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
994
/*
 * Complete one request in a collection of requests that together make
 * up a single block-layer request @rq.  Records the status of entry
 * @index and then completes, in order, every prefix of entries that
 * have all finished.  With no collection (@coll == NULL) the whole
 * block request is completed at once.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection; complete the request directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* The queue lock serializes updates to the collection state */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Find how far the completed prefix now extends */
	while (max < coll->total && coll->status[max].done)
		max++;

	/* Requests must complete in order, so only the prefix is ended */
	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1032
/*
 * Convenience wrapper: complete the collection entry recorded in
 * @req with the given result and byte count.
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1038
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001039/*
1040 * Send ceph osd request
1041 */
/*
 * Send ceph osd request
 *
 * Builds and submits one OSD request for object @object_name covering
 * byte range [@ofs, @ofs + @len).  Data is carried either by @bio or
 * by @pages/@num_pages.  If @rbd_cb is non-NULL the request completes
 * asynchronously through that callback; otherwise this waits for the
 * reply and, if @ver is non-NULL, returns the reassert version in it.
 * With @linger_req the request is registered to linger (used for
 * watch) and a pointer to it is returned to the caller.
 * Returns 0 or a negative errno.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Even on failure the collection entry must be completed */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	/* Stash our context so the callback can find it */
	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() does not guarantee NUL termination;
	 * this relies on object_name being shorter than r_oid.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* One object per stripe unit; the object is the striping unit */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		/* Keep the request registered (used by the header watch) */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait for the reply here */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1152
1153/*
1154 * Ceph osd op callback
1155 */
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous OSD requests submitted by
 * rbd_do_op().  Decodes the reply, papers over short/absent reads by
 * zero-filling the bio data, completes the collection entry, and
 * releases the request and its bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	/* The op array immediately follows the reply head */
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading a nonexistent object: treat as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1192
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1197
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001198/*
1199 * Do a synchronous ceph osd operation
1200 */
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector big enough for @inbound_size bytes at
 * @ofs, issues the request via rbd_do_request() with no callback
 * (i.e. waits for completion), and for reads copies the returned
 * data into @inbound.  Returns the byte count (or 0) on success,
 * negative errno on failure.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback makes rbd_do_request() wait for the reply */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* On a successful read, ret is the number of bytes returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1241
1242/*
1243 * Do an asynchronous ceph osd operation
1244 */
/*
 * Do an asynchronous ceph osd operation
 *
 * Issue a single read or write (direction taken from @rq) covering
 * image byte range [@ofs, @ofs + @len), which the caller guarantees
 * lies within one segment.  Completion is reported through
 * rbd_req_cb() into collection entry @coll_index of @coll.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		/* Writes always go to the head, with the snap context */
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		/* Reads need no snap context, just the mapped snap id */
		snapc = NULL;
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1306
1307/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001308 * Request sync osd read
1309 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001310static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001311 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001312 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001313 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001314 char *buf,
1315 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001316{
Alex Elder913d2fd2012-06-26 12:57:03 -07001317 struct ceph_osd_req_op *ops;
1318 int ret;
1319
1320 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1321 if (!ops)
1322 return -ENOMEM;
1323
1324 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001325 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001326 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001327 ops, object_name, ofs, len, buf, NULL, ver);
1328 rbd_destroy_ops(ops);
1329
1330 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001331}
1332
1333/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001334 * Request sync osd watch
1335 */
/*
 * Acknowledge a watch notification (@notify_id at header version
 * @ver) on the image header object, so the notifying OSD does not
 * have to wait for its timeout.  The reply is discarded via
 * rbd_simple_req_cb().
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): notify_id is stored without cpu_to_le64(),
	 * unlike .ver above and the cookie in rbd_req_sync_watch() —
	 * presumably the encoder handles it; verify before changing.
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			     rbd_dev->header_name, 0, 0, NULL,
			     NULL, 0,
			     CEPH_OSD_FLAG_READ,
			     ops,
			     NULL, 0,
			     rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1362
1363static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1364{
Alex Elder0ce1a792012-07-03 16:01:18 -05001365 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001366 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001367 int rc;
1368
Alex Elder0ce1a792012-07-03 16:01:18 -05001369 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001370 return;
1371
Alex Elderbd919d42012-07-13 20:35:11 -05001372 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1373 rbd_dev->header_name, (unsigned long long) notify_id,
1374 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001375 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001376 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001377 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001378 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001379
Alex Elder7f0a24d2012-07-25 09:32:40 -05001380 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001381}
1382
1383/*
1384 * Request sync osd watch
1385 */
/*
 * Request sync osd watch
 *
 * Register a watch on the image header object so rbd_watch_cb()
 * is invoked when the header changes.  Creates the osd event,
 * then issues a lingering WATCH request recorded in
 * rbd_dev->watch_request.  Returns 0 or negative errno; on failure
 * the event (if created) is cancelled.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	/* The linger_req argument keeps the watch request alive */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1426
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001427/*
1428 * Request sync osd unwatch
1429 */
Alex Elder070c6332012-07-25 09:32:41 -05001430static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001431{
1432 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001433 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001434
Alex Elder57cfc102012-06-26 12:57:03 -07001435 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1436 if (!ops)
1437 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001438
1439 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001440 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001441 ops[0].watch.flag = 0;
1442
Alex Elder0ce1a792012-07-03 16:01:18 -05001443 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001444 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001445 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1446 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001447 rbd_dev->header_name,
1448 0, 0, NULL, NULL, NULL);
1449
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001450
1451 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001452 ceph_osdc_cancel_event(rbd_dev->watch_event);
1453 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001454 return ret;
1455}
1456
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001457/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001458 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001459 */
/*
 * Synchronous osd object method call
 *
 * Invoke @class_name.@method_name on @object_name via a CALL op,
 * sending @outbound/@outbound_size as the method's input and
 * receiving up to @inbound_size bytes of reply into @inbound.
 * Returns the rbd_req_sync_op() result (bytes or negative errno).
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       flags, ops,
			       object_name, 0, inbound_size, inbound,
			       NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1509
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001510static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1511{
1512 struct rbd_req_coll *coll =
1513 kzalloc(sizeof(struct rbd_req_coll) +
1514 sizeof(struct rbd_req_status) * num_reqs,
1515 GFP_ATOMIC);
1516
1517 if (!coll)
1518 return NULL;
1519 coll->total = num_reqs;
1520 kref_init(&coll->kref);
1521 return coll;
1522}
1523
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001524/*
1525 * block device queue callback
1526 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	/*
	 * Entered with q->queue_lock held (standard request_fn contract
	 * implied by the __blk_end_request_all() calls below).  The lock
	 * is dropped around everything that can block or allocate, and
	 * re-taken before ending the request / fetching the next one.
	 */
	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Drop the queue lock for the blocking work below */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/*
		 * A mapped snapshot may have been removed underneath us;
		 * fail the request rather than touching stale state.
		 */
		if (!rbd_dev->exists) {
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Pin the snapshot context so writes use a stable snapc */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		/* One completion-tracking slot per rbd object segment */
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* Split the request at rbd object boundaries */
		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			/* Each in-flight op holds a ref on the collection */
			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				/* Clone failed: complete this segment now */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						-ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop our own ref; outstanding ops hold the rest */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1633
1634/*
1635 * a queue callback. Makes sure that we don't create a bio that spans across
1636 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001637 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001638 */
1639static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1640 struct bio_vec *bvec)
1641{
1642 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed2012-10-20 22:17:27 -05001643 sector_t sector_offset;
1644 sector_t sectors_per_obj;
1645 sector_t obj_sector_offset;
1646 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001647
Alex Eldere5cfeed2012-10-20 22:17:27 -05001648 /*
1649 * Find how far into its rbd object the partition-relative
1650 * bio start sector is to offset relative to the enclosing
1651 * device.
1652 */
1653 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1654 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1655 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001656
Alex Eldere5cfeed2012-10-20 22:17:27 -05001657 /*
1658 * Compute the number of bytes from that offset to the end
1659 * of the object. Account for what's already used by the bio.
1660 */
1661 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1662 if (ret > bmd->bi_size)
1663 ret -= bmd->bi_size;
1664 else
1665 ret = 0;
1666
1667 /*
1668 * Don't send back more than was asked for. And if the bio
1669 * was empty, let the whole thing through because: "Note
1670 * that a block device *must* allow a single page to be
1671 * added to an empty bio."
1672 */
1673 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1674 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1675 ret = (int) bvec->bv_len;
1676
1677 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001678}
1679
1680static void rbd_free_disk(struct rbd_device *rbd_dev)
1681{
1682 struct gendisk *disk = rbd_dev->disk;
1683
1684 if (!disk)
1685 return;
1686
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001687 if (disk->flags & GENHD_FL_UP)
1688 del_gendisk(disk);
1689 if (disk->queue)
1690 blk_cleanup_queue(disk->queue);
1691 put_disk(disk);
1692}
1693
1694/*
Alex Elder4156d992012-08-02 11:29:46 -05001695 * Read the complete header for the given rbd device.
1696 *
1697 * Returns a pointer to a dynamically-allocated buffer containing
1698 * the complete and validated header. Caller can pass the address
1699 * of a variable that will be filled in with the version of the
1700 * header object at the time it was read.
1701 *
1702 * Returns a pointer-coded errno if a failure occurs.
1703 */
1704static struct rbd_image_header_ondisk *
1705rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1706{
1707 struct rbd_image_header_ondisk *ondisk = NULL;
1708 u32 snap_count = 0;
1709 u64 names_size = 0;
1710 u32 want_count;
1711 int ret;
1712
1713 /*
1714 * The complete header will include an array of its 64-bit
1715 * snapshot ids, followed by the names of those snapshots as
1716 * a contiguous block of NUL-terminated strings. Note that
1717 * the number of snapshots could change by the time we read
1718 * it in, in which case we re-read it.
1719 */
1720 do {
1721 size_t size;
1722
1723 kfree(ondisk);
1724
1725 size = sizeof (*ondisk);
1726 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1727 size += names_size;
1728 ondisk = kmalloc(size, GFP_KERNEL);
1729 if (!ondisk)
1730 return ERR_PTR(-ENOMEM);
1731
1732 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1733 rbd_dev->header_name,
1734 0, size,
1735 (char *) ondisk, version);
1736
1737 if (ret < 0)
1738 goto out_err;
1739 if (WARN_ON((size_t) ret < size)) {
1740 ret = -ENXIO;
1741 pr_warning("short header read for image %s"
1742 " (want %zd got %d)\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001743 rbd_dev->spec->image_name, size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05001744 goto out_err;
1745 }
1746 if (!rbd_dev_ondisk_valid(ondisk)) {
1747 ret = -ENXIO;
1748 pr_warning("invalid header for image %s\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001749 rbd_dev->spec->image_name);
Alex Elder4156d992012-08-02 11:29:46 -05001750 goto out_err;
1751 }
1752
1753 names_size = le64_to_cpu(ondisk->snap_names_len);
1754 want_count = snap_count;
1755 snap_count = le32_to_cpu(ondisk->snap_count);
1756 } while (snap_count != want_count);
1757
1758 return ondisk;
1759
1760out_err:
1761 kfree(ondisk);
1762
1763 return ERR_PTR(ret);
1764}
1765
/*
 * Re-read the on-disk image header (v1 format) and fill in *header.
 */
1769static int rbd_read_header(struct rbd_device *rbd_dev,
1770 struct rbd_image_header *header)
1771{
Alex Elder4156d992012-08-02 11:29:46 -05001772 struct rbd_image_header_ondisk *ondisk;
1773 u64 ver = 0;
1774 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001775
Alex Elder4156d992012-08-02 11:29:46 -05001776 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1777 if (IS_ERR(ondisk))
1778 return PTR_ERR(ondisk);
1779 ret = rbd_header_from_disk(header, ondisk);
1780 if (ret >= 0)
1781 header->obj_version = ver;
1782 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001783
Alex Elder4156d992012-08-02 11:29:46 -05001784 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001785}
1786
Alex Elder41f38c22012-10-25 23:34:40 -05001787static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001788{
1789 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001790 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001791
Alex Eldera0593292012-07-19 09:09:27 -05001792 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001793 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001794}
1795
Alex Elder94785542012-10-09 13:50:17 -07001796static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1797{
1798 sector_t size;
1799
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001800 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001801 return;
1802
1803 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1804 dout("setting size to %llu sectors", (unsigned long long) size);
1805 rbd_dev->mapping.size = (u64) size;
1806 set_capacity(rbd_dev->disk, size);
1807}
1808
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001809/*
1810 * only read the first part of the ondisk header, without the snaps info
1811 */
Alex Elder117973f2012-08-31 17:29:55 -05001812static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001813{
1814 int ret;
1815 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001816
1817 ret = rbd_read_header(rbd_dev, &h);
1818 if (ret < 0)
1819 return ret;
1820
Josh Durgina51aa0c2011-12-05 10:35:04 -08001821 down_write(&rbd_dev->header_rwsem);
1822
Alex Elder94785542012-10-09 13:50:17 -07001823 /* Update image size, and check for resize of mapped image */
1824 rbd_dev->header.image_size = h.image_size;
1825 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001826
Alex Elder849b4262012-07-09 21:04:24 -05001827 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001828 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001829 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001830 /* osd requests may still refer to snapc */
1831 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832
Alex Elderb8136232012-07-25 09:32:41 -05001833 if (hver)
1834 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001835 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001836 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001837 rbd_dev->header.snapc = h.snapc;
1838 rbd_dev->header.snap_names = h.snap_names;
1839 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001840 /* Free the extra copy of the object prefix */
1841 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1842 kfree(h.object_prefix);
1843
Alex Elder304f6802012-08-31 17:29:52 -05001844 ret = rbd_dev_snaps_update(rbd_dev);
1845 if (!ret)
1846 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001847
Josh Durginc6666012011-11-21 17:11:12 -08001848 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001849
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001850 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001851}
1852
Alex Elder117973f2012-08-31 17:29:55 -05001853static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001854{
1855 int ret;
1856
Alex Elder117973f2012-08-31 17:29:55 -05001857 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001858 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001859 if (rbd_dev->image_format == 1)
1860 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1861 else
1862 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001863 mutex_unlock(&ctl_mutex);
1864
1865 return ret;
1866}
1867
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device.  Returns 0 on success, -ENOMEM on allocation failure.
 * The disk is not yet added; the caller is responsible for that.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning rbd object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1916
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917/*
1918 sysfs
1919*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001920
Alex Elder593a9e72012-02-07 12:03:37 -06001921static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1922{
1923 return container_of(dev, struct rbd_device, dev);
1924}
1925
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001926static ssize_t rbd_size_show(struct device *dev,
1927 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001928{
Alex Elder593a9e72012-02-07 12:03:37 -06001929 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001930 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001931
Josh Durgina51aa0c2011-12-05 10:35:04 -08001932 down_read(&rbd_dev->header_rwsem);
1933 size = get_capacity(rbd_dev->disk);
1934 up_read(&rbd_dev->header_rwsem);
1935
1936 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001937}
1938
Alex Elder34b13182012-07-13 20:35:12 -05001939/*
1940 * Note this shows the features for whatever's mapped, which is not
1941 * necessarily the base image.
1942 */
1943static ssize_t rbd_features_show(struct device *dev,
1944 struct device_attribute *attr, char *buf)
1945{
1946 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1947
1948 return sprintf(buf, "0x%016llx\n",
1949 (unsigned long long) rbd_dev->mapping.features);
1950}
1951
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001952static ssize_t rbd_major_show(struct device *dev,
1953 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001954{
Alex Elder593a9e72012-02-07 12:03:37 -06001955 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001956
1957 return sprintf(buf, "%d\n", rbd_dev->major);
1958}
1959
1960static ssize_t rbd_client_id_show(struct device *dev,
1961 struct device_attribute *attr, char *buf)
1962{
Alex Elder593a9e72012-02-07 12:03:37 -06001963 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001964
Alex Elder1dbb4392012-01-24 10:08:37 -06001965 return sprintf(buf, "client%lld\n",
1966 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001967}
1968
1969static ssize_t rbd_pool_show(struct device *dev,
1970 struct device_attribute *attr, char *buf)
1971{
Alex Elder593a9e72012-02-07 12:03:37 -06001972 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001973
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001974 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001975}
1976
Alex Elder9bb2f332012-07-12 10:46:35 -05001977static ssize_t rbd_pool_id_show(struct device *dev,
1978 struct device_attribute *attr, char *buf)
1979{
1980 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1981
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001982 return sprintf(buf, "%llu\n",
1983 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05001984}
1985
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001986static ssize_t rbd_name_show(struct device *dev,
1987 struct device_attribute *attr, char *buf)
1988{
Alex Elder593a9e72012-02-07 12:03:37 -06001989 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001990
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001991 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001992}
1993
Alex Elder589d30e2012-07-10 20:30:11 -05001994static ssize_t rbd_image_id_show(struct device *dev,
1995 struct device_attribute *attr, char *buf)
1996{
1997 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1998
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001999 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002000}
2001
Alex Elder34b13182012-07-13 20:35:12 -05002002/*
2003 * Shows the name of the currently-mapped snapshot (or
2004 * RBD_SNAP_HEAD_NAME for the base image).
2005 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002006static ssize_t rbd_snap_show(struct device *dev,
2007 struct device_attribute *attr,
2008 char *buf)
2009{
Alex Elder593a9e72012-02-07 12:03:37 -06002010 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002011
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002012 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002013}
2014
2015static ssize_t rbd_image_refresh(struct device *dev,
2016 struct device_attribute *attr,
2017 const char *buf,
2018 size_t size)
2019{
Alex Elder593a9e72012-02-07 12:03:37 -06002020 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002021 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002022
Alex Elder117973f2012-08-31 17:29:55 -05002023 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002024
2025 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002026}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002027
/*
 * Per-device sysfs attributes.  All are read-only except "refresh",
 * which is a write-only trigger (see rbd_image_refresh()).
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002038
/* Attribute table wired into rbd_device_type below. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
	/* Intentionally empty: no per-device sysfs teardown is needed here */
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2071
2072
2073/*
2074 sysfs - snapshots
2075*/
2076
2077static ssize_t rbd_snap_size_show(struct device *dev,
2078 struct device_attribute *attr,
2079 char *buf)
2080{
2081 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2082
Josh Durgin35915382011-12-05 18:25:13 -08002083 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002084}
2085
2086static ssize_t rbd_snap_id_show(struct device *dev,
2087 struct device_attribute *attr,
2088 char *buf)
2089{
2090 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2091
Josh Durgin35915382011-12-05 18:25:13 -08002092 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002093}
2094
Alex Elder34b13182012-07-13 20:35:12 -05002095static ssize_t rbd_snap_features_show(struct device *dev,
2096 struct device_attribute *attr,
2097 char *buf)
2098{
2099 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2100
2101 return sprintf(buf, "0x%016llx\n",
2102 (unsigned long long) snap->features);
2103}
2104
/* Per-snapshot sysfs attributes (all read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002108
/* Attribute table wired into rbd_snap_device_type below. */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2119
2120static void rbd_snap_dev_release(struct device *dev)
2121{
2122 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2123 kfree(snap->name);
2124 kfree(snap);
2125}
2126
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Device type for snapshot sysfs devices; release frees the rbd_snap. */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2136
Alex Elder8b8fb992012-10-26 17:25:24 -05002137static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2138{
2139 kref_get(&spec->kref);
2140
2141 return spec;
2142}
2143
2144static void rbd_spec_free(struct kref *kref);
2145static void rbd_spec_put(struct rbd_spec *spec)
2146{
2147 if (spec)
2148 kref_put(&spec->kref, rbd_spec_free);
2149}
2150
2151static struct rbd_spec *rbd_spec_alloc(void)
2152{
2153 struct rbd_spec *spec;
2154
2155 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2156 if (!spec)
2157 return NULL;
2158 kref_init(&spec->kref);
2159
2160 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2161
2162 return spec;
2163}
2164
2165static void rbd_spec_free(struct kref *kref)
2166{
2167 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2168
2169 kfree(spec->pool_name);
2170 kfree(spec->image_id);
2171 kfree(spec->image_name);
2172 kfree(spec->snap_name);
2173 kfree(spec);
2174}
2175
Alex Elder304f6802012-08-31 17:29:52 -05002176static bool rbd_snap_registered(struct rbd_snap *snap)
2177{
2178 bool ret = snap->dev.type == &rbd_snap_device_type;
2179 bool reg = device_is_registered(&snap->dev);
2180
2181 rbd_assert(!ret ^ reg);
2182
2183 return ret;
2184}
2185
Alex Elder41f38c22012-10-25 23:34:40 -05002186static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002187{
2188 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002189 if (device_is_registered(&snap->dev))
2190 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002191}
2192
Alex Elder14e70852012-07-19 09:09:27 -05002193static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002194 struct device *parent)
2195{
2196 struct device *dev = &snap->dev;
2197 int ret;
2198
2199 dev->type = &rbd_snap_device_type;
2200 dev->parent = parent;
2201 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002202 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002203 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2204
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002205 ret = device_register(dev);
2206
2207 return ret;
2208}
2209
Alex Elder4e891e02012-07-10 20:30:10 -05002210static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002211 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002212 u64 snap_id, u64 snap_size,
2213 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002214{
Alex Elder4e891e02012-07-10 20:30:10 -05002215 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002216 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002217
2218 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002219 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002220 return ERR_PTR(-ENOMEM);
2221
2222 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002223 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002224 if (!snap->name)
2225 goto err;
2226
Alex Elderc8d18422012-07-10 20:30:11 -05002227 snap->id = snap_id;
2228 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002229 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002230
2231 return snap;
2232
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002233err:
2234 kfree(snap->name);
2235 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002236
2237 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002238}
2239
Alex Eldercd892122012-07-03 16:01:19 -05002240static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2241 u64 *snap_size, u64 *snap_features)
2242{
2243 char *snap_name;
2244
2245 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2246
2247 *snap_size = rbd_dev->header.snap_sizes[which];
2248 *snap_features = 0; /* No features for v1 */
2249
2250 /* Skip over names until we find the one we are looking for */
2251
2252 snap_name = rbd_dev->header.snap_names;
2253 while (which--)
2254 snap_name += strlen(snap_name) + 1;
2255
2256 return snap_name;
2257}
2258
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002259/*
Alex Elder9d475de2012-07-03 16:01:19 -05002260 * Get the size and object order for an image snapshot, or if
2261 * snap_id is CEPH_NOSNAP, gets this information for the base
2262 * image.
2263 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" reply; must stay packed */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* order is a single byte; size arrives little-endian */
	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2292
2293static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2294{
2295 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2296 &rbd_dev->header.obj_order,
2297 &rbd_dev->header.image_size);
2298}
2299
/*
 * Fetch the v2 image's object prefix from the OSD and stash it in the
 * cached header.  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string out of the reply buffer */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		/* Don't leave an ERR_PTR behind in the header */
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2337
/*
 * Fetch the feature bits for a v2 image snapshot (or the base image
 * when snap_id is CEPH_NOSNAP).  Fails with -ENOTSUPP if the image
 * uses any incompatible feature this driver does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image with feature bits we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2371
2372static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2373{
2374 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2375 &rbd_dev->header.features);
2376}
2377
Alex Elder6e14b1a2012-07-03 16:01:19 -05002378static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002379{
2380 size_t size;
2381 int ret;
2382 void *reply_buf;
2383 void *p;
2384 void *end;
2385 u64 seq;
2386 u32 snap_count;
2387 struct ceph_snap_context *snapc;
2388 u32 i;
2389
2390 /*
2391 * We'll need room for the seq value (maximum snapshot id),
2392 * snapshot count, and array of that many snapshot ids.
2393 * For now we have a fixed upper limit on the number we're
2394 * prepared to receive.
2395 */
2396 size = sizeof (__le64) + sizeof (__le32) +
2397 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2398 reply_buf = kzalloc(size, GFP_KERNEL);
2399 if (!reply_buf)
2400 return -ENOMEM;
2401
2402 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2403 "rbd", "get_snapcontext",
2404 NULL, 0,
2405 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002406 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002407 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2408 if (ret < 0)
2409 goto out;
2410
2411 ret = -ERANGE;
2412 p = reply_buf;
2413 end = (char *) reply_buf + size;
2414 ceph_decode_64_safe(&p, end, seq, out);
2415 ceph_decode_32_safe(&p, end, snap_count, out);
2416
2417 /*
2418 * Make sure the reported number of snapshot ids wouldn't go
2419 * beyond the end of our buffer. But before checking that,
2420 * make sure the computed size of the snapshot context we
2421 * allocate is representable in a size_t.
2422 */
2423 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2424 / sizeof (u64)) {
2425 ret = -EINVAL;
2426 goto out;
2427 }
2428 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2429 goto out;
2430
2431 size = sizeof (struct ceph_snap_context) +
2432 snap_count * sizeof (snapc->snaps[0]);
2433 snapc = kmalloc(size, GFP_KERNEL);
2434 if (!snapc) {
2435 ret = -ENOMEM;
2436 goto out;
2437 }
2438
2439 atomic_set(&snapc->nref, 1);
2440 snapc->seq = seq;
2441 snapc->num_snaps = snap_count;
2442 for (i = 0; i < snap_count; i++)
2443 snapc->snaps[i] = ceph_decode_64(&p);
2444
2445 rbd_dev->header.snapc = snapc;
2446
2447 dout(" snap context seq = %llu, snap_count = %u\n",
2448 (unsigned long long) seq, (unsigned int) snap_count);
2449
2450out:
2451 kfree(reply_buf);
2452
2453 return 0;
2454}
2455
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002456static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2457{
2458 size_t size;
2459 void *reply_buf;
2460 __le64 snap_id;
2461 int ret;
2462 void *p;
2463 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002464 char *snap_name;
2465
2466 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2467 reply_buf = kmalloc(size, GFP_KERNEL);
2468 if (!reply_buf)
2469 return ERR_PTR(-ENOMEM);
2470
2471 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2472 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2473 "rbd", "get_snapshot_name",
2474 (char *) &snap_id, sizeof (snap_id),
2475 reply_buf, size,
2476 CEPH_OSD_FLAG_READ, NULL);
2477 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2478 if (ret < 0)
2479 goto out;
2480
2481 p = reply_buf;
2482 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05002483 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002484 if (IS_ERR(snap_name)) {
2485 ret = PTR_ERR(snap_name);
2486 goto out;
2487 } else {
2488 dout(" snap_id 0x%016llx snap_name = %s\n",
2489 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2490 }
2491 kfree(reply_buf);
2492
2493 return snap_name;
2494out:
2495 kfree(reply_buf);
2496
2497 return ERR_PTR(ret);
2498}
2499
2500static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2501 u64 *snap_size, u64 *snap_features)
2502{
2503 __le64 snap_id;
2504 u8 order;
2505 int ret;
2506
2507 snap_id = rbd_dev->header.snapc->snaps[which];
2508 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2509 if (ret)
2510 return ERR_PTR(ret);
2511 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2512 if (ret)
2513 return ERR_PTR(ret);
2514
2515 return rbd_dev_v2_snap_name(rbd_dev, which);
2516}
2517
2518static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2519 u64 *snap_size, u64 *snap_features)
2520{
2521 if (rbd_dev->image_format == 1)
2522 return rbd_dev_v1_snap_info(rbd_dev, which,
2523 snap_size, snap_features);
2524 if (rbd_dev->image_format == 2)
2525 return rbd_dev_v2_snap_info(rbd_dev, which,
2526 snap_size, snap_features);
2527 return ERR_PTR(-EINVAL);
2528}
2529
Alex Elder117973f2012-08-31 17:29:55 -05002530static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2531{
2532 int ret;
2533 __u8 obj_order;
2534
2535 down_write(&rbd_dev->header_rwsem);
2536
2537 /* Grab old order first, to see if it changes */
2538
2539 obj_order = rbd_dev->header.obj_order,
2540 ret = rbd_dev_v2_image_size(rbd_dev);
2541 if (ret)
2542 goto out;
2543 if (rbd_dev->header.obj_order != obj_order) {
2544 ret = -EIO;
2545 goto out;
2546 }
2547 rbd_update_mapping_size(rbd_dev);
2548
2549 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2550 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2551 if (ret)
2552 goto out;
2553 ret = rbd_dev_snaps_update(rbd_dev);
2554 dout("rbd_dev_snaps_update returned %d\n", ret);
2555 if (ret)
2556 goto out;
2557 ret = rbd_dev_snaps_register(rbd_dev);
2558 dout("rbd_dev_snaps_register returned %d\n", ret);
2559out:
2560 up_write(&rbd_dev->header_rwsem);
2561
2562 return ret;
2563}
2564
Alex Elder9d475de2012-07-03 16:01:19 -05002565/*
Alex Elder35938152012-08-02 11:29:46 -05002566 * Scan the rbd device's current snapshot list and compare it to the
2567 * newly-received snapshot context. Remove any existing snapshots
2568 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2570 * And verify there are no changes to snapshots we already know
2571 * about.
2572 *
2573 * Assumes the snapshots in the snapshot context are sorted by
2574 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2575 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002576 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/*
	 * Merge two sorted sequences: snapc->snaps[index..] (the new
	 * context) and the existing rbd_snap list (via "links").
	 * CEPH_NOSNAP acts as a sentinel when the context is exhausted.
	 */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			/* Save the successor before the entry is removed */
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If the removed snapshot is the one mapped, mark it gone */
			if (rbd_dev->spec->snap_id == snap->id)
				rbd_dev->exists = false;
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		/*
		 * NOTE(review): snap_name is allocated by the lookup below
		 * (for format 2 images) but does not appear to be freed on
		 * the "already present" path — possible leak; confirm who
		 * owns the string after __rbd_add_snap_dev().
		 */
		snap_name = rbd_dev_snap_info(rbd_dev, index,
						&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/*
		 * NOTE(review): this message prints snap_count rather than
		 * index — looks like a typo in the debug output; confirm.
		 */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			/* NOTE(review): the "%s" arg embeds an extra "\n" */
			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* The snapshot's metadata must not have changed */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2669
Alex Elder304f6802012-08-31 17:29:52 -05002670/*
2671 * Scan the list of snapshots and register the devices for any that
2672 * have not already been registered.
2673 */
2674static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2675{
2676 struct rbd_snap *snap;
2677 int ret = 0;
2678
2679 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002680 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2681 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002682
2683 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2684 if (!rbd_snap_registered(snap)) {
2685 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2686 if (ret < 0)
2687 break;
2688 }
2689 }
2690 dout("%s: returning %d\n", __func__, ret);
2691
2692 return ret;
2693}
2694
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002695static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2696{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002697 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002698 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002699
2700 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002701
Alex Eldercd789ab2012-08-30 00:16:38 -05002702 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002703 dev->bus = &rbd_bus_type;
2704 dev->type = &rbd_device_type;
2705 dev->parent = &rbd_root_dev;
2706 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002707 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002708 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002709
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002710 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002711
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002712 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002713}
2714
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002715static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2716{
2717 device_unregister(&rbd_dev->dev);
2718}
2719
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002720static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2721{
2722 int ret, rc;
2723
2724 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002725 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002726 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002727 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002728 if (rc < 0)
2729 return rc;
2730 }
2731 } while (ret == -ERANGE);
2732
2733 return ret;
2734}
2735
/* Highest device id handed out so far; ids are allocated starting at 1 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002737
2738/*
Alex Elder499afd52012-02-02 08:13:29 -06002739 * Get a unique rbd identifier for the given new rbd_dev, and add
2740 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002741 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() makes id allocation lock-free; first id is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* rbd_dev_list is shared; additions require rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002752
Alex Elder1ddbe942012-01-29 13:57:44 -06002753/*
Alex Elder499afd52012-02-02 08:13:29 -06002754 * Remove an rbd_dev from the global list, and record that its
2755 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002756 */
Alex Eldere2839302012-08-29 17:11:06 -05002757static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002758{
Alex Elderd184f6b2012-01-29 13:57:44 -06002759 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002760 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002761 int max_id;
2762
Alex Elderaafb2302012-09-06 16:00:54 -05002763 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002764
Alex Eldere2839302012-08-29 17:11:06 -05002765 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2766 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002767 spin_lock(&rbd_dev_list_lock);
2768 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002769
2770 /*
2771 * If the id being "put" is not the current maximum, there
2772 * is nothing special we need to do.
2773 */
Alex Eldere2839302012-08-29 17:11:06 -05002774 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002775 spin_unlock(&rbd_dev_list_lock);
2776 return;
2777 }
2778
2779 /*
2780 * We need to update the current maximum id. Search the
2781 * list to find out what it is. We're more likely to find
2782 * the maximum at the end, so search the list backward.
2783 */
2784 max_id = 0;
2785 list_for_each_prev(tmp, &rbd_dev_list) {
2786 struct rbd_device *rbd_dev;
2787
2788 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07002789 if (rbd_dev->dev_id > max_id)
2790 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002791 }
Alex Elder499afd52012-02-02 08:13:29 -06002792 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002793
Alex Elder1ddbe942012-01-29 13:57:44 -06002794 /*
Alex Eldere2839302012-08-29 17:11:06 -05002795 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002796 * which case it now accurately reflects the new maximum.
2797 * Be careful not to overwrite the maximum value in that
2798 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002799 */
Alex Eldere2839302012-08-29 17:11:06 -05002800 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2801 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002802}
2803
Alex Eldera725f65e2012-02-02 08:13:30 -06002804/*
Alex Eldere28fff262012-02-02 08:13:30 -06002805 * Skips over white space at *buf, and updates *buf to point to the
2806 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002807 * the token (string of non-white space characters) found. Note
2808 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002809 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";

	*buf += strspn(*buf, whitespace);	/* skip leading white space */

	return strcspn(*buf, whitespace);	/* length of the token found */
}
2822
2823/*
2824 * Finds the next token in *buf, and if the provided token buffer is
2825 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002826 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2827 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002828 *
2829 * Returns the length of the token found (not including the '\0').
2830 * Return value will be 0 if no token is found, and it will be >=
2831 * token_size if the token would not fit.
2832 *
Alex Elder593a9e72012-02-07 12:03:37 -06002833 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002834 * found token. Note that this occurs even if the token buffer is
2835 * too small to hold it.
2836 */
2837static inline size_t copy_token(const char **buf,
2838 char *token,
2839 size_t token_size)
2840{
2841 size_t len;
2842
2843 len = next_token(buf);
2844 if (len < token_size) {
2845 memcpy(token, *buf, len);
2846 *(token + len) = '\0';
2847 }
2848 *buf += len;
2849
2850 return len;
2851}
2852
2853/*
Alex Elderea3352f2012-07-09 21:04:23 -05002854 * Finds the next token in *buf, dynamically allocates a buffer big
2855 * enough to hold a copy of it, and copies the token into the new
2856 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2857 * that a duplicate buffer is created even for a zero-length token.
2858 *
2859 * Returns a pointer to the newly-allocated duplicate, or a null
2860 * pointer if memory for the duplicate was not available. If
2861 * the lenp argument is a non-null pointer, the length of the token
2862 * (not including the '\0') is returned in *lenp.
2863 *
2864 * If successful, the *buf pointer will be updated to point beyond
2865 * the end of the found token.
2866 *
2867 * Note: uses GFP_KERNEL for allocation.
2868 */
2869static inline char *dup_token(const char **buf, size_t *lenp)
2870{
2871 char *dup;
2872 size_t len;
2873
2874 len = next_token(buf);
2875 dup = kmalloc(len + 1, GFP_KERNEL);
2876 if (!dup)
2877 return NULL;
2878
2879 memcpy(dup, *buf, len);
2880 *(dup + len) = '\0';
2881 *buf += len;
2882
2883 if (lenp)
2884 *lenp = len;
2885
2886 return dup;
2887}
2888
/*
 * Parse the monitor address(es), ceph and rbd options, pool name,
 * image name, and optional snapshot name provided via /sys/bus/rbd/add,
 * filling in the corresponding fields of the given rbd_dev's spec and
 * returning the parsed option structures through *ceph_opts and *opts.
 * Returns 0 on success or a negative errno otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
Alex Elderdc79b112012-10-25 23:34:41 -05002899static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2900 const char *buf,
2901 struct ceph_options **ceph_opts,
2902 struct rbd_options **opts)
Alex Eldera725f65e2012-02-02 08:13:30 -06002903{
Alex Elderd22f76e2012-07-12 10:46:35 -05002904 size_t len;
Alex Elder0ddebc02012-10-25 23:34:41 -05002905 const char *mon_addrs;
2906 size_t mon_addrs_size;
Alex Elderf28e5652012-10-25 23:34:41 -05002907 char *options;
Alex Elder4e9afeb2012-10-25 23:34:41 -05002908 struct rbd_options *rbd_opts = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05002909 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002910
2911 /* The first four tokens are required */
2912
Alex Elder7ef32142012-02-02 08:13:30 -06002913 len = next_token(&buf);
2914 if (!len)
Alex Elderdc79b112012-10-25 23:34:41 -05002915 return -EINVAL; /* Missing monitor address(es) */
Alex Elder0ddebc02012-10-25 23:34:41 -05002916 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05002917 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002918 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002919
Alex Elderdc79b112012-10-25 23:34:41 -05002920 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05002921 options = dup_token(&buf, NULL);
2922 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05002923 return -ENOMEM;
Alex Elderf28e5652012-10-25 23:34:41 -05002924 if (!*options)
2925 goto out_err; /* Missing options */
Alex Eldera725f65e2012-02-02 08:13:30 -06002926
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002927 rbd_dev->spec->pool_name = dup_token(&buf, NULL);
2928 if (!rbd_dev->spec->pool_name)
Alex Elderf28e5652012-10-25 23:34:41 -05002929 goto out_mem;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002930 if (!*rbd_dev->spec->pool_name)
Alex Elderf28e5652012-10-25 23:34:41 -05002931 goto out_err; /* Missing pool name */
Alex Eldere28fff262012-02-02 08:13:30 -06002932
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002933 rbd_dev->spec->image_name =
2934 dup_token(&buf, &rbd_dev->spec->image_name_len);
2935 if (!rbd_dev->spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05002936 goto out_mem;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002937 if (!*rbd_dev->spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05002938 goto out_err; /* Missing image name */
Alex Eldere28fff262012-02-02 08:13:30 -06002939
Alex Elderf28e5652012-10-25 23:34:41 -05002940 /*
2941 * Snapshot name is optional; default is to use "-"
2942 * (indicating the head/no snapshot).
2943 */
Alex Elder3feeb8942012-08-31 17:29:52 -05002944 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05002945 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05002946 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2947 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05002948 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05002949 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05002950 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05002951 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002952 rbd_dev->spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
2953 if (!rbd_dev->spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05002954 goto out_mem;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002955 memcpy(rbd_dev->spec->snap_name, buf, len);
2956 *(rbd_dev->spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05002957
Alex Elder0ddebc02012-10-25 23:34:41 -05002958 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06002959
Alex Elder4e9afeb2012-10-25 23:34:41 -05002960 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
2961 if (!rbd_opts)
2962 goto out_mem;
2963
2964 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05002965
Alex Elderdc79b112012-10-25 23:34:41 -05002966 *ceph_opts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05002967 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05002968 parse_rbd_opts_token, rbd_opts);
Alex Elderf28e5652012-10-25 23:34:41 -05002969 kfree(options);
Alex Elderdc79b112012-10-25 23:34:41 -05002970 if (IS_ERR(*ceph_opts)) {
2971 ret = PTR_ERR(*ceph_opts);
2972 goto out_err;
2973 }
Alex Elder4e9afeb2012-10-25 23:34:41 -05002974 *opts = rbd_opts;
Alex Elder0ddebc02012-10-25 23:34:41 -05002975
Alex Elderdc79b112012-10-25 23:34:41 -05002976 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05002977out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05002978 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05002979out_err:
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002980 kfree(rbd_dev->spec->image_name);
2981 rbd_dev->spec->image_name = NULL;
2982 rbd_dev->spec->image_name_len = 0;
2983 kfree(rbd_dev->spec->pool_name);
2984 rbd_dev->spec->pool_name = NULL;
Alex Elderf28e5652012-10-25 23:34:41 -05002985 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05002986
Alex Elderdc79b112012-10-25 23:34:41 -05002987 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002988}
2989
Alex Elder589d30e2012-07-10 20:30:11 -05002990/*
2991 * An rbd format 2 image has a unique identifier, distinct from the
2992 * name given to it by the user. Internally, that identifier is
2993 * what's used to specify the names of objects related to the image.
2994 *
2995 * A special "rbd id" object is used to map an rbd image name to its
2996 * id. If that object doesn't exist, then there is no v2 rbd image
2997 * with the supplied name.
2998 *
2999 * This function will record the given rbd_dev's image_id field if
3000 * it can be determined, and in that case will return 0. If any
3001 * errors occur a negative errno will be returned and the rbd_dev's
3002 * image_id field will be unchanged (and should be NULL).
3003 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the terminating NUL, so this covers the name */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	/* On decode failure leave image_id NULL, as the caller expects */
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3059
Alex Eldera30b71b2012-07-10 20:30:11 -05003060static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3061{
3062 int ret;
3063 size_t size;
3064
3065 /* Version 1 images have no id; empty string is used */
3066
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003067 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3068 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003069 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003070 rbd_dev->spec->image_id_len = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05003071
3072 /* Record the header object name for this rbd image. */
3073
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003074 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003075 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3076 if (!rbd_dev->header_name) {
3077 ret = -ENOMEM;
3078 goto out_err;
3079 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003080 sprintf(rbd_dev->header_name, "%s%s",
3081 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003082
3083 /* Populate rbd image metadata */
3084
3085 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3086 if (ret < 0)
3087 goto out_err;
3088 rbd_dev->image_format = 1;
3089
3090 dout("discovered version 1 image, header name is %s\n",
3091 rbd_dev->header_name);
3092
3093 return 0;
3094
3095out_err:
3096 kfree(rbd_dev->header_name);
3097 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003098 kfree(rbd_dev->spec->image_id);
3099 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003100
3101 return ret;
3102}
3103
/*
 * Probe a format 2 ("v2") rbd image: build the header object name from
 * the (already-determined) image id, then populate the in-core header
 * with the image's size, object prefix, features, and snapshot
 * context.  Returns 0 on success or a negative errno; on failure the
 * header name and object prefix allocations are released.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3165
3166/*
3167 * Probe for the existence of the header object for the given rbd
3168 * device. For format 2 images this includes determining the image
3169 * id.
3170 */
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.  Returns 0 on success, negative errno otherwise.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to read the image id object first.  A format 1 image has
	 * no such object, so ENOENT there means we should fall back to
	 * the version 1 probe.
	 */
	if (rbd_dev_image_id(rbd_dev))
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);

	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3190
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003191static ssize_t rbd_add(struct bus_type *bus,
3192 const char *buf,
3193 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003194{
Alex Eldercb8627c2012-07-09 21:04:23 -05003195 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05003196 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003197 struct rbd_options *rbd_opts = NULL;
Alex Elder27cc2592012-02-02 08:13:30 -06003198 struct ceph_osd_client *osdc;
3199 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003200
3201 if (!try_module_get(THIS_MODULE))
3202 return -ENODEV;
3203
Alex Eldercb8627c2012-07-09 21:04:23 -05003204 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3205 if (!rbd_dev)
Alex Elder4e9afeb2012-10-25 23:34:41 -05003206 return -ENOMEM;
Alex Elder8b8fb992012-10-26 17:25:24 -05003207 rbd_dev->spec = rbd_spec_alloc();
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003208 if (!rbd_dev->spec)
3209 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003210
3211 /* static rbd_device initialization */
3212 spin_lock_init(&rbd_dev->lock);
3213 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003214 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08003215 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003216
Alex Eldera725f65e2012-02-02 08:13:30 -06003217 /* parse add command */
Alex Elderdc79b112012-10-25 23:34:41 -05003218 rc = rbd_add_parse_args(rbd_dev, buf, &ceph_opts, &rbd_opts);
3219 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003220 goto err_out_mem;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003221 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Eldera725f65e2012-02-02 08:13:30 -06003222
Alex Elder78cea762012-10-25 23:34:41 -05003223 rc = rbd_get_client(rbd_dev, ceph_opts);
3224 if (rc < 0)
Alex Elder0ddebc02012-10-25 23:34:41 -05003225 goto err_out_args;
Alex Elder78cea762012-10-25 23:34:41 -05003226 ceph_opts = NULL; /* ceph_opts now owned by rbd_dev client */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003227
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003228 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06003229 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003230 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003231 if (rc < 0)
3232 goto err_out_client;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003233 rbd_dev->spec->pool_id = (u64) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003234
Alex Eldera30b71b2012-07-10 20:30:11 -05003235 rc = rbd_dev_probe(rbd_dev);
3236 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05003237 goto err_out_client;
Alex Elder05fd6f62012-08-29 17:11:07 -05003238
3239 /* no need to lock here, as rbd_dev is not registered yet */
3240 rc = rbd_dev_snaps_update(rbd_dev);
3241 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003242 goto err_out_probe;
Alex Elder05fd6f62012-08-29 17:11:07 -05003243
Alex Elder819d52b2012-10-25 23:34:41 -05003244 rc = rbd_dev_set_mapping(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05003245 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003246 goto err_out_snaps;
Alex Elder05fd6f62012-08-29 17:11:07 -05003247
Alex Elder85ae8922012-07-26 23:37:14 -05003248 /* generate unique id: find highest unique id, add one */
3249 rbd_dev_id_get(rbd_dev);
3250
3251 /* Fill in the device name, now that we have its id. */
3252 BUILD_BUG_ON(DEV_NAME_LEN
3253 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3254 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3255
3256 /* Get our block major device number. */
3257
Alex Elder27cc2592012-02-02 08:13:30 -06003258 rc = register_blkdev(0, rbd_dev->name);
3259 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003260 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06003261 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003262
Alex Elder0f308a32012-08-29 17:11:07 -05003263 /* Set up the blkdev mapping. */
3264
3265 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003266 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003267 goto err_out_blkdev;
3268
Alex Elder0f308a32012-08-29 17:11:07 -05003269 rc = rbd_bus_add_dev(rbd_dev);
3270 if (rc)
3271 goto err_out_disk;
3272
Alex Elder32eec682012-02-08 16:11:14 -06003273 /*
3274 * At this point cleanup in the event of an error is the job
3275 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06003276 */
Alex Elder2ac4e752012-07-10 20:30:10 -05003277
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003278 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05003279 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003280 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05003281 if (rc)
3282 goto err_out_bus;
3283
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003284 rc = rbd_init_watch_dev(rbd_dev);
3285 if (rc)
3286 goto err_out_bus;
3287
Alex Elder4e9afeb2012-10-25 23:34:41 -05003288 kfree(rbd_opts);
3289
Alex Elder3ee40012012-08-29 17:11:07 -05003290 /* Everything's ready. Announce the disk to the world. */
3291
3292 add_disk(rbd_dev->disk);
3293
3294 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3295 (unsigned long long) rbd_dev->mapping.size);
3296
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003297 return count;
3298
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003299err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003300 /* this will also clean up rest of rbd_dev stuff */
3301
3302 rbd_bus_del_dev(rbd_dev);
Alex Elder4e9afeb2012-10-25 23:34:41 -05003303 kfree(rbd_opts);
3304
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003305 return rc;
3306
Alex Elder0f308a32012-08-29 17:11:07 -05003307err_out_disk:
3308 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003309err_out_blkdev:
3310 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05003311err_out_id:
3312 rbd_dev_id_put(rbd_dev);
Alex Elder41f38c22012-10-25 23:34:40 -05003313err_out_snaps:
3314 rbd_remove_all_snaps(rbd_dev);
3315err_out_probe:
Alex Elder05fd6f62012-08-29 17:11:07 -05003316 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003317err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05003318 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003319 rbd_put_client(rbd_dev);
Alex Elder0ddebc02012-10-25 23:34:41 -05003320err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05003321 if (ceph_opts)
3322 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05003323 kfree(rbd_opts);
Alex Elder85ae8922012-07-26 23:37:14 -05003324err_out_mem:
Alex Elder8b8fb992012-10-26 17:25:24 -05003325 rbd_spec_put(rbd_dev->spec);
Alex Elder27cc2592012-02-02 08:13:30 -06003326 kfree(rbd_dev);
3327
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003328 dout("Error adding device %s\n", buf);
3329 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003330
3331 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003332}
3333
Alex Elderde71a292012-07-03 16:01:19 -05003334static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003335{
3336 struct list_head *tmp;
3337 struct rbd_device *rbd_dev;
3338
Alex Eldere124a822012-01-29 13:57:44 -06003339 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003340 list_for_each(tmp, &rbd_dev_list) {
3341 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003342 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003343 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003344 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003345 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003346 }
Alex Eldere124a822012-01-29 13:57:44 -06003347 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003348 return NULL;
3349}
3350
/*
 * Device-model release callback, invoked by the driver core when the
 * rbd device's refcount drops to zero (after rbd_bus_del_dev()).
 * Tears down everything rbd_add() set up, in reverse order: the osd
 * watch, the ceph client, the block device, the header, and finally
 * the rbd_device itself plus the module reference.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before dropping the client */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->header_name);
	rbd_dev_id_put(rbd_dev);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);

	/* release module ref (taken in rbd_add()) */
	module_put(THIS_MODULE);
}
3382
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003383static ssize_t rbd_remove(struct bus_type *bus,
3384 const char *buf,
3385 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003386{
3387 struct rbd_device *rbd_dev = NULL;
3388 int target_id, rc;
3389 unsigned long ul;
3390 int ret = count;
3391
3392 rc = strict_strtoul(buf, 10, &ul);
3393 if (rc)
3394 return rc;
3395
3396 /* convert to int; abort if we lost anything in the conversion */
3397 target_id = (int) ul;
3398 if (target_id != ul)
3399 return -EINVAL;
3400
3401 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3402
3403 rbd_dev = __rbd_get_dev(target_id);
3404 if (!rbd_dev) {
3405 ret = -ENOENT;
3406 goto done;
3407 }
3408
Alex Elder41f38c22012-10-25 23:34:40 -05003409 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003410 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003411
3412done:
3413 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003414
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003415 return ret;
3416}
3417
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003418/*
3419 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003420 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003421 */
3422static int rbd_sysfs_init(void)
3423{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003424 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003425
Alex Elderfed4c142012-02-07 12:03:36 -06003426 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003427 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003428 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003429
Alex Elderfed4c142012-02-07 12:03:36 -06003430 ret = bus_register(&rbd_bus_type);
3431 if (ret < 0)
3432 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003433
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003434 return ret;
3435}
3436
/* Tear down sysfs state in the reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3442
3443int __init rbd_init(void)
3444{
3445 int rc;
3446
3447 rc = rbd_sysfs_init();
3448 if (rc)
3449 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003450 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003451 return 0;
3452}
3453
/*
 * Module exit point: unregister the bus and root device.  Mapped
 * devices hold module references, so this only runs once all images
 * have been removed.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3458
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");