blob: 4771de2fba8a6fb781854d595b3979342b7a53d5 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names */
	u64 features;		/* RBD_FEATURE_* bit mask */
	__u8 obj_order;		/* log2 of the image's object size */
	__u8 crypt_type;	/* from the on-disk header options */
	__u8 comp_type;		/* from the on-disk header options */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* copy of on-disk snapshot name data */
	u64 *snap_sizes;	/* one entry per snapshot */

	u64 obj_version;	/* version of the header object */
};
114
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.
 */
struct rbd_spec {
	u64 pool_id;		/* id of the pool holding the image */
	char *pool_name;

	char *image_id;		/* unique image id (format 2) */
	size_t image_id_len;
	char *image_name;	/* user-visible image name */
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when the head is mapped */
	char *snap_name;

	struct kref kref;	/* specs are refcounted and may be shared */
};
135
/* User-supplied options modifying how an rbd device is mapped */
struct rbd_options {
	bool read_only;		/* refuse opens for write (see rbd_open()) */
};
139
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* the underlying ceph client */
	struct kref kref;		/* shared clients are refcounted */
	struct list_head node;		/* entry in rbd_client_list */
};
148
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once this request has completed */
	int rc;			/* completion result code */
	u64 bytes;		/* bytes transferred */
};
157
/*
 * a collection of requests that complete together
 */
struct rbd_req_coll {
	int total;			/* number of requests in collection */
	int num_done;			/* how many have completed so far */
	struct kref kref;		/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per request */
};
167
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request */
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this belongs to */
};
179
/* In-memory representation of a single image snapshot */
struct rbd_snap {
	struct device dev;		/* sysfs device for the snapshot */
	const char *name;		/* snapshot name */
	u64 size;			/* image size when snapped */
	struct list_head node;		/* entry in rbd_dev->snaps */
	u64 id;				/* snapshot id */
	u64 features;			/* feature mask when snapped */
};
188
/* Attributes of whatever is currently mapped (image head or snapshot) */
struct rbd_mapping {
	u64 size;		/* size of the mapped image or snapshot */
	u64 features;		/* its feature bit mask */
	bool read_only;		/* writes refused when true */
};
194
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* (possibly shared) ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* image metadata */
	bool exists;		/* mapping target exists
				   (set in rbd_dev_set_mapping()) */
	struct rbd_spec *spec;	/* pool/image/snapshot being mapped */

	char *header_name;	/* name of the image's header object */

	struct ceph_osd_event *watch_event;	/* header-change watch */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
233
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700241
Alex Elder304f6802012-08-31 17:29:52 -0500242static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
243static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
244
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800245static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500246static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800247
Alex Elderf0f8cef2012-01-29 13:57:44 -0600248static ssize_t rbd_add(struct bus_type *bus, const char *buf,
249 size_t count);
250static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
251 size_t count);
252
/* sysfs bus attributes: /sys/bus/rbd/{add,remove}, root write-only */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
258
/* The rbd bus; devices are created/destroyed via the attributes above */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
263
/* Release callback for rbd_root_dev; the device is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}
267
/* Parent sysfs device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
272
#ifdef RBD_DEBUG
/*
 * Verify an invariant; log and BUG() if it does not hold.  Wrapped
 * in do { } while (0) so that "rbd_assert(x);" expands safely inside
 * an unbraced if/else body (the bare-if form had a dangling-else
 * hazard).  Compiles to nothing unless RBD_DEBUG is defined.
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
						"at line %d:\n\n"	\
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800285
/* Pin the rbd device by taking a reference on its embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
290
/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700295
Alex Elder117973f2012-08-31 17:29:55 -0500296static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
297static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700298
/*
 * Block device open callback.  Refuses a write open of a read-only
 * mapping, pins the rbd device for the duration of the open, and
 * propagates the mapping's read-only state to the block device.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
311
/* Block device release callback; drops the reference rbd_open() took */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
320
/* Block device operations; only open/release are implemented */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
326
327/*
328 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500329 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330 */
Alex Elderf8c38922012-08-10 13:12:07 -0700331static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332{
333 struct rbd_client *rbdc;
334 int ret = -ENOMEM;
335
336 dout("rbd_client_create\n");
337 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
338 if (!rbdc)
339 goto out_opt;
340
341 kref_init(&rbdc->kref);
342 INIT_LIST_HEAD(&rbdc->node);
343
Alex Elderbc534d862012-01-29 13:57:44 -0600344 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
345
Alex Elder43ae4702012-07-03 16:01:18 -0500346 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600348 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500349 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700350
351 ret = ceph_open_session(rbdc->client);
352 if (ret < 0)
353 goto out_err;
354
Alex Elder432b8582012-01-29 13:57:44 -0600355 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600357 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700358
Alex Elderbc534d862012-01-29 13:57:44 -0600359 mutex_unlock(&ctl_mutex);
360
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700361 dout("rbd_client_create created %p\n", rbdc);
362 return rbdc;
363
364out_err:
365 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600366out_mutex:
367 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700368 kfree(rbdc);
369out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500370 if (ceph_opts)
371 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400372 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700373}
374
375/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700376 * Find a ceph client with specific addr and configuration. If
377 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700378 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700379static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700380{
381 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700382 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700383
Alex Elder43ae4702012-07-03 16:01:18 -0500384 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700385 return NULL;
386
Alex Elder1f7ba332012-08-10 13:12:07 -0700387 spin_lock(&rbd_client_list_lock);
388 list_for_each_entry(client_node, &rbd_client_list, node) {
389 if (!ceph_compare_options(ceph_opts, client_node->client)) {
390 kref_get(&client_node->kref);
391 found = true;
392 break;
393 }
394 }
395 spin_unlock(&rbd_client_list_lock);
396
397 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700398}
399
/*
 * mount options
 *
 * Tokens below Opt_last_int take an integer argument, tokens between
 * Opt_last_int and Opt_last_string take a string, and tokens between
 * Opt_last_string and Opt_last_bool are Boolean flags (see
 * parse_rbd_opts_token()).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
413
/* Textual spellings accepted for each option token above */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
424
/*
 * Parse a single mount option.  @c is one option string; @private is
 * the struct rbd_options being filled in.  Returns 0 on success, or
 * a negative errno for an unrecognized or malformed option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The token's position in the enum determines its argument type */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);	/* all valid tokens handled above */
		break;
	}
	return 0;
}
465
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way: an existing
 * client no longer needs them, and a new one takes ownership.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
482
/*
 * Destroy ceph client
 *
 * Called when the last kref is dropped.  Takes rbd_client_list_lock
 * itself to unlink the client from rbd_client_list, then tears down
 * the underlying ceph client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
500
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it (via rbd_client_release()).
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	kref_put(&rbdc->kref, rbd_client_release);
}
509
/*
 * Destroy requests collection
 *
 * kref release callback; frees the collection (statuses included,
 * since they are allocated inline via the flexible array).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700521
Alex Eldera30b71b2012-07-10 20:30:11 -0500522static bool rbd_image_format_valid(u32 image_format)
523{
524 return image_format == 1 || image_format == 2;
525}
526
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its fields.  Returns false if the header could not have
 * been produced by a well-behaved rbd client.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
565
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700566/*
567 * Create a new header structure, translate header format from the on-disk
568 * header.
569 */
570static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500571 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700572{
Alex Elderccece232012-07-10 20:30:10 -0500573 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500574 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500575 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500576 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700577
Alex Elder6a523252012-07-19 17:12:59 -0500578 memset(header, 0, sizeof (*header));
579
Alex Elder103a1502012-08-02 11:29:45 -0500580 snap_count = le32_to_cpu(ondisk->snap_count);
581
Alex Elder58c17b02012-08-23 23:22:06 -0500582 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
583 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500584 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500586 memcpy(header->object_prefix, ondisk->object_prefix, len);
587 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600588
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700589 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500590 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
591
Alex Elder621901d2012-08-23 23:22:06 -0500592 /* Save a copy of the snapshot names */
593
Alex Elderf785cc12012-08-23 23:22:06 -0500594 if (snap_names_len > (u64) SIZE_MAX)
595 return -EIO;
596 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500598 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500599 /*
600 * Note that rbd_dev_v1_header_read() guarantees
601 * the ondisk buffer we're working with has
602 * snap_names_len bytes beyond the end of the
603 * snapshot id array, this memcpy() is safe.
604 */
605 memcpy(header->snap_names, &ondisk->snaps[snap_count],
606 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500607
Alex Elder621901d2012-08-23 23:22:06 -0500608 /* Record each snapshot's size */
609
Alex Elderd2bb24e2012-07-26 23:37:14 -0500610 size = snap_count * sizeof (*header->snap_sizes);
611 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500613 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500614 for (i = 0; i < snap_count; i++)
615 header->snap_sizes[i] =
616 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617 } else {
Alex Elderccece232012-07-10 20:30:10 -0500618 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 header->snap_names = NULL;
620 header->snap_sizes = NULL;
621 }
Alex Elder849b4262012-07-09 21:04:24 -0500622
Alex Elder34b13182012-07-13 20:35:12 -0500623 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624 header->obj_order = ondisk->options.order;
625 header->crypt_type = ondisk->options.crypt_type;
626 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500627
Alex Elder621901d2012-08-23 23:22:06 -0500628 /* Allocate and fill in the snapshot context */
629
Alex Elderf84344f2012-08-31 17:29:51 -0500630 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500631 size = sizeof (struct ceph_snap_context);
632 size += snap_count * sizeof (header->snapc->snaps[0]);
633 header->snapc = kzalloc(size, GFP_KERNEL);
634 if (!header->snapc)
635 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636
637 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500638 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500640 for (i = 0; i < snap_count; i++)
641 header->snapc->snaps[i] =
642 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643
644 return 0;
645
Alex Elder6a523252012-07-19 17:12:59 -0500646out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500647 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500648 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700649 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500650 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500651 kfree(header->object_prefix);
652 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500653
Alex Elder00f1f362012-02-07 12:03:36 -0600654 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700655}
656
/*
 * Look up the named snapshot in the device's snapshot list and make
 * it the mapping target, recording its id, size, and features in
 * rbd_dev.  Returns 0 if found, -ENOENT otherwise.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
674
/*
 * Set up rbd_dev->mapping for the requested snapshot name.  The
 * special name RBD_SNAP_HEAD_NAME ("-") maps the image head; mapping
 * a real snapshot additionally forces the device read-only.  Marks
 * the device as existing on success.  Returns 0, or -ENOENT if the
 * named snapshot is not in the snapshot list.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->exists = true;
done:
	return ret;
}
695
/*
 * Free everything rbd_header_from_disk() allocated, NULLing each
 * pointer so a repeated call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
707
/*
 * Return the name of the object backing the segment that contains
 * image offset @offset: "<object_prefix>.<segment number, hex>".
 * The caller must kfree() the returned name.  Returns NULL on
 * allocation or formatting failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	/* GFP_NOIO: presumably reachable from the I/O path -- confirm */
	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		/* snprintf() returns the would-be length; >= means truncated */
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729
Alex Elder65ccfe22012-08-09 10:33:26 -0700730static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
731{
732 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700733
Alex Elder65ccfe22012-08-09 10:33:26 -0700734 return offset & (segment_size - 1);
735}
736
/*
 * Number of bytes of the range [offset, offset + length) that fall
 * within offset's segment, i.e. @length clipped at the segment
 * boundary.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);	/* addition below can't wrap */
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
750
/*
 * Number of segments spanned by the byte range [ofs, ofs + len).
 * Returns 0 for an empty range, and -ERANGE if ofs + len would
 * overflow a u64.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
767
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700768/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700769 * returns the size of an object in the image
770 */
771static u64 rbd_obj_bytes(struct rbd_image_header *header)
772{
773 return 1 << header->obj_order;
774}
775
776/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700777 * bio helpers
778 */
779
780static void bio_chain_put(struct bio *chain)
781{
782 struct bio *tmp;
783
784 while (chain) {
785 tmp = chain;
786 chain = chain->bi_next;
787 bio_put(tmp);
788 }
789}
790
791/*
792 * zeros a bio chain, starting at specific offset
793 */
794static void zero_bio_chain(struct bio *chain, int start_ofs)
795{
796 struct bio_vec *bv;
797 unsigned long flags;
798 void *buf;
799 int i;
800 int pos = 0;
801
802 while (chain) {
803 bio_for_each_segment(bv, chain, i) {
804 if (pos + bv->bv_len > start_ofs) {
805 int remainder = max(start_ofs - pos, 0);
806 buf = bvec_kmap_irq(bv, &flags);
807 memset(buf + remainder, 0,
808 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200809 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700810 }
811 pos += bv->bv_len;
812 }
813
814 chain = chain->bi_next;
815 }
816}
817
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * The clone shares the source's pages: only the bio_vec descriptors
 * covering [offset, offset + len) are copied, with the first and last
 * entries trimmed to the requested byte range.  Returns NULL on
 * invalid arguments (warned once per site) or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* voff is the byte offset of the range within segment idx */
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid is now the number of bytes used in segment end_idx */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	/* Advance the start sector by the byte offset into the source */
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	/* Mark it a clone: it borrows bio_src's pages rather than owning them */
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single segment: its length is exactly the requested len */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700898
Alex Elderf7760da2012-10-20 22:17:27 -0500899/*
900 * Clone a portion of a bio chain, starting at the given byte offset
901 * into the first bio in the source chain and continuing for the
902 * number of bytes indicated. The result is another bio chain of
903 * exactly the given length, or a null pointer on error.
904 *
905 * The bio_src and offset parameters are both in-out. On entry they
906 * refer to the first source bio and the offset into that bio where
907 * the start of data to be cloned is located.
908 *
909 * On return, bio_src is updated to refer to the bio in the source
910 * chain that contains first un-cloned byte, and *offset will
911 * contain the offset of that byte within that bio.
912 */
913static struct bio *bio_chain_clone_range(struct bio **bio_src,
914 unsigned int *offset,
915 unsigned int len,
916 gfp_t gfpmask)
917{
918 struct bio *bi = *bio_src;
919 unsigned int off = *offset;
920 struct bio *chain = NULL;
921 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700922
Alex Elderf7760da2012-10-20 22:17:27 -0500923 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700924
Alex Elderf7760da2012-10-20 22:17:27 -0500925 if (!bi || off >= bi->bi_size || !len)
926 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700927
Alex Elderf7760da2012-10-20 22:17:27 -0500928 end = &chain;
929 while (len) {
930 unsigned int bi_size;
931 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700932
Alex Elderf7760da2012-10-20 22:17:27 -0500933 if (!bi)
934 goto out_err; /* EINVAL; ran out of bio's */
935 bi_size = min_t(unsigned int, bi->bi_size - off, len);
936 bio = bio_clone_range(bi, off, bi_size, gfpmask);
937 if (!bio)
938 goto out_err; /* ENOMEM */
939
940 *end = bio;
941 end = &bio->bi_next;
942
943 off += bi_size;
944 if (off == bi->bi_size) {
945 bi = bi->bi_next;
946 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 }
Alex Elderf7760da2012-10-20 22:17:27 -0500948 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700949 }
Alex Elderf7760da2012-10-20 22:17:27 -0500950 *bio_src = bi;
951 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700952
Alex Elderf7760da2012-10-20 22:17:27 -0500953 return chain;
954out_err:
955 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700957 return NULL;
958}
959
960/*
961 * helpers for osd request op vectors.
962 */
Alex Elder57cfc102012-06-26 12:57:03 -0700963static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
964 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700965{
Alex Elder57cfc102012-06-26 12:57:03 -0700966 struct ceph_osd_req_op *ops;
967
968 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
969 if (!ops)
970 return NULL;
971
972 ops[0].op = opcode;
973
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700974 /*
975 * op extent offset and length will be set later on
976 * in calc_raw_layout()
977 */
Alex Elder57cfc102012-06-26 12:57:03 -0700978 ops[0].payload_len = payload_len;
979
980 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700981}
982
/* Free an op vector allocated by rbd_create_rw_ops(); NULL is a no-op */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
987
/*
 * Record completion (status and byte count) of the request at slot
 * "index" in a request collection, then complete as much of the
 * original block request as is now contiguously done.
 *
 * With no collection the block request is completed directly.
 * Collection slots may complete out of order; only the contiguous
 * run of done slots starting at coll->num_done is ended here, so
 * blk_end_request sees bytes in submission order.
 *
 * Takes q->queue_lock itself, so the caller must not hold it.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection: complete the request directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* Extend [min, max) over the contiguous done slots from num_done */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		/* __blk_end_request: queue_lock is already held */
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* Each completed slot drops its reference on the collection */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1025
/*
 * Complete the collection slot belonging to a single rbd request,
 * using the request's own collection pointer and slot index.
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1031
/*
 * Send ceph osd request
 *
 * Builds an osd request against a single object (object_name, with
 * byte range ofs/len), attaching either a bio chain or a page vector
 * as the data payload, and submits it through the osd client.
 *
 * If rbd_cb is non-NULL the request completes asynchronously through
 * that callback, which takes over the req_data bookkeeping structure
 * (see rbd_req_cb()).  If rbd_cb is NULL the call blocks in
 * ceph_osdc_wait_request() and the osd request is dropped before
 * returning; in that case *ver (if non-NULL) receives the reassert
 * version from the reply.
 *
 * If linger_req is non-NULL the request is registered as lingering
 * with the osd client and returned through *linger_req.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Still report the failure into the collection slot */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() does not guarantee NUL termination,
	 * and strlen() below assumes it — presumably object_name is
	 * always shorter than r_oid; confirm callers bound the length.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* One object per "file": stripe unit == object size, count 1 */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait for the reply and drop the request */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	/* Error path: complete the collection slot with the failure code */
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1145
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests issued through
 * rbd_do_request().  Decodes the reply, papers over short/missing
 * reads by zero-filling the bio data, completes the collection slot,
 * and releases the osd request and the rbd_request bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading a nonexistent object: treat as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1185
/*
 * Minimal completion callback for fire-and-forget requests (e.g.
 * notify acks): just drop the osd request reference; no status is
 * examined or propagated.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1190
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector big enough for inbound_size bytes at page
 * offset ofs, submits the prepared op vector synchronously via
 * rbd_do_request(), and — for reads, when a buffer is supplied —
 * copies the received data back into "inbound".  Returns the osd
 * result or a negative errno.  (The positive result is used as the
 * copy length below, so it is presumably the byte count — confirm
 * against ceph_osdc_wait_request() semantics.)
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL rq/bio/callback: data goes via pages, and we wait inline */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1234
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues a read or write (chosen from the block request's data
 * direction) for the single image segment containing image offset
 * "ofs".  Writes carry the snapshot context; reads target the mapped
 * snapshot id with no context.  Completion is reported through
 * collection slot coll_index by rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;	/* reads carry a snap id, not a context */
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	/* The op vector was copied into the request; ours can go now */
	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1299
1300/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001301 * Request sync osd read
1302 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001303static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001304 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001305 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001306 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001307 char *buf,
1308 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001309{
Alex Elder913d2fd2012-06-26 12:57:03 -07001310 struct ceph_osd_req_op *ops;
1311 int ret;
1312
1313 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1314 if (!ops)
1315 return -ENOMEM;
1316
1317 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001318 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001319 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001320 ops, object_name, ofs, len, buf, NULL, ver);
1321 rbd_destroy_ops(ops);
1322
1323 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001324}
1325
1326/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001327 * Request sync osd watch
1328 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001329static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001330 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001331 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001332{
1333 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001334 int ret;
1335
Alex Elder57cfc102012-06-26 12:57:03 -07001336 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1337 if (!ops)
1338 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001339
Josh Durgina71b8912011-12-05 18:10:44 -08001340 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001341 ops[0].watch.cookie = notify_id;
1342 ops[0].watch.flag = 0;
1343
Alex Elder0ce1a792012-07-03 16:01:18 -05001344 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001345 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001346 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001347 CEPH_OSD_FLAG_READ,
1348 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001349 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001350 rbd_simple_req_cb, 0, NULL);
1351
1352 rbd_destroy_ops(ops);
1353 return ret;
1354}
1355
/*
 * Watch event callback for the rbd header object: refresh the device
 * state and acknowledge the notification.
 *
 * NOTE(review): if rbd_dev_refresh() fails, hver may be left
 * uninitialized yet is still passed to rbd_req_sync_notify_ack() —
 * confirm rbd_dev_refresh() always sets *hver, or initialize it here.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* Ack even on refresh failure so the osd stops re-notifying */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1375
/*
 * Request sync osd watch
 *
 * Register a watch on the rbd header object so the osd notifies us of
 * header changes (delivered to rbd_watch_cb()).  Creates the osd
 * client event first, then sends a lingering WATCH op (flag = 1);
 * the lingering request is kept in rbd_dev->watch_request.
 * On failure the event is cancelled and the op vector freed.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1419
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001420/*
1421 * Request sync osd unwatch
1422 */
Alex Elder070c6332012-07-25 09:32:41 -05001423static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001424{
1425 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001426 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001427
Alex Elder57cfc102012-06-26 12:57:03 -07001428 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1429 if (!ops)
1430 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001431
1432 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001433 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001434 ops[0].watch.flag = 0;
1435
Alex Elder0ce1a792012-07-03 16:01:18 -05001436 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001437 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001438 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1439 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001440 rbd_dev->header_name,
1441 0, 0, NULL, NULL, NULL);
1442
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001443
1444 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001445 ceph_osdc_cancel_event(rbd_dev->watch_event);
1446 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001447 return ret;
1448}
1449
/*
 * Synchronous osd object method call
 *
 * Invokes class_name.method_name on the named object, sending
 * outbound_size bytes of input and receiving up to inbound_size bytes
 * of output.  Returns the osd result or a negative errno.
 *
 * NOTE(review): class_len/method_len are truncated to __u8 below, so
 * this assumes both names are shorter than 256 bytes — confirm that
 * holds for all callers.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1502
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001503static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1504{
1505 struct rbd_req_coll *coll =
1506 kzalloc(sizeof(struct rbd_req_coll) +
1507 sizeof(struct rbd_req_status) * num_reqs,
1508 GFP_ATOMIC);
1509
1510 if (!coll)
1511 return NULL;
1512 coll->total = num_reqs;
1513 kref_init(&coll->kref);
1514 return coll;
1515}
1516
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001517/*
1518 * block device queue callback
1519 */
1520static void rbd_rq_fn(struct request_queue *q)
1521{
1522 struct rbd_device *rbd_dev = q->queuedata;
1523 struct request *rq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001524
Alex Elder00f1f362012-02-07 12:03:36 -06001525 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001526 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001527 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001528 unsigned int size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001530 int num_segs, cur_seg = 0;
1531 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001532 struct ceph_snap_context *snapc;
Alex Elderf7760da2012-10-20 22:17:27 -05001533 unsigned int bio_offset;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001534
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 dout("fetched request\n");
1536
1537 /* filter out block requests we don't understand */
1538 if ((rq->cmd_type != REQ_TYPE_FS)) {
1539 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001540 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001541 }
1542
1543 /* deduce our operation (read, write) */
1544 do_write = (rq_data_dir(rq) == WRITE);
Alex Elderf84344f2012-08-31 17:29:51 -05001545 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001546 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001547 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001548 }
1549
1550 spin_unlock_irq(q->queue_lock);
1551
Josh Durgind1d25642011-12-05 14:03:05 -08001552 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001553
Alex Elderdaba5fd2012-10-26 17:25:23 -05001554 if (!rbd_dev->exists) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001555 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
Josh Durgine88a36e2011-11-21 18:14:25 -08001556 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001557 dout("request for non-existent snapshot");
1558 spin_lock_irq(q->queue_lock);
1559 __blk_end_request_all(rq, -ENXIO);
1560 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001561 }
1562
Josh Durgind1d25642011-12-05 14:03:05 -08001563 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1564
1565 up_read(&rbd_dev->header_rwsem);
1566
Alex Elderf7760da2012-10-20 22:17:27 -05001567 size = blk_rq_bytes(rq);
1568 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1569 bio = rq->bio;
1570
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001571 dout("%s 0x%x bytes at 0x%llx\n",
1572 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001573 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001574
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001575 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001576 if (num_segs <= 0) {
1577 spin_lock_irq(q->queue_lock);
1578 __blk_end_request_all(rq, num_segs);
1579 ceph_put_snap_context(snapc);
1580 continue;
1581 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001582 coll = rbd_alloc_coll(num_segs);
1583 if (!coll) {
1584 spin_lock_irq(q->queue_lock);
1585 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001586 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001587 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001588 }
1589
Alex Elderf7760da2012-10-20 22:17:27 -05001590 bio_offset = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001591 do {
Alex Elderf7760da2012-10-20 22:17:27 -05001592 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1593 unsigned int chain_size;
1594 struct bio *bio_chain;
1595
1596 BUG_ON(limit > (u64) UINT_MAX);
1597 chain_size = (unsigned int) limit;
Alex Elderbd919d42012-07-13 20:35:11 -05001598 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elderf7760da2012-10-20 22:17:27 -05001599
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001600 kref_get(&coll->kref);
Alex Elderf7760da2012-10-20 22:17:27 -05001601
1602 /* Pass a cloned bio chain via an osd request */
1603
1604 bio_chain = bio_chain_clone_range(&bio,
1605 &bio_offset, chain_size,
1606 GFP_ATOMIC);
1607 if (bio_chain)
Alex Elder46342462012-10-10 18:59:29 -07001608 (void) rbd_do_op(rq, rbd_dev, snapc,
Alex Elderf7760da2012-10-20 22:17:27 -05001609 ofs, chain_size,
1610 bio_chain, coll, cur_seg);
Alex Elder46342462012-10-10 18:59:29 -07001611 else
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001612 rbd_coll_end_req_index(rq, coll, cur_seg,
Alex Elderf7760da2012-10-20 22:17:27 -05001613 -ENOMEM, chain_size);
1614 size -= chain_size;
1615 ofs += chain_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001616
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001617 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001619 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001620
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001621 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001622
1623 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001624 }
1625}
1626
1627/*
1628 * a queue callback. Makes sure that we don't create a bio that spans across
1629 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001630 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001631 */
1632static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1633 struct bio_vec *bvec)
1634{
1635 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed2012-10-20 22:17:27 -05001636 sector_t sector_offset;
1637 sector_t sectors_per_obj;
1638 sector_t obj_sector_offset;
1639 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001640
Alex Eldere5cfeed2012-10-20 22:17:27 -05001641 /*
1642 * Find how far into its rbd object the partition-relative
1643 * bio start sector is to offset relative to the enclosing
1644 * device.
1645 */
1646 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1647 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1648 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001649
Alex Eldere5cfeed2012-10-20 22:17:27 -05001650 /*
1651 * Compute the number of bytes from that offset to the end
1652 * of the object. Account for what's already used by the bio.
1653 */
1654 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1655 if (ret > bmd->bi_size)
1656 ret -= bmd->bi_size;
1657 else
1658 ret = 0;
1659
1660 /*
1661 * Don't send back more than was asked for. And if the bio
1662 * was empty, let the whole thing through because: "Note
1663 * that a block device *must* allow a single page to be
1664 * added to an empty bio."
1665 */
1666 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1667 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1668 ret = (int) bvec->bv_len;
1669
1670 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001671}
1672
1673static void rbd_free_disk(struct rbd_device *rbd_dev)
1674{
1675 struct gendisk *disk = rbd_dev->disk;
1676
1677 if (!disk)
1678 return;
1679
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001680 if (disk->flags & GENHD_FL_UP)
1681 del_gendisk(disk);
1682 if (disk->queue)
1683 blk_cleanup_queue(disk->queue);
1684 put_disk(disk);
1685}
1686
1687/*
Alex Elder4156d992012-08-02 11:29:46 -05001688 * Read the complete header for the given rbd device.
1689 *
1690 * Returns a pointer to a dynamically-allocated buffer containing
1691 * the complete and validated header. Caller can pass the address
1692 * of a variable that will be filled in with the version of the
1693 * header object at the time it was read.
1694 *
1695 * Returns a pointer-coded errno if a failure occurs.
1696 */
1697static struct rbd_image_header_ondisk *
1698rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1699{
1700 struct rbd_image_header_ondisk *ondisk = NULL;
1701 u32 snap_count = 0;
1702 u64 names_size = 0;
1703 u32 want_count;
1704 int ret;
1705
1706 /*
1707 * The complete header will include an array of its 64-bit
1708 * snapshot ids, followed by the names of those snapshots as
1709 * a contiguous block of NUL-terminated strings. Note that
1710 * the number of snapshots could change by the time we read
1711 * it in, in which case we re-read it.
1712 */
1713 do {
1714 size_t size;
1715
1716 kfree(ondisk);
1717
1718 size = sizeof (*ondisk);
1719 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1720 size += names_size;
1721 ondisk = kmalloc(size, GFP_KERNEL);
1722 if (!ondisk)
1723 return ERR_PTR(-ENOMEM);
1724
1725 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1726 rbd_dev->header_name,
1727 0, size,
1728 (char *) ondisk, version);
1729
1730 if (ret < 0)
1731 goto out_err;
1732 if (WARN_ON((size_t) ret < size)) {
1733 ret = -ENXIO;
1734 pr_warning("short header read for image %s"
1735 " (want %zd got %d)\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001736 rbd_dev->spec->image_name, size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05001737 goto out_err;
1738 }
1739 if (!rbd_dev_ondisk_valid(ondisk)) {
1740 ret = -ENXIO;
1741 pr_warning("invalid header for image %s\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001742 rbd_dev->spec->image_name);
Alex Elder4156d992012-08-02 11:29:46 -05001743 goto out_err;
1744 }
1745
1746 names_size = le64_to_cpu(ondisk->snap_names_len);
1747 want_count = snap_count;
1748 snap_count = le32_to_cpu(ondisk->snap_count);
1749 } while (snap_count != want_count);
1750
1751 return ondisk;
1752
1753out_err:
1754 kfree(ondisk);
1755
1756 return ERR_PTR(ret);
1757}
1758
1759/*
1760 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001761 */
1762static int rbd_read_header(struct rbd_device *rbd_dev,
1763 struct rbd_image_header *header)
1764{
Alex Elder4156d992012-08-02 11:29:46 -05001765 struct rbd_image_header_ondisk *ondisk;
1766 u64 ver = 0;
1767 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001768
Alex Elder4156d992012-08-02 11:29:46 -05001769 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1770 if (IS_ERR(ondisk))
1771 return PTR_ERR(ondisk);
1772 ret = rbd_header_from_disk(header, ondisk);
1773 if (ret >= 0)
1774 header->obj_version = ver;
1775 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001776
Alex Elder4156d992012-08-02 11:29:46 -05001777 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001778}
1779
Alex Elder41f38c22012-10-25 23:34:40 -05001780static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001781{
1782 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001783 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001784
Alex Eldera0593292012-07-19 09:09:27 -05001785 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001786 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001787}
1788
Alex Elder94785542012-10-09 13:50:17 -07001789static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1790{
1791 sector_t size;
1792
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001793 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001794 return;
1795
1796 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1797 dout("setting size to %llu sectors", (unsigned long long) size);
1798 rbd_dev->mapping.size = (u64) size;
1799 set_capacity(rbd_dev->disk, size);
1800}
1801
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001802/*
1803 * only read the first part of the ondisk header, without the snaps info
1804 */
Alex Elder117973f2012-08-31 17:29:55 -05001805static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001806{
1807 int ret;
1808 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001809
1810 ret = rbd_read_header(rbd_dev, &h);
1811 if (ret < 0)
1812 return ret;
1813
Josh Durgina51aa0c2011-12-05 10:35:04 -08001814 down_write(&rbd_dev->header_rwsem);
1815
Alex Elder94785542012-10-09 13:50:17 -07001816 /* Update image size, and check for resize of mapped image */
1817 rbd_dev->header.image_size = h.image_size;
1818 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001819
Alex Elder849b4262012-07-09 21:04:24 -05001820 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001821 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001822 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001823 /* osd requests may still refer to snapc */
1824 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001825
Alex Elderb8136232012-07-25 09:32:41 -05001826 if (hver)
1827 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001828 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001829 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001830 rbd_dev->header.snapc = h.snapc;
1831 rbd_dev->header.snap_names = h.snap_names;
1832 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001833 /* Free the extra copy of the object prefix */
1834 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1835 kfree(h.object_prefix);
1836
Alex Elder304f6802012-08-31 17:29:52 -05001837 ret = rbd_dev_snaps_update(rbd_dev);
1838 if (!ret)
1839 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001840
Josh Durginc6666012011-11-21 17:11:12 -08001841 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001842
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001843 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001844}
1845
Alex Elder117973f2012-08-31 17:29:55 -05001846static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001847{
1848 int ret;
1849
Alex Elder117973f2012-08-31 17:29:55 -05001850 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001851 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001852 if (rbd_dev->image_format == 1)
1853 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1854 else
1855 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001856 mutex_unlock(&ctl_mutex);
1857
1858 return ret;
1859}
1860
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001861static int rbd_init_disk(struct rbd_device *rbd_dev)
1862{
1863 struct gendisk *disk;
1864 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001865 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001866
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001867 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001868 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1869 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001870 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001871
Alex Elderf0f8cef2012-01-29 13:57:44 -06001872 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001873 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001874 disk->major = rbd_dev->major;
1875 disk->first_minor = 0;
1876 disk->fops = &rbd_bd_ops;
1877 disk->private_data = rbd_dev;
1878
1879 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1881 if (!q)
1882 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001883
Alex Elder593a9e72012-02-07 12:03:37 -06001884 /* We use the default size, but let's be explicit about it. */
1885 blk_queue_physical_block_size(q, SECTOR_SIZE);
1886
Josh Durgin029bcbd2011-07-22 11:35:23 -07001887 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001888 segment_size = rbd_obj_bytes(&rbd_dev->header);
1889 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1890 blk_queue_max_segment_size(q, segment_size);
1891 blk_queue_io_min(q, segment_size);
1892 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001893
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001894 blk_queue_merge_bvec(q, rbd_merge_bvec);
1895 disk->queue = q;
1896
1897 q->queuedata = rbd_dev;
1898
1899 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001900
Alex Elder12f02942012-08-29 17:11:07 -05001901 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1902
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001903 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001904out_disk:
1905 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001906
1907 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001908}
1909
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001910/*
1911 sysfs
1912*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001913
Alex Elder593a9e72012-02-07 12:03:37 -06001914static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1915{
1916 return container_of(dev, struct rbd_device, dev);
1917}
1918
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001919static ssize_t rbd_size_show(struct device *dev,
1920 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001921{
Alex Elder593a9e72012-02-07 12:03:37 -06001922 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001923 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001924
Josh Durgina51aa0c2011-12-05 10:35:04 -08001925 down_read(&rbd_dev->header_rwsem);
1926 size = get_capacity(rbd_dev->disk);
1927 up_read(&rbd_dev->header_rwsem);
1928
1929 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001930}
1931
Alex Elder34b13182012-07-13 20:35:12 -05001932/*
1933 * Note this shows the features for whatever's mapped, which is not
1934 * necessarily the base image.
1935 */
1936static ssize_t rbd_features_show(struct device *dev,
1937 struct device_attribute *attr, char *buf)
1938{
1939 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1940
1941 return sprintf(buf, "0x%016llx\n",
1942 (unsigned long long) rbd_dev->mapping.features);
1943}
1944
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001945static ssize_t rbd_major_show(struct device *dev,
1946 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001947{
Alex Elder593a9e72012-02-07 12:03:37 -06001948 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001949
1950 return sprintf(buf, "%d\n", rbd_dev->major);
1951}
1952
1953static ssize_t rbd_client_id_show(struct device *dev,
1954 struct device_attribute *attr, char *buf)
1955{
Alex Elder593a9e72012-02-07 12:03:37 -06001956 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001957
Alex Elder1dbb4392012-01-24 10:08:37 -06001958 return sprintf(buf, "client%lld\n",
1959 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001960}
1961
1962static ssize_t rbd_pool_show(struct device *dev,
1963 struct device_attribute *attr, char *buf)
1964{
Alex Elder593a9e72012-02-07 12:03:37 -06001965 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001966
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001967 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968}
1969
Alex Elder9bb2f332012-07-12 10:46:35 -05001970static ssize_t rbd_pool_id_show(struct device *dev,
1971 struct device_attribute *attr, char *buf)
1972{
1973 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1974
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001975 return sprintf(buf, "%llu\n",
1976 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05001977}
1978
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001979static ssize_t rbd_name_show(struct device *dev,
1980 struct device_attribute *attr, char *buf)
1981{
Alex Elder593a9e72012-02-07 12:03:37 -06001982 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001983
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001984 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001985}
1986
Alex Elder589d30e2012-07-10 20:30:11 -05001987static ssize_t rbd_image_id_show(struct device *dev,
1988 struct device_attribute *attr, char *buf)
1989{
1990 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1991
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001992 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05001993}
1994
Alex Elder34b13182012-07-13 20:35:12 -05001995/*
1996 * Shows the name of the currently-mapped snapshot (or
1997 * RBD_SNAP_HEAD_NAME for the base image).
1998 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001999static ssize_t rbd_snap_show(struct device *dev,
2000 struct device_attribute *attr,
2001 char *buf)
2002{
Alex Elder593a9e72012-02-07 12:03:37 -06002003 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002004
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002005 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002006}
2007
2008static ssize_t rbd_image_refresh(struct device *dev,
2009 struct device_attribute *attr,
2010 const char *buf,
2011 size_t size)
2012{
Alex Elder593a9e72012-02-07 12:03:37 -06002013 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002014 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002015
Alex Elder117973f2012-08-31 17:29:55 -05002016 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002017
2018 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002019}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002020
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002021static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002022static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002023static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2024static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2025static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05002026static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002027static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05002028static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002029static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2030static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002031
2032static struct attribute *rbd_attrs[] = {
2033 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002034 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002035 &dev_attr_major.attr,
2036 &dev_attr_client_id.attr,
2037 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05002038 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002039 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05002040 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002041 &dev_attr_current_snap.attr,
2042 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002043 NULL
2044};
2045
2046static struct attribute_group rbd_attr_group = {
2047 .attrs = rbd_attrs,
2048};
2049
2050static const struct attribute_group *rbd_attr_groups[] = {
2051 &rbd_attr_group,
2052 NULL
2053};
2054
2055static void rbd_sysfs_dev_release(struct device *dev)
2056{
2057}
2058
2059static struct device_type rbd_device_type = {
2060 .name = "rbd",
2061 .groups = rbd_attr_groups,
2062 .release = rbd_sysfs_dev_release,
2063};
2064
2065
2066/*
2067 sysfs - snapshots
2068*/
2069
2070static ssize_t rbd_snap_size_show(struct device *dev,
2071 struct device_attribute *attr,
2072 char *buf)
2073{
2074 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2075
Josh Durgin35915382011-12-05 18:25:13 -08002076 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002077}
2078
2079static ssize_t rbd_snap_id_show(struct device *dev,
2080 struct device_attribute *attr,
2081 char *buf)
2082{
2083 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2084
Josh Durgin35915382011-12-05 18:25:13 -08002085 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002086}
2087
Alex Elder34b13182012-07-13 20:35:12 -05002088static ssize_t rbd_snap_features_show(struct device *dev,
2089 struct device_attribute *attr,
2090 char *buf)
2091{
2092 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2093
2094 return sprintf(buf, "0x%016llx\n",
2095 (unsigned long long) snap->features);
2096}
2097
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002098static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2099static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002100static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002101
2102static struct attribute *rbd_snap_attrs[] = {
2103 &dev_attr_snap_size.attr,
2104 &dev_attr_snap_id.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002105 &dev_attr_snap_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002106 NULL,
2107};
2108
2109static struct attribute_group rbd_snap_attr_group = {
2110 .attrs = rbd_snap_attrs,
2111};
2112
2113static void rbd_snap_dev_release(struct device *dev)
2114{
2115 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2116 kfree(snap->name);
2117 kfree(snap);
2118}
2119
2120static const struct attribute_group *rbd_snap_attr_groups[] = {
2121 &rbd_snap_attr_group,
2122 NULL
2123};
2124
2125static struct device_type rbd_snap_device_type = {
2126 .groups = rbd_snap_attr_groups,
2127 .release = rbd_snap_dev_release,
2128};
2129
Alex Elder8b8fb992012-10-26 17:25:24 -05002130static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2131{
2132 kref_get(&spec->kref);
2133
2134 return spec;
2135}
2136
2137static void rbd_spec_free(struct kref *kref);
2138static void rbd_spec_put(struct rbd_spec *spec)
2139{
2140 if (spec)
2141 kref_put(&spec->kref, rbd_spec_free);
2142}
2143
2144static struct rbd_spec *rbd_spec_alloc(void)
2145{
2146 struct rbd_spec *spec;
2147
2148 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2149 if (!spec)
2150 return NULL;
2151 kref_init(&spec->kref);
2152
2153 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2154
2155 return spec;
2156}
2157
/*
 * kref release callback for an rbd_spec: free each of its
 * dynamically-allocated name strings, then the spec itself.
 */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2168
Alex Elder304f6802012-08-31 17:29:52 -05002169static bool rbd_snap_registered(struct rbd_snap *snap)
2170{
2171 bool ret = snap->dev.type == &rbd_snap_device_type;
2172 bool reg = device_is_registered(&snap->dev);
2173
2174 rbd_assert(!ret ^ reg);
2175
2176 return ret;
2177}
2178
Alex Elder41f38c22012-10-25 23:34:40 -05002179static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002180{
2181 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002182 if (device_is_registered(&snap->dev))
2183 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002184}
2185
Alex Elder14e70852012-07-19 09:09:27 -05002186static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002187 struct device *parent)
2188{
2189 struct device *dev = &snap->dev;
2190 int ret;
2191
2192 dev->type = &rbd_snap_device_type;
2193 dev->parent = parent;
2194 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002195 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002196 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2197
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002198 ret = device_register(dev);
2199
2200 return ret;
2201}
2202
Alex Elder4e891e02012-07-10 20:30:10 -05002203static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002204 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002205 u64 snap_id, u64 snap_size,
2206 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002207{
Alex Elder4e891e02012-07-10 20:30:10 -05002208 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002209 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002210
2211 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002212 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002213 return ERR_PTR(-ENOMEM);
2214
2215 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002216 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002217 if (!snap->name)
2218 goto err;
2219
Alex Elderc8d18422012-07-10 20:30:11 -05002220 snap->id = snap_id;
2221 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002222 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002223
2224 return snap;
2225
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002226err:
2227 kfree(snap->name);
2228 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002229
2230 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002231}
2232
Alex Eldercd892122012-07-03 16:01:19 -05002233static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2234 u64 *snap_size, u64 *snap_features)
2235{
2236 char *snap_name;
2237
2238 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2239
2240 *snap_size = rbd_dev->header.snap_sizes[which];
2241 *snap_features = 0; /* No features for v1 */
2242
2243 /* Skip over names until we find the one we are looking for */
2244
2245 snap_name = rbd_dev->header.snap_names;
2246 while (which--)
2247 snap_name += strlen(snap_name) + 1;
2248
2249 return snap_name;
2250}
2251
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002252/*
Alex Elder9d475de2012-07-03 16:01:19 -05002253 * Get the size and object order for an image snapshot, or if
2254 * snap_id is CEPH_NOSNAP, gets this information for the base
2255 * image.
2256 */
2257static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2258 u8 *order, u64 *snap_size)
2259{
2260 __le64 snapid = cpu_to_le64(snap_id);
2261 int ret;
2262 struct {
2263 u8 order;
2264 __le64 size;
2265 } __attribute__ ((packed)) size_buf = { 0 };
2266
2267 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2268 "rbd", "get_size",
2269 (char *) &snapid, sizeof (snapid),
2270 (char *) &size_buf, sizeof (size_buf),
2271 CEPH_OSD_FLAG_READ, NULL);
2272 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2273 if (ret < 0)
2274 return ret;
2275
2276 *order = size_buf.order;
2277 *snap_size = le64_to_cpu(size_buf.size);
2278
2279 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2280 (unsigned long long) snap_id, (unsigned int) *order,
2281 (unsigned long long) *snap_size);
2282
2283 return 0;
2284}
2285
/*
 * Fetch the object order and current size of the base image
 * (snap_id CEPH_NOSNAP) directly into the in-core header.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2292
Alex Elder1e130192012-07-03 16:01:19 -05002293static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2294{
2295 void *reply_buf;
2296 int ret;
2297 void *p;
2298
2299 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2300 if (!reply_buf)
2301 return -ENOMEM;
2302
2303 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2304 "rbd", "get_object_prefix",
2305 NULL, 0,
2306 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2307 CEPH_OSD_FLAG_READ, NULL);
2308 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2309 if (ret < 0)
2310 goto out;
Alex Eldera0ea3a42012-10-10 21:19:13 -07002311 ret = 0; /* rbd_req_sync_exec() can return positive */
Alex Elder1e130192012-07-03 16:01:19 -05002312
2313 p = reply_buf;
2314 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2315 p + RBD_OBJ_PREFIX_LEN_MAX,
2316 NULL, GFP_NOIO);
2317
2318 if (IS_ERR(rbd_dev->header.object_prefix)) {
2319 ret = PTR_ERR(rbd_dev->header.object_prefix);
2320 rbd_dev->header.object_prefix = NULL;
2321 } else {
2322 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2323 }
2324
2325out:
2326 kfree(reply_buf);
2327
2328 return ret;
2329}
2330
Alex Elderb1b54022012-07-03 16:01:19 -05002331static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2332 u64 *snap_features)
2333{
2334 __le64 snapid = cpu_to_le64(snap_id);
2335 struct {
2336 __le64 features;
2337 __le64 incompat;
2338 } features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07002339 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05002340 int ret;
2341
2342 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2343 "rbd", "get_features",
2344 (char *) &snapid, sizeof (snapid),
2345 (char *) &features_buf, sizeof (features_buf),
2346 CEPH_OSD_FLAG_READ, NULL);
2347 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2348 if (ret < 0)
2349 return ret;
Alex Elderd8891402012-10-09 13:50:17 -07002350
2351 incompat = le64_to_cpu(features_buf.incompat);
2352 if (incompat & ~RBD_FEATURES_ALL)
2353 return -ENOTSUPP;
2354
Alex Elderb1b54022012-07-03 16:01:19 -05002355 *snap_features = le64_to_cpu(features_buf.features);
2356
2357 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2358 (unsigned long long) snap_id,
2359 (unsigned long long) *snap_features,
2360 (unsigned long long) le64_to_cpu(features_buf.incompat));
2361
2362 return 0;
2363}
2364
2365static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2366{
2367 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2368 &rbd_dev->header.features);
2369}
2370
Alex Elder6e14b1a2012-07-03 16:01:19 -05002371static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002372{
2373 size_t size;
2374 int ret;
2375 void *reply_buf;
2376 void *p;
2377 void *end;
2378 u64 seq;
2379 u32 snap_count;
2380 struct ceph_snap_context *snapc;
2381 u32 i;
2382
2383 /*
2384 * We'll need room for the seq value (maximum snapshot id),
2385 * snapshot count, and array of that many snapshot ids.
2386 * For now we have a fixed upper limit on the number we're
2387 * prepared to receive.
2388 */
2389 size = sizeof (__le64) + sizeof (__le32) +
2390 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2391 reply_buf = kzalloc(size, GFP_KERNEL);
2392 if (!reply_buf)
2393 return -ENOMEM;
2394
2395 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2396 "rbd", "get_snapcontext",
2397 NULL, 0,
2398 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002399 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002400 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2401 if (ret < 0)
2402 goto out;
2403
2404 ret = -ERANGE;
2405 p = reply_buf;
2406 end = (char *) reply_buf + size;
2407 ceph_decode_64_safe(&p, end, seq, out);
2408 ceph_decode_32_safe(&p, end, snap_count, out);
2409
2410 /*
2411 * Make sure the reported number of snapshot ids wouldn't go
2412 * beyond the end of our buffer. But before checking that,
2413 * make sure the computed size of the snapshot context we
2414 * allocate is representable in a size_t.
2415 */
2416 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2417 / sizeof (u64)) {
2418 ret = -EINVAL;
2419 goto out;
2420 }
2421 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2422 goto out;
2423
2424 size = sizeof (struct ceph_snap_context) +
2425 snap_count * sizeof (snapc->snaps[0]);
2426 snapc = kmalloc(size, GFP_KERNEL);
2427 if (!snapc) {
2428 ret = -ENOMEM;
2429 goto out;
2430 }
2431
2432 atomic_set(&snapc->nref, 1);
2433 snapc->seq = seq;
2434 snapc->num_snaps = snap_count;
2435 for (i = 0; i < snap_count; i++)
2436 snapc->snaps[i] = ceph_decode_64(&p);
2437
2438 rbd_dev->header.snapc = snapc;
2439
2440 dout(" snap context seq = %llu, snap_count = %u\n",
2441 (unsigned long long) seq, (unsigned int) snap_count);
2442
2443out:
2444 kfree(reply_buf);
2445
2446 return 0;
2447}
2448
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002449static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2450{
2451 size_t size;
2452 void *reply_buf;
2453 __le64 snap_id;
2454 int ret;
2455 void *p;
2456 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002457 char *snap_name;
2458
2459 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2460 reply_buf = kmalloc(size, GFP_KERNEL);
2461 if (!reply_buf)
2462 return ERR_PTR(-ENOMEM);
2463
2464 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2465 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2466 "rbd", "get_snapshot_name",
2467 (char *) &snap_id, sizeof (snap_id),
2468 reply_buf, size,
2469 CEPH_OSD_FLAG_READ, NULL);
2470 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2471 if (ret < 0)
2472 goto out;
2473
2474 p = reply_buf;
2475 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05002476 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002477 if (IS_ERR(snap_name)) {
2478 ret = PTR_ERR(snap_name);
2479 goto out;
2480 } else {
2481 dout(" snap_id 0x%016llx snap_name = %s\n",
2482 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2483 }
2484 kfree(reply_buf);
2485
2486 return snap_name;
2487out:
2488 kfree(reply_buf);
2489
2490 return ERR_PTR(ret);
2491}
2492
2493static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2494 u64 *snap_size, u64 *snap_features)
2495{
2496 __le64 snap_id;
2497 u8 order;
2498 int ret;
2499
2500 snap_id = rbd_dev->header.snapc->snaps[which];
2501 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2502 if (ret)
2503 return ERR_PTR(ret);
2504 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2505 if (ret)
2506 return ERR_PTR(ret);
2507
2508 return rbd_dev_v2_snap_name(rbd_dev, which);
2509}
2510
2511static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2512 u64 *snap_size, u64 *snap_features)
2513{
2514 if (rbd_dev->image_format == 1)
2515 return rbd_dev_v1_snap_info(rbd_dev, which,
2516 snap_size, snap_features);
2517 if (rbd_dev->image_format == 2)
2518 return rbd_dev_v2_snap_info(rbd_dev, which,
2519 snap_size, snap_features);
2520 return ERR_PTR(-EINVAL);
2521}
2522
Alex Elder117973f2012-08-31 17:29:55 -05002523static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2524{
2525 int ret;
2526 __u8 obj_order;
2527
2528 down_write(&rbd_dev->header_rwsem);
2529
2530 /* Grab old order first, to see if it changes */
2531
2532 obj_order = rbd_dev->header.obj_order,
2533 ret = rbd_dev_v2_image_size(rbd_dev);
2534 if (ret)
2535 goto out;
2536 if (rbd_dev->header.obj_order != obj_order) {
2537 ret = -EIO;
2538 goto out;
2539 }
2540 rbd_update_mapping_size(rbd_dev);
2541
2542 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2543 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2544 if (ret)
2545 goto out;
2546 ret = rbd_dev_snaps_update(rbd_dev);
2547 dout("rbd_dev_snaps_update returned %d\n", ret);
2548 if (ret)
2549 goto out;
2550 ret = rbd_dev_snaps_register(rbd_dev);
2551 dout("rbd_dev_snaps_register returned %d\n", ret);
2552out:
2553 up_write(&rbd_dev->header_rwsem);
2554
2555 return ret;
2556}
2557
Alex Elder9d475de2012-07-03 16:01:19 -05002558/*
Alex Elder35938152012-08-02 11:29:46 -05002559 * Scan the rbd device's current snapshot list and compare it to the
2560 * newly-received snapshot context. Remove any existing snapshots
2561 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2563 * And verify there are no changes to snapshots we already know
2564 * about.
2565 *
2566 * Assumes the snapshots in the snapshot context are sorted by
2567 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2568 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002569 */
Alex Elder304f6802012-08-31 17:29:52 -05002570static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002571{
Alex Elder35938152012-08-02 11:29:46 -05002572 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2573 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002574 struct list_head *head = &rbd_dev->snaps;
2575 struct list_head *links = head->next;
2576 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002577
Alex Elder9fcbb802012-08-23 23:48:49 -05002578 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002579 while (index < snap_count || links != head) {
2580 u64 snap_id;
2581 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002582 char *snap_name;
2583 u64 snap_size = 0;
2584 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002585
Alex Elder35938152012-08-02 11:29:46 -05002586 snap_id = index < snap_count ? snapc->snaps[index]
2587 : CEPH_NOSNAP;
2588 snap = links != head ? list_entry(links, struct rbd_snap, node)
2589 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002590 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002591
Alex Elder35938152012-08-02 11:29:46 -05002592 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2593 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002594
Alex Elder35938152012-08-02 11:29:46 -05002595 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002596
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002597 if (rbd_dev->spec->snap_id == snap->id)
Alex Elderdaba5fd2012-10-26 17:25:23 -05002598 rbd_dev->exists = false;
Alex Elder41f38c22012-10-25 23:34:40 -05002599 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002600 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002601 rbd_dev->spec->snap_id == snap->id ?
2602 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002603 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002604
Alex Elder35938152012-08-02 11:29:46 -05002605 /* Done with this list entry; advance */
2606
2607 links = next;
2608 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002609 }
Alex Elder35938152012-08-02 11:29:46 -05002610
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002611 snap_name = rbd_dev_snap_info(rbd_dev, index,
2612 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002613 if (IS_ERR(snap_name))
2614 return PTR_ERR(snap_name);
2615
Alex Elder9fcbb802012-08-23 23:48:49 -05002616 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2617 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002618 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2619 struct rbd_snap *new_snap;
2620
2621 /* We haven't seen this snapshot before */
2622
Alex Elderc8d18422012-07-10 20:30:11 -05002623 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002624 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002625 if (IS_ERR(new_snap)) {
2626 int err = PTR_ERR(new_snap);
2627
2628 dout(" failed to add dev, error %d\n", err);
2629
2630 return err;
2631 }
Alex Elder35938152012-08-02 11:29:46 -05002632
2633 /* New goes before existing, or at end of list */
2634
Alex Elder9fcbb802012-08-23 23:48:49 -05002635 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002636 if (snap)
2637 list_add_tail(&new_snap->node, &snap->node);
2638 else
Alex Elder523f3252012-08-30 00:16:37 -05002639 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002640 } else {
2641 /* Already have this one */
2642
Alex Elder9fcbb802012-08-23 23:48:49 -05002643 dout(" already present\n");
2644
Alex Eldercd892122012-07-03 16:01:19 -05002645 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002646 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002647 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002648
2649 /* Done with this list entry; advance */
2650
2651 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002652 }
Alex Elder35938152012-08-02 11:29:46 -05002653
2654 /* Advance to the next entry in the snapshot context */
2655
2656 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002657 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002658 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002659
2660 return 0;
2661}
2662
Alex Elder304f6802012-08-31 17:29:52 -05002663/*
2664 * Scan the list of snapshots and register the devices for any that
2665 * have not already been registered.
2666 */
2667static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2668{
2669 struct rbd_snap *snap;
2670 int ret = 0;
2671
2672 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002673 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2674 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002675
2676 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2677 if (!rbd_snap_registered(snap)) {
2678 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2679 if (ret < 0)
2680 break;
2681 }
2682 }
2683 dout("%s: returning %d\n", __func__, ret);
2684
2685 return ret;
2686}
2687
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002688static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2689{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002690 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002691 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002692
2693 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002694
Alex Eldercd789ab2012-08-30 00:16:38 -05002695 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002696 dev->bus = &rbd_bus_type;
2697 dev->type = &rbd_device_type;
2698 dev->parent = &rbd_root_dev;
2699 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002700 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002701 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002702
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002703 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002704
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002705 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002706}
2707
/*
 * Remove the rbd device from sysfs; undoes rbd_bus_add_dev().
 * The release callback installed there (rbd_dev_release) runs
 * once the last reference to the device goes away.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2712
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002713static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2714{
2715 int ret, rc;
2716
2717 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002718 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002719 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002720 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002721 if (rc < 0)
2722 return rc;
2723 }
2724 } while (ret == -ERANGE);
2725
2726 return ret;
2727}
2728
Alex Eldere2839302012-08-29 17:11:06 -05002729static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002730
2731/*
Alex Elder499afd52012-02-02 08:13:29 -06002732 * Get a unique rbd identifier for the given new rbd_dev, and add
2733 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002734 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Ids come from a monotonically increasing 64-bit counter */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* Publish the device on the global list under the list lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002745
Alex Elder1ddbe942012-01-29 13:57:44 -06002746/*
Alex Elder499afd52012-02-02 08:13:29 -06002747 * Remove an rbd_dev from the global list, and record that its
2748 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002749 */
Alex Eldere2839302012-08-29 17:11:06 -05002750static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002751{
Alex Elderd184f6b2012-01-29 13:57:44 -06002752 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002753 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002754 int max_id;
2755
Alex Elderaafb2302012-09-06 16:00:54 -05002756 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002757
Alex Eldere2839302012-08-29 17:11:06 -05002758 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2759 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002760 spin_lock(&rbd_dev_list_lock);
2761 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002762
2763 /*
2764 * If the id being "put" is not the current maximum, there
2765 * is nothing special we need to do.
2766 */
Alex Eldere2839302012-08-29 17:11:06 -05002767 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002768 spin_unlock(&rbd_dev_list_lock);
2769 return;
2770 }
2771
2772 /*
2773 * We need to update the current maximum id. Search the
2774 * list to find out what it is. We're more likely to find
2775 * the maximum at the end, so search the list backward.
2776 */
2777 max_id = 0;
2778 list_for_each_prev(tmp, &rbd_dev_list) {
2779 struct rbd_device *rbd_dev;
2780
2781 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07002782 if (rbd_dev->dev_id > max_id)
2783 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002784 }
Alex Elder499afd52012-02-02 08:13:29 -06002785 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002786
Alex Elder1ddbe942012-01-29 13:57:44 -06002787 /*
Alex Eldere2839302012-08-29 17:11:06 -05002788 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002789 * which case it now accurately reflects the new maximum.
2790 * Be careful not to overwrite the maximum value in that
2791 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002792 */
Alex Eldere2839302012-08-29 17:11:06 -05002793 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2794 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002795}
2796
Alex Eldera725f65e2012-02-02 08:13:30 -06002797/*
Alex Eldere28fff262012-02-02 08:13:30 -06002798 * Skips over white space at *buf, and updates *buf to point to the
2799 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002800 * the token (string of non-white space characters) found. Note
2801 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002802 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip to start of token */

	return strcspn(*buf, spaces);	/* length of the token */
}
2815
2816/*
2817 * Finds the next token in *buf, and if the provided token buffer is
2818 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002819 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2820 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002821 *
2822 * Returns the length of the token found (not including the '\0').
2823 * Return value will be 0 if no token is found, and it will be >=
2824 * token_size if the token would not fit.
2825 *
Alex Elder593a9e72012-02-07 12:03:37 -06002826 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002827 * found token. Note that this occurs even if the token buffer is
2828 * too small to hold it.
2829 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy (and terminate) only if the caller's buffer fits it */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
2845
2846/*
Alex Elderea3352f2012-07-09 21:04:23 -05002847 * Finds the next token in *buf, dynamically allocates a buffer big
2848 * enough to hold a copy of it, and copies the token into the new
2849 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2850 * that a duplicate buffer is created even for a zero-length token.
2851 *
2852 * Returns a pointer to the newly-allocated duplicate, or a null
2853 * pointer if memory for the duplicate was not available. If
2854 * the lenp argument is a non-null pointer, the length of the token
2855 * (not including the '\0') is returned in *lenp.
2856 *
2857 * If successful, the *buf pointer will be updated to point beyond
2858 * the end of the found token.
2859 *
2860 * Note: uses GFP_KERNEL for allocation.
2861 */
2862static inline char *dup_token(const char **buf, size_t *lenp)
2863{
2864 char *dup;
2865 size_t len;
2866
2867 len = next_token(buf);
2868 dup = kmalloc(len + 1, GFP_KERNEL);
2869 if (!dup)
2870 return NULL;
2871
2872 memcpy(dup, *buf, len);
2873 *(dup + len) = '\0';
2874 *buf += len;
2875
2876 if (lenp)
2877 *lenp = len;
2878
2879 return dup;
2880}
2881
2882/*
Alex Elder859c31d2012-10-25 23:34:42 -05002883 * Parse the options provided for an "rbd add" (i.e., rbd image
2884 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
2885 * and the data written is passed here via a NUL-terminated buffer.
2886 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002887 *
Alex Elder859c31d2012-10-25 23:34:42 -05002888 * The information extracted from these options is recorded in
2889 * the other parameters which return dynamically-allocated
2890 * structures:
2891 * ceph_opts
2892 * The address of a pointer that will refer to a ceph options
2893 * structure. Caller must release the returned pointer using
2894 * ceph_destroy_options() when it is no longer needed.
2895 * rbd_opts
2896 * Address of an rbd options pointer. Fully initialized by
2897 * this function; caller must release with kfree().
2898 * spec
2899 * Address of an rbd image specification pointer. Fully
2900 * initialized by this function based on parsed options.
2901 * Caller must release with rbd_spec_put().
2902 *
2903 * The options passed take this form:
2904 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
2905 * where:
2906 * <mon_addrs>
2907 * A comma-separated list of one or more monitor addresses.
2908 * A monitor address is an ip address, optionally followed
2909 * by a port number (separated by a colon).
2910 * I.e.: ip1[:port1][,ip2[:port2]...]
2911 * <options>
2912 * A comma-separated list of ceph and/or rbd options.
2913 * <pool_name>
2914 * The name of the rados pool containing the rbd image.
2915 * <image_name>
2916 * The name of the image in that pool to map.
2917 * <snap_id>
2918 * An optional snapshot id. If provided, the mapping will
2919 * present data from the image at the time that snapshot was
2920 * created. The image head is used if no snapshot id is
2921 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06002922 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;		/* dup'd ceph/rbd option string; freed here */
	const char *mon_addrs;	/* points into caller's buf, not copied */
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	mon_addrs = buf;
	/* +1 so mon_addrs + mon_addrs_size - 1 points just past the token */
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the empty-token checks below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, &spec->image_name_len);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	memcpy(spec->snap_name, buf, len);
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific tokens are routed to parse_rbd_opts_token() */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);	/* option string no longer needed once parsed */

	/* Success: ownership of all three structures passes to caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/*
	 * spec and rbd_opts may still be NULL here; both kfree() and
	 * rbd_spec_put() are reached with NULL on early error paths,
	 * so both must tolerate it.
	 */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3018
Alex Elder589d30e2012-07-10 20:30:11 -05003019/*
3020 * An rbd format 2 image has a unique identifier, distinct from the
3021 * name given to it by the user. Internally, that identifier is
3022 * what's used to specify the names of objects related to the image.
3023 *
3024 * A special "rbd id" object is used to map an rbd image name to its
3025 * id. If that object doesn't exist, then there is no v2 rbd image
3026 * with the supplied name.
3027 *
3028 * This function will record the given rbd_dev's image_id field if
3029 * it can be determined, and in that case will return 0. If any
3030 * errors occur a negative errno will be returned and the rbd_dev's
3031 * image_id field will be unchanged (and should be NULL).
3032 */
3033static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3034{
3035 int ret;
3036 size_t size;
3037 char *object_name;
3038 void *response;
3039 void *p;
3040
3041 /*
3042 * First, see if the format 2 image id file exists, and if
3043 * so, get the image's persistent id from it.
3044 */
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003045 size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
Alex Elder589d30e2012-07-10 20:30:11 -05003046 object_name = kmalloc(size, GFP_NOIO);
3047 if (!object_name)
3048 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003049 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05003050 dout("rbd id object name is %s\n", object_name);
3051
3052 /* Response will be an encoded string, which includes a length */
3053
3054 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3055 response = kzalloc(size, GFP_NOIO);
3056 if (!response) {
3057 ret = -ENOMEM;
3058 goto out;
3059 }
3060
3061 ret = rbd_req_sync_exec(rbd_dev, object_name,
3062 "rbd", "get_id",
3063 NULL, 0,
3064 response, RBD_IMAGE_ID_LEN_MAX,
3065 CEPH_OSD_FLAG_READ, NULL);
3066 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3067 if (ret < 0)
3068 goto out;
Alex Eldera0ea3a42012-10-10 21:19:13 -07003069 ret = 0; /* rbd_req_sync_exec() can return positive */
Alex Elder589d30e2012-07-10 20:30:11 -05003070
3071 p = response;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003072 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
Alex Elder589d30e2012-07-10 20:30:11 -05003073 p + RBD_IMAGE_ID_LEN_MAX,
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003074 &rbd_dev->spec->image_id_len,
Alex Elder589d30e2012-07-10 20:30:11 -05003075 GFP_NOIO);
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003076 if (IS_ERR(rbd_dev->spec->image_id)) {
3077 ret = PTR_ERR(rbd_dev->spec->image_id);
3078 rbd_dev->spec->image_id = NULL;
Alex Elder589d30e2012-07-10 20:30:11 -05003079 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003080 dout("image_id is %s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003081 }
3082out:
3083 kfree(response);
3084 kfree(object_name);
3085
3086 return ret;
3087}
3088
Alex Eldera30b71b2012-07-10 20:30:11 -05003089static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3090{
3091 int ret;
3092 size_t size;
3093
3094 /* Version 1 images have no id; empty string is used */
3095
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003096 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3097 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003098 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003099 rbd_dev->spec->image_id_len = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05003100
3101 /* Record the header object name for this rbd image. */
3102
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003103 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003104 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3105 if (!rbd_dev->header_name) {
3106 ret = -ENOMEM;
3107 goto out_err;
3108 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003109 sprintf(rbd_dev->header_name, "%s%s",
3110 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003111
3112 /* Populate rbd image metadata */
3113
3114 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3115 if (ret < 0)
3116 goto out_err;
3117 rbd_dev->image_format = 1;
3118
3119 dout("discovered version 1 image, header name is %s\n",
3120 rbd_dev->header_name);
3121
3122 return 0;
3123
3124out_err:
3125 kfree(rbd_dev->header_name);
3126 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003127 kfree(rbd_dev->spec->image_id);
3128 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003129
3130 return ret;
3131}
3132
/*
 * Probe for a format 2 ("v2") rbd image.  The image id has already
 * been read by the caller (rbd_dev_image_id()); this builds the
 * header object name and fetches the v2 metadata (size/order, object
 * prefix, features, snapshot context).  Returns 0 on success or a
 * negative errno; on error the header name and object prefix are
 * freed and NULLed.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3194
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to read the image id object first.  A format 1 image has
	 * no such object, so any failure here means we fall back to
	 * the version 1 probe.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev)
		  : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3219
/*
 * sysfs "add" handler for the rbd bus: parse the user-supplied
 * mapping description in @buf, probe the image, and bring up the
 * block device.  Returns @count on success or a negative errno.
 *
 * The error labels unwind in strict reverse order of acquisition;
 * once rbd_bus_add_dev() has succeeded, most teardown is delegated
 * to the sysfs release path via rbd_bus_del_dev().
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the mapping. */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbd_dev = NULL;	/* (unchanged) */
	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here? no: rc >= 0; NOTE(review): rc holds the pool id (>= 0) on this path — confirm callers treat a non-negative return as success */

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);
	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	rbd_dev->mapping.read_only = rbd_opts->read_only;

	/* Determine image format (1 or 2) and read its header. */
	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_mem;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_probe;

	rc = rbd_dev_set_mapping(rbd_dev);
	if (rc)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* rbd_opts is only needed during setup; free it now. */
	kfree(rbd_opts);

	/* Everything's ready. Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(rbd_opts);

	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);
err_out_probe:
	rbd_header_free(&rbd_dev->header);
	kfree(rbd_dev->header_name);
err_out_mem:
	kfree(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	/* ceph_opts is non-NULL only if ownership never moved to rbdc. */
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3366
Alex Elderde71a292012-07-03 16:01:19 -05003367static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003368{
3369 struct list_head *tmp;
3370 struct rbd_device *rbd_dev;
3371
Alex Eldere124a822012-01-29 13:57:44 -06003372 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003373 list_for_each(tmp, &rbd_dev_list) {
3374 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003375 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003376 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003377 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003378 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003379 }
Alex Eldere124a822012-01-29 13:57:44 -06003380 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003381 return NULL;
3382}
3383
/*
 * Device-model release callback for an rbd device: tears down the
 * watch, drops the client reference, removes the block device, and
 * frees everything rbd_add() allocated.  Runs when the last reference
 * to the embedded struct device is dropped.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before unwatching. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev->rbd_client);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->header_name);
	rbd_dev_id_put(rbd_dev);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3415
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003416static ssize_t rbd_remove(struct bus_type *bus,
3417 const char *buf,
3418 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003419{
3420 struct rbd_device *rbd_dev = NULL;
3421 int target_id, rc;
3422 unsigned long ul;
3423 int ret = count;
3424
3425 rc = strict_strtoul(buf, 10, &ul);
3426 if (rc)
3427 return rc;
3428
3429 /* convert to int; abort if we lost anything in the conversion */
3430 target_id = (int) ul;
3431 if (target_id != ul)
3432 return -EINVAL;
3433
3434 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3435
3436 rbd_dev = __rbd_get_dev(target_id);
3437 if (!rbd_dev) {
3438 ret = -ENOENT;
3439 goto done;
3440 }
3441
Alex Elder41f38c22012-10-25 23:34:40 -05003442 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003443 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003444
3445done:
3446 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003447
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003448 return ret;
3449}
3450
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003451/*
3452 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003453 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003454 */
3455static int rbd_sysfs_init(void)
3456{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003457 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003458
Alex Elderfed4c142012-02-07 12:03:36 -06003459 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003460 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003461 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003462
Alex Elderfed4c142012-02-07 12:03:36 -06003463 ret = bus_register(&rbd_bus_type);
3464 if (ret < 0)
3465 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003466
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003467 return ret;
3468}
3469
/* Remove the sysfs control files, in reverse order of creation. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3475
3476int __init rbd_init(void)
3477{
3478 int rc;
3479
3480 rc = rbd_sysfs_init();
3481 if (rc)
3482 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003483 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003484 return 0;
3485}
3486
/* Module exit: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3491
/* Module entry/exit points and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");