blob: 7d28ce33056fbadd092ecf175f715f99227d1533 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* NUL-terminated copy of the object name prefix */
	u64 features;		/* feature bits (always 0 for format 1 images) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* size of the base image, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot context (ids + seq) */
	char *snap_names;	/* packed, NUL-separated snapshot names */
	u64 *snap_sizes;	/* per-snapshot image sizes, in bytes */

	u64 obj_version;	/* presumably the header object's version when
				 * last read — confirm against refresh path */
};
114
/* User-settable mapping options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
118
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph cluster client */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry on rbd_client_list */
};
127
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* presumably bytes transferred — completion code not
			 * visible in this chunk, confirm */
};
136
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;		/* number of status[] slots allocated */
	int num_done;		/* count of completed requests */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* trailing per-request status;
						 * NOTE(review): could be a C99
						 * flexible array member ([]) */
};
146
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* request length, in bytes */
	int coll_index;			/* this request's slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection the request belongs to */
};
158
/* In-memory record of one snapshot of an image */
struct rbd_snap {
	struct device dev;		/* sysfs device representing the snapshot */
	const char *name;		/* snapshot name */
	u64 size;			/* image size at this snapshot, in bytes */
	struct list_head node;		/* entry on rbd_device->snaps */
	u64 id;				/* snapshot id */
	u64 features;			/* feature bits at this snapshot */
};
167
/* Properties of the image (or snapshot) this device has mapped */
struct rbd_mapping {
	u64 size;		/* mapped size, in bytes */
	u64 features;		/* feature bits of what is mapped */
	bool snap_exists;	/* set when a snapshot (not the base image)
				 * is mapped; see rbd_dev_set_mapping() */
	bool read_only;		/* writable opens refused in rbd_open() */
};
174
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* ceph client, possibly shared */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;	/* image metadata, in-memory form */
	char *image_id;		/* image id (format 2 images) */
	size_t image_id_len;	/* length of image_id */
	char *image_name;	/* user-supplied image name */
	size_t image_name_len;	/* length of image_name */
	char *header_name;	/* name of the image's header object */
	char *pool_name;	/* name of the pool the image lives in */
	u64 pool_id;		/* id of that pool */

	char *snap_name;	/* mapped snapshot name, or "-" for the head */
	u64 snap_id;		/* mapped snapshot id (CEPH_NOSNAP for head) */

	struct ceph_osd_event *watch_event;	/* header watch notification */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what this device currently maps */

	struct list_head node;		/* presumably entry on rbd_dev_list —
					 * list insertion not in this chunk */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
219
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600221
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700222static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
Alex Elder432b8582012-01-29 13:57:44 -0600225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700227
Alex Elder304f6802012-08-31 17:29:52 -0500228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500232static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233
Alex Elderf0f8cef2012-01-29 13:57:44 -0600234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
/* Bus attributes: writing /sys/bus/rbd/{add,remove} maps/unmaps devices */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* No-op release: rbd_root_dev is statically allocated, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Common sysfs ancestor for all rbd devices */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
258
Alex Elderaafb2302012-09-06 16:00:54 -0500259#ifdef RBD_DEBUG
260#define rbd_assert(expr) \
261 if (unlikely(!(expr))) { \
262 printk(KERN_ERR "\nAssertion failure in %s() " \
263 "at line %d:\n\n" \
264 "\trbd_assert(%s);\n\n", \
265 __func__, __LINE__, #expr); \
266 BUG(); \
267 }
268#else /* !RBD_DEBUG */
269# define rbd_assert(expr) ((void) 0)
270#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271
/* Take a reference on the rbd device's embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281
Alex Elder117973f2012-08-31 17:29:55 -0500282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700284
/*
 * Open the block device.  Refuses a writable open of a read-only
 * mapping, and pins the rbd device for as long as it is open.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	/* Propagate the mapping's read-only state to the block layer */
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
297
/* Release the block device: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
306
/* Block device operations: open/release manage the device reference */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
312
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts: either the created client consumes it, or we
 * destroy it on any failure before that handoff.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	/* Nested under the bus-level ctl_mutex held by rbd_add() */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;	/* NOTE(review): reports -ENOMEM rather than
				 * PTR_ERR(rbdc->client) — confirm intended */
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	/* We still own ceph_opts if the client never took it over */
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
360
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when no shareable
 * client matches (or sharing is disabled via CEPH_OPT_NOSHARE).
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		/* ceph_compare_options() returns 0 on a match */
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
385
386/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700387 * mount options
388 */
389enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700390 Opt_last_int,
391 /* int args above */
392 Opt_last_string,
393 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700394 Opt_read_only,
395 Opt_read_write,
396 /* Boolean args above */
397 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700398};
399
Alex Elder43ae4702012-07-03 16:01:18 -0500400static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700401 /* int args above */
402 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500403 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700404 {Opt_read_only, "ro"}, /* Alternate spelling */
405 {Opt_read_write, "read_write"},
406 {Opt_read_write, "rw"}, /* Alternate spelling */
407 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700408 {-1, NULL}
409};
410
/*
 * Callback handed to ceph_parse_options(): handle one rbd-specific
 * option token.  @private is the struct rbd_options being filled in.
 * Returns 0 on success, -EINVAL for an unrecognized option, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Tokens are grouped by argument type; see the Opt_last_* markers */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);	/* every known token must be handled */
		break;
	}
	return 0;
}
451
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Also records the parsed rbd-specific options
 * (currently just read_only) on the device mapping.  On success the
 * client reference is stored in rbd_dev->rbd_client and 0 is returned.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	/* Initialize all rbd options to the defaults */

	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() handles the generic options itself and
	 * feeds rbd-specific tokens back through parse_rbd_opts_token().
	 */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, &rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	/* Record the parsed rbd options */

	rbd_dev->mapping.read_only = rbd_opts.read_only;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
490
/*
 * Destroy ceph client.  kref release callback, invoked when the last
 * reference is dropped; unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself) and tears it down.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
508
/*
 * Drop reference to ceph client node.  If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* device no longer holds a reference */
}
518
/*
 * Destroy requests collection.  kref release callback; the trailing
 * status[] array is allocated inline with the struct, so one kfree()
 * frees everything.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700530
Alex Eldera30b71b2012-07-10 20:30:11 -0500531static bool rbd_image_format_valid(u32 image_format)
532{
533 return image_format == 1 || image_format == 2;
534}
535
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its contents.  Returns false if the header is malformed.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
574
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575/*
576 * Create a new header structure, translate header format from the on-disk
577 * header.
578 */
579static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500580 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700581{
Alex Elderccece232012-07-10 20:30:10 -0500582 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500583 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500584 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500585 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586
Alex Elder6a523252012-07-19 17:12:59 -0500587 memset(header, 0, sizeof (*header));
588
Alex Elder103a1502012-08-02 11:29:45 -0500589 snap_count = le32_to_cpu(ondisk->snap_count);
590
Alex Elder58c17b02012-08-23 23:22:06 -0500591 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
592 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500593 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700594 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500595 memcpy(header->object_prefix, ondisk->object_prefix, len);
596 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600597
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700598 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500599 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
600
Alex Elder621901d2012-08-23 23:22:06 -0500601 /* Save a copy of the snapshot names */
602
Alex Elderf785cc12012-08-23 23:22:06 -0500603 if (snap_names_len > (u64) SIZE_MAX)
604 return -EIO;
605 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500607 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500608 /*
609 * Note that rbd_dev_v1_header_read() guarantees
610 * the ondisk buffer we're working with has
611 * snap_names_len bytes beyond the end of the
612 * snapshot id array, this memcpy() is safe.
613 */
614 memcpy(header->snap_names, &ondisk->snaps[snap_count],
615 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500616
Alex Elder621901d2012-08-23 23:22:06 -0500617 /* Record each snapshot's size */
618
Alex Elderd2bb24e2012-07-26 23:37:14 -0500619 size = snap_count * sizeof (*header->snap_sizes);
620 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700621 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500622 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500623 for (i = 0; i < snap_count; i++)
624 header->snap_sizes[i] =
625 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700626 } else {
Alex Elderccece232012-07-10 20:30:10 -0500627 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700628 header->snap_names = NULL;
629 header->snap_sizes = NULL;
630 }
Alex Elder849b4262012-07-09 21:04:24 -0500631
Alex Elder34b13182012-07-13 20:35:12 -0500632 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700633 header->obj_order = ondisk->options.order;
634 header->crypt_type = ondisk->options.crypt_type;
635 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500636
Alex Elder621901d2012-08-23 23:22:06 -0500637 /* Allocate and fill in the snapshot context */
638
Alex Elderf84344f2012-08-31 17:29:51 -0500639 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500640 size = sizeof (struct ceph_snap_context);
641 size += snap_count * sizeof (header->snapc->snaps[0]);
642 header->snapc = kzalloc(size, GFP_KERNEL);
643 if (!header->snapc)
644 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700645
646 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500647 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500649 for (i = 0; i < snap_count; i++)
650 header->snapc->snaps[i] =
651 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652
653 return 0;
654
Alex Elder6a523252012-07-19 17:12:59 -0500655out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500656 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500657 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500659 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500660 kfree(header->object_prefix);
661 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500662
Alex Elder00f1f362012-02-07 12:03:36 -0600663 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700664}
665
Alex Elder8836b992012-08-30 14:42:15 -0500666static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668
Alex Eldere86924a2012-07-10 20:30:11 -0500669 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600670
Alex Eldere86924a2012-07-10 20:30:11 -0500671 list_for_each_entry(snap, &rbd_dev->snaps, node) {
672 if (!strcmp(snap_name, snap->name)) {
Alex Elder971f8392012-10-25 23:34:41 -0500673 rbd_dev->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500674 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500675 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600676
Alex Eldere86924a2012-07-10 20:30:11 -0500677 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600678 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700679 }
Alex Eldere86924a2012-07-10 20:30:11 -0500680
Alex Elder00f1f362012-02-07 12:03:36 -0600681 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700682}
683
Alex Elder5ed16172012-08-29 17:11:07 -0500684static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685{
Alex Elder78dc4472012-07-19 08:49:18 -0500686 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700687
Alex Elder4e1105a2012-08-31 17:29:52 -0500688 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800689 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder971f8392012-10-25 23:34:41 -0500690 rbd_dev->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500691 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500692 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Elderf84344f2012-08-31 17:29:51 -0500693 rbd_dev->mapping.snap_exists = false;
Alex Eldere86924a2012-07-10 20:30:11 -0500694 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700695 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500696 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697 if (ret < 0)
698 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500699 rbd_dev->mapping.snap_exists = true;
700 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700701 }
Alex Elder971f8392012-10-25 23:34:41 -0500702 rbd_dev->snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704 return ret;
705}
706
707static void rbd_header_free(struct rbd_image_header *header)
708{
Alex Elder849b4262012-07-09 21:04:24 -0500709 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500710 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700711 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500712 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500713 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500714 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800715 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500716 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700717}
718
/*
 * Return the name of the object backing the segment that contains
 * image offset @offset: "<object_prefix>.<segment number, 12 hex digits>".
 * The caller must kfree() the result.  Returns NULL on allocation or
 * formatting failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	/* GFP_NOIO: may be called on the I/O path */
	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700740
Alex Elder65ccfe22012-08-09 10:33:26 -0700741static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
742{
743 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700744
Alex Elder65ccfe22012-08-09 10:33:26 -0700745 return offset & (segment_size - 1);
746}
747
748static u64 rbd_segment_length(struct rbd_device *rbd_dev,
749 u64 offset, u64 length)
750{
751 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
752
753 offset &= segment_size - 1;
754
Alex Elderaafb2302012-09-06 16:00:54 -0500755 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700756 if (offset + length > segment_size)
757 length = segment_size - offset;
758
759 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700760}
761
/*
 * Return the number of objects (segments) spanned by the byte range
 * [ofs, ofs + len).  Returns 0 for an empty range and -ERANGE if
 * ofs + len would wrap past the end of a u64.
 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700762static int rbd_get_num_segments(struct rbd_image_header *header,
 763 u64 ofs, u64 len)
 764{
Alex Elderdf111be2012-08-09 10:33:26 -0700765 u64 start_seg;
 766 u64 end_seg;
 767
 768 if (!len)
 769 return 0;
	/* Written as len - 1 > U64_MAX - ofs so the sum never wraps here */
 770 if (len - 1 > U64_MAX - ofs)
 771 return -ERANGE;
 772
	/* First and last segment touched; len != 0 so ofs + len - 1 is valid */
 773 start_seg = ofs >> header->obj_order;
 774 end_seg = (ofs + len - 1) >> header->obj_order;
 775
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700776 return end_seg - start_seg + 1;
 777}
778
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700779/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700780 * returns the size of an object in the image
781 */
782static u64 rbd_obj_bytes(struct rbd_image_header *header)
783{
784 return 1 << header->obj_order;
785}
786
787/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700788 * bio helpers
789 */
790
791static void bio_chain_put(struct bio *chain)
792{
793 struct bio *tmp;
794
795 while (chain) {
796 tmp = chain;
797 chain = chain->bi_next;
798 bio_put(tmp);
799 }
800}
801
802/*
 803 * zeros a bio chain, starting at specific offset
 804 */
/*
 * Zero the payload of a bio chain from byte offset start_ofs
 * (measured from the start of the chain) through the end.  Used to
 * clear data beyond what the OSD actually returned on a short or
 * failed read.  bvec_kmap_irq()/bvec_kunmap_irq() are used, so this
 * is safe to call from the osd completion callback in atomic context.
 */
805static void zero_bio_chain(struct bio *chain, int start_ofs)
806{
 807 struct bio_vec *bv;
 808 unsigned long flags;
 809 void *buf;
 810 int i;
 811 int pos = 0;
 812
 813 while (chain) {
 814 bio_for_each_segment(bv, chain, i) {
 815 if (pos + bv->bv_len > start_ofs) {
	/* Only the tail of a segment straddling start_ofs is zeroed */
 816 int remainder = max(start_ofs - pos, 0);
 817 buf = bvec_kmap_irq(bv, &flags);
 818 memset(buf + remainder, 0,
 819 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200820 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700821 }
 822 pos += bv->bv_len;
 823 }
 824
 825 chain = chain->bi_next;
 826 }
 827}
828
829/*
Alex Elderf7760da2012-10-20 22:17:27 -0500830 * Clone a portion of a bio, starting at the given byte offset
 831 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700832 */
/*
 * The clone shares the source bio's pages: only the bio_vec entries
 * covering [offset, offset + len) are copied, then the first and
 * last entries are trimmed to the exact byte range.  Returns NULL
 * on invalid arguments or allocation failure.
 */
Alex Elderf7760da2012-10-20 22:17:27 -0500833static struct bio *bio_clone_range(struct bio *bio_src,
 834 unsigned int offset,
 835 unsigned int len,
 836 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700837{
Alex Elderf7760da2012-10-20 22:17:27 -0500838 struct bio_vec *bv;
 839 unsigned int resid;
 840 unsigned short idx;
 841 unsigned int voff;
 842 unsigned short end_idx;
 843 unsigned short vcnt;
 844 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700845
Alex Elderf7760da2012-10-20 22:17:27 -0500846 /* Handle the easy case for the caller */
 847
 848 if (!offset && len == bio_src->bi_size)
 849 return bio_clone(bio_src, gfpmask);
 850
	/* Reject an empty or out-of-range request */
 851 if (WARN_ON_ONCE(!len))
 852 return NULL;
 853 if (WARN_ON_ONCE(len > bio_src->bi_size))
 854 return NULL;
 855 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
 856 return NULL;
 857
 858 /* Find first affected segment... */
 859
 860 resid = offset;
 861 __bio_for_each_segment(bv, bio_src, idx, 0) {
 862 if (resid < bv->bv_len)
 863 break;
 864 resid -= bv->bv_len;
 865 }
	/* voff: byte offset of the range within the first segment */
 866 voff = resid;
 867
 868 /* ...and the last affected segment */
 869
 870 resid += len;
 871 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
 872 if (resid <= bv->bv_len)
 873 break;
 874 resid -= bv->bv_len;
 875 }
	/* resid now holds the number of bytes used in the last segment */
 876 vcnt = end_idx - idx + 1;
 877
 878 /* Build the clone */
 879
 880 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
 881 if (!bio)
 882 return NULL; /* ENOMEM */
 883
 884 bio->bi_bdev = bio_src->bi_bdev;
 885 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
 886 bio->bi_rw = bio_src->bi_rw;
	/* Pages belong to bio_src; mark this bio as a clone */
 887 bio->bi_flags |= 1 << BIO_CLONED;
 888
 889 /*
 890 * Copy over our part of the bio_vec, then update the first
 891 * and last (or only) entries.
 892 */
 893 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
 894 vcnt * sizeof (struct bio_vec));
 895 bio->bi_io_vec[0].bv_offset += voff;
 896 if (vcnt > 1) {
 897 bio->bi_io_vec[0].bv_len -= voff;
 898 bio->bi_io_vec[vcnt - 1].bv_len = resid;
 899 } else {
 900 bio->bi_io_vec[0].bv_len = len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700901 }
 902
Alex Elderf7760da2012-10-20 22:17:27 -0500903 bio->bi_vcnt = vcnt;
 904 bio->bi_size = len;
 905 bio->bi_idx = 0;
Alex Elder542582f2012-08-09 10:33:25 -0700906
Alex Elderf7760da2012-10-20 22:17:27 -0500907 return bio;
 908}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700909
Alex Elderf7760da2012-10-20 22:17:27 -0500910/*
 911 * Clone a portion of a bio chain, starting at the given byte offset
 912 * into the first bio in the source chain and continuing for the
 913 * number of bytes indicated.  The result is another bio chain of
 914 * exactly the given length, or a null pointer on error.
 915 *
 916 * The bio_src and offset parameters are both in-out.  On entry they
 917 * refer to the first source bio and the offset into that bio where
 918 * the start of data to be cloned is located.
 919 *
 920 * On return, bio_src is updated to refer to the bio in the source
 921 * chain that contains first un-cloned byte, and *offset will
 922 * contain the offset of that byte within that bio.
 923 */
924static struct bio *bio_chain_clone_range(struct bio **bio_src,
 925 unsigned int *offset,
 926 unsigned int len,
 927 gfp_t gfpmask)
 928{
 929 struct bio *bi = *bio_src;
 930 unsigned int off = *offset;
 931 struct bio *chain = NULL;
	/* end always points at the bi_next slot to append the next clone to */
 932 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700933
Alex Elderf7760da2012-10-20 22:17:27 -0500934 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700935
Alex Elderf7760da2012-10-20 22:17:27 -0500936 if (!bi || off >= bi->bi_size || !len)
 937 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700938
Alex Elderf7760da2012-10-20 22:17:27 -0500939 end = &chain;
 940 while (len) {
 941 unsigned int bi_size;
 942 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700943
	/* Source chain ended before len bytes were cloned: error */
Alex Elderf7760da2012-10-20 22:17:27 -0500944 if (!bi)
 945 goto out_err; /* EINVAL; ran out of bio's */
 946 bi_size = min_t(unsigned int, bi->bi_size - off, len);
 947 bio = bio_clone_range(bi, off, bi_size, gfpmask);
 948 if (!bio)
 949 goto out_err; /* ENOMEM */
 950
 951 *end = bio;
 952 end = &bio->bi_next;
 953
 954 off += bi_size;
	/* Consumed this source bio entirely; advance to the next one */
 955 if (off == bi->bi_size) {
 956 bi = bi->bi_next;
 957 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958 }
Alex Elderf7760da2012-10-20 22:17:27 -0500959 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700960 }
Alex Elderf7760da2012-10-20 22:17:27 -0500961 *bio_src = bi;
 962 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700963
Alex Elderf7760da2012-10-20 22:17:27 -0500964 return chain;
 965out_err:
	/* Drop any clones already built before reporting failure */
 966 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 return NULL;
 969}
970
971/*
972 * helpers for osd request op vectors.
973 */
Alex Elder57cfc102012-06-26 12:57:03 -0700974static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
975 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700976{
Alex Elder57cfc102012-06-26 12:57:03 -0700977 struct ceph_osd_req_op *ops;
978
979 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
980 if (!ops)
981 return NULL;
982
983 ops[0].op = opcode;
984
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700985 /*
986 * op extent offset and length will be set later on
987 * in calc_raw_layout()
988 */
Alex Elder57cfc102012-06-26 12:57:03 -0700989 ops[0].payload_len = payload_len;
990
991 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700992}
993
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op) */
994static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
995{
 996 kfree(ops);
 997}
998
/*
 * Record completion of one request in a collection and complete, in
 * order, the longest prefix of finished requests against the block
 * request rq.  With no collection the whole request is ended at once.
 * The collection state is protected by the queue lock.
 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700999static void rbd_coll_end_req_index(struct request *rq,
 1000 struct rbd_req_coll *coll,
 1001 int index,
 1002 int ret, u64 len)
 1003{
 1004 struct request_queue *q;
 1005 int min, max, i;
 1006
Alex Elderbd919d42012-07-13 20:35:11 -05001007 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
 1008 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001009
 1010 if (!rq)
 1011 return;
 1012
 1013 if (!coll) {
 1014 blk_end_request(rq, ret, len);
 1015 return;
 1016 }
 1017
 1018 q = rq->q;
 1019
 1020 spin_lock_irq(q->queue_lock);
 1021 coll->status[index].done = 1;
 1022 coll->status[index].rc = ret;
 1023 coll->status[index].bytes = len;
	/* Find how far the contiguous run of completed requests extends */
 1024 max = min = coll->num_done;
 1025 while (max < coll->total && coll->status[max].done)
 1026 max++;
 1027
	/* Complete them in order; each drops one reference on the coll */
 1028 for (i = min; i<max; i++) {
 1029 __blk_end_request(rq, coll->status[i].rc,
 1030 coll->status[i].bytes);
 1031 coll->num_done++;
 1032 kref_put(&coll->kref, rbd_coll_release);
 1033 }
 1034 spin_unlock_irq(q->queue_lock);
 1035}
1036
/* Convenience wrapper: complete req's slot in its own collection */
1037static void rbd_coll_end_req(struct rbd_request *req,
 1038 int ret, u64 len)
 1039{
 1040 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
 1041}
1042
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001043/*
 1044 * Send ceph osd request
 1045 */
/*
 * Build and submit one OSD request for the named object.  Data is
 * carried either by a bio chain or by a page vector.  If rbd_cb is
 * NULL the call is synchronous: we wait for the reply and put the
 * request here; otherwise the callback owns completion.  If
 * linger_req is non-NULL the request is registered to linger (used
 * for watch) and returned to the caller.  On error before
 * submission, any collection slot is completed with the error.
 */
1046static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001047 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001048 struct ceph_snap_context *snapc,
 1049 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001050 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001051 struct bio *bio,
 1052 struct page **pages,
 1053 int num_pages,
 1054 int flags,
 1055 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001056 struct rbd_req_coll *coll,
 1057 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001058 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001059 struct ceph_msg *msg),
 1060 struct ceph_osd_request **linger_req,
 1061 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001062{
 1063 struct ceph_osd_request *req;
 1064 struct ceph_file_layout *layout;
 1065 int ret;
 1066 u64 bno;
 1067 struct timespec mtime = CURRENT_TIME;
 1068 struct rbd_request *req_data;
 1069 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -06001070 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001071
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001072 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001073 if (!req_data) {
 1074 if (coll)
 1075 rbd_coll_end_req_index(rq, coll, coll_index,
 1076 -ENOMEM, len);
 1077 return -ENOMEM;
 1078 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001079
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001080 if (coll) {
 1081 req_data->coll = coll;
 1082 req_data->coll_index = coll_index;
 1083 }
 1084
Alex Elderf7760da2012-10-20 22:17:27 -05001085 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
 1086 object_name, (unsigned long long) ofs,
 1087 (unsigned long long) len, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001088
Alex Elder0ce1a792012-07-03 16:01:18 -05001089 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -06001090 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
 1091 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -07001092 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -07001093 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001094 goto done_pages;
 1095 }
 1096
 1097 req->r_callback = rbd_cb;
 1098
 1099 req_data->rq = rq;
 1100 req_data->bio = bio;
 1101 req_data->pages = pages;
 1102 req_data->len = len;
 1103
 1104 req->r_priv = req_data;
 1105
 1106 reqhead = req->r_request->front.iov_base;
 1107 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
 1108
	/*
	 * NOTE(review): if object_name were exactly sizeof(req->r_oid)
	 * bytes, strncpy() would leave r_oid unterminated and the
	 * strlen() below would overrun.  Segment names are built to
	 * fit (RBD_MAX_SEG_NAME_LEN), but verify for other callers.
	 */
Alex Elderaded07e2012-07-03 16:01:18 -05001109 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001110 req->r_oid_len = strlen(req->r_oid);
 1111
	/* Each rbd object is a single stripe/object of the maximum order */
 1112 layout = &req->r_file_layout;
 1113 memset(layout, 0, sizeof(*layout));
 1114 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 1115 layout->fl_stripe_count = cpu_to_le32(1);
 1116 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder86992092012-10-25 23:34:41 -05001117 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
Sage Weil6cae3712012-09-24 21:02:47 -07001118 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
 1119 req, ops);
 1120 rbd_assert(ret == 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001121
 1122 ceph_osdc_build_request(req, ofs, &len,
 1123 ops,
 1124 snapc,
 1125 &mtime,
 1126 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001127
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001128 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001129 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001130 *linger_req = req;
 1131 }
 1132
Alex Elder1dbb4392012-01-24 10:08:37 -06001133 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001134 if (ret < 0)
 1135 goto done_err;
 1136
	/* No callback supplied: wait synchronously and report the version */
 1137 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001138 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001139 if (ver)
 1140 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001141 dout("reassert_ver=%llu\n",
 1142 (unsigned long long)
 1143 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001144 ceph_osdc_put_request(req);
 1145 }
 1146 return ret;
 1147
 1148done_err:
 1149 bio_chain_put(req_data->bio);
 1150 ceph_osdc_put_request(req);
 1151done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001152 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001153 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001154 return ret;
 1155}
1156
1157/*
 1158 * Ceph osd op callback
 1159 */
/*
 * Completion callback for asynchronous I/O requests.  For reads,
 * -ENOENT (nonexistent object) is treated as a successful read of
 * zeros, and a short read has its unread tail zeroed, with the full
 * requested length reported back to the block layer.
 */
1160static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 1161{
 1162 struct rbd_request *req_data = req->r_priv;
 1163 struct ceph_osd_reply_head *replyhead;
 1164 struct ceph_osd_op *op;
 1165 __s32 rc;
 1166 u64 bytes;
 1167 int read_op;
 1168
 1169 /* parse reply */
 1170 replyhead = msg->front.iov_base;
 1171 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
 1172 op = (void *)(replyhead + 1);
 1173 rc = le32_to_cpu(replyhead->result);
 1174 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001175 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001176
Alex Elderbd919d42012-07-13 20:35:11 -05001177 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
 1178 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001179
	/* Object doesn't exist: a read of a hole returns zeros */
 1180 if (rc == -ENOENT && read_op) {
 1181 zero_bio_chain(req_data->bio, 0);
 1182 rc = 0;
 1183 } else if (rc == 0 && read_op && bytes < req_data->len) {
	/* Short read: zero the remainder, report the full length */
 1184 zero_bio_chain(req_data->bio, bytes);
 1185 bytes = req_data->len;
 1186 }
 1187
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001188 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001189
 1190 if (req_data->bio)
 1191 bio_chain_put(req_data->bio);
 1192
 1193 ceph_osdc_put_request(req);
 1194 kfree(req_data);
 1195}
1196
/* Minimal completion callback: just drop the request reference */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001197static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 1198{
 1199 ceph_osdc_put_request(req);
 1200}
1201
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001202/*
 1203 * Do a synchronous ceph osd operation
 1204 */
/*
 * Allocate a page vector for the data, submit the request via
 * rbd_do_request() with no callback (so it waits for completion),
 * and for reads copy the returned data into the inbound buffer.
 * Returns the number of bytes transferred or a negative errno.
 */
1205static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001206 struct ceph_snap_context *snapc,
 1207 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001208 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001209 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001210 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001211 u64 ofs, u64 inbound_size,
 1212 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001213 struct ceph_osd_request **linger_req,
 1214 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001215{
 1216 int ret;
 1217 struct page **pages;
 1218 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001219
Alex Elderaafb2302012-09-06 16:00:54 -05001220 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001221
Alex Elderf8d4de62012-07-03 16:01:19 -05001222 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001223 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001224 if (IS_ERR(pages))
 1225 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001226
Alex Elder0ce1a792012-07-03 16:01:18 -05001227 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderf8d4de62012-07-03 16:01:19 -05001228 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001229 pages, num_pages,
 1230 flags,
 1231 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001232 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001233 NULL,
 1234 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001235 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001236 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001237
	/* ret is the byte count transferred; copy that much back for reads */
Alex Elderf8d4de62012-07-03 16:01:19 -05001238 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
 1239 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001240
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001241done:
 1242 ceph_release_page_vector(pages, num_pages);
 1243 return ret;
 1244}
1245
1246/*
 1247 * Do an asynchronous ceph osd operation
 1248 */
/*
 * Issue one asynchronous read or write (chosen from the block
 * request's data direction) against the single object segment that
 * contains the byte range [ofs, ofs + len).  Completion is reported
 * through the collection slot via rbd_req_cb().
 */
1249static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001250 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001251 struct ceph_snap_context *snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001252 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001253 struct bio *bio,
 1254 struct rbd_req_coll *coll,
 1255 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001256{
 1257 char *seg_name;
 1258 u64 seg_ofs;
 1259 u64 seg_len;
 1260 int ret;
 1261 struct ceph_osd_req_op *ops;
 1262 u32 payload_len;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001263 int opcode;
 1264 int flags;
Alex Elder46342462012-10-10 18:59:29 -07001265 u64 snapid;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001266
Alex Elder65ccfe22012-08-09 10:33:26 -07001267 seg_name = rbd_segment_name(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001268 if (!seg_name)
 1269 return -ENOMEM;
Alex Elder65ccfe22012-08-09 10:33:26 -07001270 seg_len = rbd_segment_length(rbd_dev, ofs, len);
 1271 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001272
	/* Writes use the snap context; reads target the mapped snapshot id */
Alex Elderff2e4bb2012-10-10 18:59:29 -07001273 if (rq_data_dir(rq) == WRITE) {
 1274 opcode = CEPH_OSD_OP_WRITE;
 1275 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
Alex Elder46342462012-10-10 18:59:29 -07001276 snapid = CEPH_NOSNAP;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001277 payload_len = seg_len;
 1278 } else {
 1279 opcode = CEPH_OSD_OP_READ;
 1280 flags = CEPH_OSD_FLAG_READ;
Alex Elder46342462012-10-10 18:59:29 -07001281 snapc = NULL;
Alex Elder971f8392012-10-25 23:34:41 -05001282 snapid = rbd_dev->snap_id;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001283 payload_len = 0;
 1284 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001285
Alex Elder57cfc102012-06-26 12:57:03 -07001286 ret = -ENOMEM;
 1287 ops = rbd_create_rw_ops(1, opcode, payload_len);
 1288 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001289 goto done;
 1290
 1291 /* we've taken care of segment sizes earlier when we
 1292 cloned the bios. We should never have a segment
 1293 truncated at this point */
Alex Elderaafb2302012-09-06 16:00:54 -05001294 rbd_assert(seg_len == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001295
 1296 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
 1297 seg_name, seg_ofs, seg_len,
 1298 bio,
 1299 NULL, 0,
 1300 flags,
 1301 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001302 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001303 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001304
	/* The request holds no reference to ops; safe to free now */
 1305 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001306done:
 1307 kfree(seg_name);
 1308 return ret;
 1309}
1310
1311/*
 1312 * Request sync osd read
 1313 */
/*
 * Synchronously read [ofs, ofs + len) of the named object at the
 * given snapshot into buf; optionally returns the object version
 * through ver.  Returns bytes read or a negative errno.
 */
1314static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001315 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001316 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001317 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001318 char *buf,
 1319 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001320{
Alex Elder913d2fd2012-06-26 12:57:03 -07001321 struct ceph_osd_req_op *ops;
 1322 int ret;
 1323
 1324 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
 1325 if (!ops)
 1326 return -ENOMEM;
 1327
 1328 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001329 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001330 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001331 ops, object_name, ofs, len, buf, NULL, ver);
 1332 rbd_destroy_ops(ops);
 1333
 1334 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001335}
1336
1337/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001338 * Acknowledge a watch notification (the old comment said "watch",
 1339 * but this sends CEPH_OSD_OP_NOTIFY_ACK; completion is fire-and-forget
 * via rbd_simple_req_cb, which just drops the request reference).
 */
1340static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001341 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001342 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001343{
 1344 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001345 int ret;
 1346
Alex Elder57cfc102012-06-26 12:57:03 -07001347 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
 1348 if (!ops)
 1349 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001350
	/* Echo back the header version and the notification's cookie */
Josh Durgina71b8912011-12-05 18:10:44 -08001351 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001352 ops[0].watch.cookie = notify_id;
 1353 ops[0].watch.flag = 0;
 1354
Alex Elder0ce1a792012-07-03 16:01:18 -05001355 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001356 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001357 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001358 CEPH_OSD_FLAG_READ,
 1359 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001360 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001361 rbd_simple_req_cb, 0, NULL);
 1362
 1363 rbd_destroy_ops(ops);
 1364 return ret;
 1365}
1366
/*
 * Watch event callback for the header object: refresh the device's
 * view of the image header, then acknowledge the notification so the
 * OSD stops redelivering it.
 */
1367static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 1368{
Alex Elder0ce1a792012-07-03 16:01:18 -05001369 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001370 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001371 int rc;
 1372
Alex Elder0ce1a792012-07-03 16:01:18 -05001373 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001374 return;
 1375
Alex Elderbd919d42012-07-13 20:35:11 -05001376 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
 1377 rbd_dev->header_name, (unsigned long long) notify_id,
 1378 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001379 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001380 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001381 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001382 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001383
	/* Ack even if the refresh failed, so the notify isn't resent forever */
Alex Elder7f0a24d2012-07-25 09:32:40 -05001384 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001385}
1386
1387/*
 1388 * Request sync osd watch
 1389 */
/*
 * Register a watch on the image header object so rbd_watch_cb() gets
 * called on header changes.  Creates the osd event first, then sends
 * a lingering WATCH request (linger keeps it registered across osd
 * reconnects).  On failure the event is cancelled and cleared.
 */
1390static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001391{
 1392 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001393 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001394 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001395
Alex Elder57cfc102012-06-26 12:57:03 -07001396 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
 1397 if (!ops)
 1398 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001399
 1400 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001401 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001402 if (ret < 0)
 1403 goto fail;
 1404
	/* flag = 1 registers the watch (see rbd_req_sync_unwatch for 0) */
Alex Elder0e6f3222012-07-25 09:32:40 -05001405 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001406 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001407 ops[0].watch.flag = 1;
 1408
Alex Elder0ce1a792012-07-03 16:01:18 -05001409 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001410 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001411 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 1412 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001413 rbd_dev->header_name,
 1414 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001415 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001416
 1417 if (ret < 0)
 1418 goto fail_event;
 1419
 1420 rbd_destroy_ops(ops);
 1421 return 0;
 1422
 1423fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001424 ceph_osdc_cancel_event(rbd_dev->watch_event);
 1425 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001426fail:
 1427 rbd_destroy_ops(ops);
 1428 return ret;
 1429}
1430
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001431/*
 1432 * Request sync osd unwatch
 1433 */
/*
 * Tear down the header watch: send a WATCH op with flag = 0 to
 * unregister, then cancel and clear the associated osd event.
 */
1434static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001435{
 1436 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001437 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001438
Alex Elder57cfc102012-06-26 12:57:03 -07001439 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
 1440 if (!ops)
 1441 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001442
	/* flag = 0 with the original cookie unregisters the watch */
 1443 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001444 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001445 ops[0].watch.flag = 0;
 1446
Alex Elder0ce1a792012-07-03 16:01:18 -05001447 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001448 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001449 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 1450 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001451 rbd_dev->header_name,
 1452 0, 0, NULL, NULL, NULL);
 1453
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001454
 1455 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001456 ceph_osdc_cancel_event(rbd_dev->watch_event);
 1457 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001458 return ret;
 1459}
1460
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001461/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001462 * Synchronous osd object method call
 1463 */
/*
 * Invoke class_name.method_name on the named object via a CALL op.
 * outbound/outbound_size carry the method's input payload; the
 * method's reply (up to inbound_size bytes) is copied into inbound.
 * Returns bytes received or a negative errno.
 */
1464static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001465 const char *object_name,
 1466 const char *class_name,
 1467 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001468 const char *outbound,
 1469 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001470 char *inbound,
 1471 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001472 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001473 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474{
 1475 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001476 int class_name_len = strlen(class_name);
 1477 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001478 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001479 int ret;
 1480
Alex Elder3cb4a682012-06-26 12:57:03 -07001481 /*
 1482 * Any input parameters required by the method we're calling
 1483 * will be sent along with the class and method names as
 1484 * part of the message payload. That data and its size are
 1485 * supplied via the indata and indata_len fields (named from
 1486 * the perspective of the server side) in the OSD request
 1487 * operation.
 1488 */
 1489 payload_size = class_name_len + method_name_len + outbound_size;
 1490 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001491 if (!ops)
 1492 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001493
Alex Elderaded07e2012-07-03 16:01:18 -05001494 ops[0].cls.class_name = class_name;
 1495 ops[0].cls.class_len = (__u8) class_name_len;
 1496 ops[0].cls.method_name = method_name;
 1497 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001498 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001499 ops[0].cls.indata = outbound;
 1500 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001501
Alex Elder0ce1a792012-07-03 16:01:18 -05001502 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001503 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001504 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001505 object_name, 0, inbound_size, inbound,
 1506 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001507
 1508 rbd_destroy_ops(ops);
 1509
 1510 dout("cls_exec returned %d\n", ret);
 1511 return ret;
 1512}
1513
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001514static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1515{
1516 struct rbd_req_coll *coll =
1517 kzalloc(sizeof(struct rbd_req_coll) +
1518 sizeof(struct rbd_req_status) * num_reqs,
1519 GFP_ATOMIC);
1520
1521 if (!coll)
1522 return NULL;
1523 coll->total = num_reqs;
1524 kref_init(&coll->kref);
1525 return coll;
1526}
1527
/*
 * Block device request_fn callback.  Drains the request queue,
 * splitting each filesystem request into one OSD operation per rbd
 * object (segment) it touches.  Per the block layer contract the
 * queue lock is held on entry; it is dropped while segments are
 * submitted and retaken before completing a request or fetching the
 * next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Drop the queue lock while we talk to the OSDs */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been deleted out from under us */
		if (rbd_dev->snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Pin the snapshot context used for this whole request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		/* One completion-status slot per segment */
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			/* Each in-flight segment holds a coll reference */
			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				/* Clone failed: record -ENOMEM for this slot */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop the allocation's own reference to the collection */
		kref_put(&coll->kref, rbd_coll_release);

		/* Retake the lock for the next blk_fetch_request() */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1637
1638/*
1639 * a queue callback. Makes sure that we don't create a bio that spans across
1640 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001641 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001642 */
1643static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1644 struct bio_vec *bvec)
1645{
1646 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed2012-10-20 22:17:27 -05001647 sector_t sector_offset;
1648 sector_t sectors_per_obj;
1649 sector_t obj_sector_offset;
1650 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001651
Alex Eldere5cfeed2012-10-20 22:17:27 -05001652 /*
1653 * Find how far into its rbd object the partition-relative
1654 * bio start sector is to offset relative to the enclosing
1655 * device.
1656 */
1657 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1658 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1659 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001660
Alex Eldere5cfeed2012-10-20 22:17:27 -05001661 /*
1662 * Compute the number of bytes from that offset to the end
1663 * of the object. Account for what's already used by the bio.
1664 */
1665 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1666 if (ret > bmd->bi_size)
1667 ret -= bmd->bi_size;
1668 else
1669 ret = 0;
1670
1671 /*
1672 * Don't send back more than was asked for. And if the bio
1673 * was empty, let the whole thing through because: "Note
1674 * that a block device *must* allow a single page to be
1675 * added to an empty bio."
1676 */
1677 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1678 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1679 ret = (int) bvec->bv_len;
1680
1681 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001682}
1683
1684static void rbd_free_disk(struct rbd_device *rbd_dev)
1685{
1686 struct gendisk *disk = rbd_dev->disk;
1687
1688 if (!disk)
1689 return;
1690
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001691 if (disk->flags & GENHD_FL_UP)
1692 del_gendisk(disk);
1693 if (disk->queue)
1694 blk_cleanup_queue(disk->queue);
1695 put_disk(disk);
1696}
1697
1698/*
Alex Elder4156d992012-08-02 11:29:46 -05001699 * Read the complete header for the given rbd device.
1700 *
1701 * Returns a pointer to a dynamically-allocated buffer containing
1702 * the complete and validated header. Caller can pass the address
1703 * of a variable that will be filled in with the version of the
1704 * header object at the time it was read.
1705 *
1706 * Returns a pointer-coded errno if a failure occurs.
1707 */
1708static struct rbd_image_header_ondisk *
1709rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1710{
1711 struct rbd_image_header_ondisk *ondisk = NULL;
1712 u32 snap_count = 0;
1713 u64 names_size = 0;
1714 u32 want_count;
1715 int ret;
1716
1717 /*
1718 * The complete header will include an array of its 64-bit
1719 * snapshot ids, followed by the names of those snapshots as
1720 * a contiguous block of NUL-terminated strings. Note that
1721 * the number of snapshots could change by the time we read
1722 * it in, in which case we re-read it.
1723 */
1724 do {
1725 size_t size;
1726
1727 kfree(ondisk);
1728
1729 size = sizeof (*ondisk);
1730 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1731 size += names_size;
1732 ondisk = kmalloc(size, GFP_KERNEL);
1733 if (!ondisk)
1734 return ERR_PTR(-ENOMEM);
1735
1736 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1737 rbd_dev->header_name,
1738 0, size,
1739 (char *) ondisk, version);
1740
1741 if (ret < 0)
1742 goto out_err;
1743 if (WARN_ON((size_t) ret < size)) {
1744 ret = -ENXIO;
1745 pr_warning("short header read for image %s"
1746 " (want %zd got %d)\n",
1747 rbd_dev->image_name, size, ret);
1748 goto out_err;
1749 }
1750 if (!rbd_dev_ondisk_valid(ondisk)) {
1751 ret = -ENXIO;
1752 pr_warning("invalid header for image %s\n",
1753 rbd_dev->image_name);
1754 goto out_err;
1755 }
1756
1757 names_size = le64_to_cpu(ondisk->snap_names_len);
1758 want_count = snap_count;
1759 snap_count = le32_to_cpu(ondisk->snap_count);
1760 } while (snap_count != want_count);
1761
1762 return ondisk;
1763
1764out_err:
1765 kfree(ondisk);
1766
1767 return ERR_PTR(ret);
1768}
1769
1770/*
1771 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001772 */
1773static int rbd_read_header(struct rbd_device *rbd_dev,
1774 struct rbd_image_header *header)
1775{
Alex Elder4156d992012-08-02 11:29:46 -05001776 struct rbd_image_header_ondisk *ondisk;
1777 u64 ver = 0;
1778 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001779
Alex Elder4156d992012-08-02 11:29:46 -05001780 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1781 if (IS_ERR(ondisk))
1782 return PTR_ERR(ondisk);
1783 ret = rbd_header_from_disk(header, ondisk);
1784 if (ret >= 0)
1785 header->obj_version = ver;
1786 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001787
Alex Elder4156d992012-08-02 11:29:46 -05001788 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001789}
1790
Alex Elder41f38c22012-10-25 23:34:40 -05001791static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001792{
1793 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001794 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001795
Alex Eldera0593292012-07-19 09:09:27 -05001796 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001797 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001798}
1799
Alex Elder94785542012-10-09 13:50:17 -07001800static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1801{
1802 sector_t size;
1803
Alex Elder971f8392012-10-25 23:34:41 -05001804 if (rbd_dev->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001805 return;
1806
1807 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1808 dout("setting size to %llu sectors", (unsigned long long) size);
1809 rbd_dev->mapping.size = (u64) size;
1810 set_capacity(rbd_dev->disk, size);
1811}
1812
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001813/*
1814 * only read the first part of the ondisk header, without the snaps info
1815 */
Alex Elder117973f2012-08-31 17:29:55 -05001816static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001817{
1818 int ret;
1819 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001820
1821 ret = rbd_read_header(rbd_dev, &h);
1822 if (ret < 0)
1823 return ret;
1824
Josh Durgina51aa0c2011-12-05 10:35:04 -08001825 down_write(&rbd_dev->header_rwsem);
1826
Alex Elder94785542012-10-09 13:50:17 -07001827 /* Update image size, and check for resize of mapped image */
1828 rbd_dev->header.image_size = h.image_size;
1829 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001830
Alex Elder849b4262012-07-09 21:04:24 -05001831 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001833 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001834 /* osd requests may still refer to snapc */
1835 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001836
Alex Elderb8136232012-07-25 09:32:41 -05001837 if (hver)
1838 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001839 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001840 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001841 rbd_dev->header.snapc = h.snapc;
1842 rbd_dev->header.snap_names = h.snap_names;
1843 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001844 /* Free the extra copy of the object prefix */
1845 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1846 kfree(h.object_prefix);
1847
Alex Elder304f6802012-08-31 17:29:52 -05001848 ret = rbd_dev_snaps_update(rbd_dev);
1849 if (!ret)
1850 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001851
Josh Durginc6666012011-11-21 17:11:12 -08001852 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001853
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001854 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001855}
1856
Alex Elder117973f2012-08-31 17:29:55 -05001857static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001858{
1859 int ret;
1860
Alex Elder117973f2012-08-31 17:29:55 -05001861 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001862 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001863 if (rbd_dev->image_format == 1)
1864 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1865 else
1866 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001867 mutex_unlock(&ctl_mutex);
1868
1869 return ret;
1870}
1871
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001872static int rbd_init_disk(struct rbd_device *rbd_dev)
1873{
1874 struct gendisk *disk;
1875 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001876 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001877
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001878 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001879 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1880 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001881 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001882
Alex Elderf0f8cef2012-01-29 13:57:44 -06001883 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001884 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001885 disk->major = rbd_dev->major;
1886 disk->first_minor = 0;
1887 disk->fops = &rbd_bd_ops;
1888 disk->private_data = rbd_dev;
1889
1890 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001891 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1892 if (!q)
1893 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001894
Alex Elder593a9e72012-02-07 12:03:37 -06001895 /* We use the default size, but let's be explicit about it. */
1896 blk_queue_physical_block_size(q, SECTOR_SIZE);
1897
Josh Durgin029bcbd2011-07-22 11:35:23 -07001898 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001899 segment_size = rbd_obj_bytes(&rbd_dev->header);
1900 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1901 blk_queue_max_segment_size(q, segment_size);
1902 blk_queue_io_min(q, segment_size);
1903 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001904
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001905 blk_queue_merge_bvec(q, rbd_merge_bvec);
1906 disk->queue = q;
1907
1908 q->queuedata = rbd_dev;
1909
1910 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001911
Alex Elder12f02942012-08-29 17:11:07 -05001912 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1913
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001914 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001915out_disk:
1916 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001917
1918 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001919}
1920
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001921/*
1922 sysfs
1923*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001924
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1929
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001930static ssize_t rbd_size_show(struct device *dev,
1931 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001932{
Alex Elder593a9e72012-02-07 12:03:37 -06001933 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001934 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001935
Josh Durgina51aa0c2011-12-05 10:35:04 -08001936 down_read(&rbd_dev->header_rwsem);
1937 size = get_capacity(rbd_dev->disk);
1938 up_read(&rbd_dev->header_rwsem);
1939
1940 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001941}
1942
Alex Elder34b13182012-07-13 20:35:12 -05001943/*
1944 * Note this shows the features for whatever's mapped, which is not
1945 * necessarily the base image.
1946 */
1947static ssize_t rbd_features_show(struct device *dev,
1948 struct device_attribute *attr, char *buf)
1949{
1950 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1951
1952 return sprintf(buf, "0x%016llx\n",
1953 (unsigned long long) rbd_dev->mapping.features);
1954}
1955
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001956static ssize_t rbd_major_show(struct device *dev,
1957 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001958{
Alex Elder593a9e72012-02-07 12:03:37 -06001959 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001960
1961 return sprintf(buf, "%d\n", rbd_dev->major);
1962}
1963
1964static ssize_t rbd_client_id_show(struct device *dev,
1965 struct device_attribute *attr, char *buf)
1966{
Alex Elder593a9e72012-02-07 12:03:37 -06001967 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968
Alex Elder1dbb4392012-01-24 10:08:37 -06001969 return sprintf(buf, "client%lld\n",
1970 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001971}
1972
1973static ssize_t rbd_pool_show(struct device *dev,
1974 struct device_attribute *attr, char *buf)
1975{
Alex Elder593a9e72012-02-07 12:03:37 -06001976 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001977
1978 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1979}
1980
Alex Elder9bb2f332012-07-12 10:46:35 -05001981static ssize_t rbd_pool_id_show(struct device *dev,
1982 struct device_attribute *attr, char *buf)
1983{
1984 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1985
Alex Elder86992092012-10-25 23:34:41 -05001986 return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05001987}
1988
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001989static ssize_t rbd_name_show(struct device *dev,
1990 struct device_attribute *attr, char *buf)
1991{
Alex Elder593a9e72012-02-07 12:03:37 -06001992 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001993
Alex Elder0bed54d2012-07-03 16:01:18 -05001994 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001995}
1996
Alex Elder589d30e2012-07-10 20:30:11 -05001997static ssize_t rbd_image_id_show(struct device *dev,
1998 struct device_attribute *attr, char *buf)
1999{
2000 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2001
2002 return sprintf(buf, "%s\n", rbd_dev->image_id);
2003}
2004
Alex Elder34b13182012-07-13 20:35:12 -05002005/*
2006 * Shows the name of the currently-mapped snapshot (or
2007 * RBD_SNAP_HEAD_NAME for the base image).
2008 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002009static ssize_t rbd_snap_show(struct device *dev,
2010 struct device_attribute *attr,
2011 char *buf)
2012{
Alex Elder593a9e72012-02-07 12:03:37 -06002013 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002014
Alex Elder971f8392012-10-25 23:34:41 -05002015 return sprintf(buf, "%s\n", rbd_dev->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002016}
2017
2018static ssize_t rbd_image_refresh(struct device *dev,
2019 struct device_attribute *attr,
2020 const char *buf,
2021 size_t size)
2022{
Alex Elder593a9e72012-02-07 12:03:37 -06002023 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002024 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002025
Alex Elder117973f2012-08-31 17:29:55 -05002026 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002027
2028 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002029}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002030
/* Per-device sysfs attributes (documented in sysfs-bus-rbd). */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Intentionally empty: this release callback frees nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2074
2075
2076/*
2077 sysfs - snapshots
2078*/
2079
2080static ssize_t rbd_snap_size_show(struct device *dev,
2081 struct device_attribute *attr,
2082 char *buf)
2083{
2084 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2085
Josh Durgin35915382011-12-05 18:25:13 -08002086 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002087}
2088
2089static ssize_t rbd_snap_id_show(struct device *dev,
2090 struct device_attribute *attr,
2091 char *buf)
2092{
2093 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2094
Josh Durgin35915382011-12-05 18:25:13 -08002095 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002096}
2097
Alex Elder34b13182012-07-13 20:35:12 -05002098static ssize_t rbd_snap_features_show(struct device *dev,
2099 struct device_attribute *attr,
2100 char *buf)
2101{
2102 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2103
2104 return sprintf(buf, "0x%016llx\n",
2105 (unsigned long long) snap->features);
2106}
2107
/* Per-snapshot sysfs attributes. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device-model release callback: frees the rbd_snap and its name. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2139
Alex Elder304f6802012-08-31 17:29:52 -05002140static bool rbd_snap_registered(struct rbd_snap *snap)
2141{
2142 bool ret = snap->dev.type == &rbd_snap_device_type;
2143 bool reg = device_is_registered(&snap->dev);
2144
2145 rbd_assert(!ret ^ reg);
2146
2147 return ret;
2148}
2149
/*
 * Unlink a snapshot from its device's snaps list and, if its sysfs
 * device was registered, unregister it.
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2156
Alex Elder14e70852012-07-19 09:09:27 -05002157static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002158 struct device *parent)
2159{
2160 struct device *dev = &snap->dev;
2161 int ret;
2162
2163 dev->type = &rbd_snap_device_type;
2164 dev->parent = parent;
2165 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002166 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002167 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2168
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002169 ret = device_register(dev);
2170
2171 return ret;
2172}
2173
Alex Elder4e891e02012-07-10 20:30:10 -05002174static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002175 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002176 u64 snap_id, u64 snap_size,
2177 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002178{
Alex Elder4e891e02012-07-10 20:30:10 -05002179 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002180 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002181
2182 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002183 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002184 return ERR_PTR(-ENOMEM);
2185
2186 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002187 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002188 if (!snap->name)
2189 goto err;
2190
Alex Elderc8d18422012-07-10 20:30:11 -05002191 snap->id = snap_id;
2192 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002193 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002194
2195 return snap;
2196
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002197err:
2198 kfree(snap->name);
2199 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002200
2201 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002202}
2203
Alex Eldercd892122012-07-03 16:01:19 -05002204static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2205 u64 *snap_size, u64 *snap_features)
2206{
2207 char *snap_name;
2208
2209 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2210
2211 *snap_size = rbd_dev->header.snap_sizes[which];
2212 *snap_features = 0; /* No features for v1 */
2213
2214 /* Skip over names until we find the one we are looking for */
2215
2216 snap_name = rbd_dev->header.snap_names;
2217 while (which--)
2218 snap_name += strlen(snap_name) + 1;
2219
2220 return snap_name;
2221}
2222
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Packed to match the "get_size" reply: order byte, then size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2256
2257static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2258{
2259 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2260 &rbd_dev->header.obj_order,
2261 &rbd_dev->header.image_size);
2262}
2263
/*
 * Fetch a format 2 image's object name prefix via the rbd
 * "get_object_prefix" class method and record it in the header.
 * Returns 0 or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* The reply is a single length-prefixed (ceph-encoded) string */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2301
/*
 * Fetch the feature bits for a snapshot (or for the base image when
 * snap_id is CEPH_NOSNAP) via the rbd "get_features" class method.
 * Fails with -ENOTSUPP if the image requires incompatible features
 * this client does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse the image if it needs features we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2335
2336static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2337{
2338 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2339 &rbd_dev->header.features);
2340}
2341
Alex Elder6e14b1a2012-07-03 16:01:19 -05002342static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002343{
2344 size_t size;
2345 int ret;
2346 void *reply_buf;
2347 void *p;
2348 void *end;
2349 u64 seq;
2350 u32 snap_count;
2351 struct ceph_snap_context *snapc;
2352 u32 i;
2353
2354 /*
2355 * We'll need room for the seq value (maximum snapshot id),
2356 * snapshot count, and array of that many snapshot ids.
2357 * For now we have a fixed upper limit on the number we're
2358 * prepared to receive.
2359 */
2360 size = sizeof (__le64) + sizeof (__le32) +
2361 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2362 reply_buf = kzalloc(size, GFP_KERNEL);
2363 if (!reply_buf)
2364 return -ENOMEM;
2365
2366 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2367 "rbd", "get_snapcontext",
2368 NULL, 0,
2369 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002370 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002371 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2372 if (ret < 0)
2373 goto out;
2374
2375 ret = -ERANGE;
2376 p = reply_buf;
2377 end = (char *) reply_buf + size;
2378 ceph_decode_64_safe(&p, end, seq, out);
2379 ceph_decode_32_safe(&p, end, snap_count, out);
2380
2381 /*
2382 * Make sure the reported number of snapshot ids wouldn't go
2383 * beyond the end of our buffer. But before checking that,
2384 * make sure the computed size of the snapshot context we
2385 * allocate is representable in a size_t.
2386 */
2387 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2388 / sizeof (u64)) {
2389 ret = -EINVAL;
2390 goto out;
2391 }
2392 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2393 goto out;
2394
2395 size = sizeof (struct ceph_snap_context) +
2396 snap_count * sizeof (snapc->snaps[0]);
2397 snapc = kmalloc(size, GFP_KERNEL);
2398 if (!snapc) {
2399 ret = -ENOMEM;
2400 goto out;
2401 }
2402
2403 atomic_set(&snapc->nref, 1);
2404 snapc->seq = seq;
2405 snapc->num_snaps = snap_count;
2406 for (i = 0; i < snap_count; i++)
2407 snapc->snaps[i] = ceph_decode_64(&p);
2408
2409 rbd_dev->header.snapc = snapc;
2410
2411 dout(" snap context seq = %llu, snap_count = %u\n",
2412 (unsigned long long) seq, (unsigned int) snap_count);
2413
2414out:
2415 kfree(reply_buf);
2416
2417 return 0;
2418}
2419
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002420static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2421{
2422 size_t size;
2423 void *reply_buf;
2424 __le64 snap_id;
2425 int ret;
2426 void *p;
2427 void *end;
2428 size_t snap_name_len;
2429 char *snap_name;
2430
2431 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2432 reply_buf = kmalloc(size, GFP_KERNEL);
2433 if (!reply_buf)
2434 return ERR_PTR(-ENOMEM);
2435
2436 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2437 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2438 "rbd", "get_snapshot_name",
2439 (char *) &snap_id, sizeof (snap_id),
2440 reply_buf, size,
2441 CEPH_OSD_FLAG_READ, NULL);
2442 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2443 if (ret < 0)
2444 goto out;
2445
2446 p = reply_buf;
2447 end = (char *) reply_buf + size;
2448 snap_name_len = 0;
2449 snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
2450 GFP_KERNEL);
2451 if (IS_ERR(snap_name)) {
2452 ret = PTR_ERR(snap_name);
2453 goto out;
2454 } else {
2455 dout(" snap_id 0x%016llx snap_name = %s\n",
2456 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2457 }
2458 kfree(reply_buf);
2459
2460 return snap_name;
2461out:
2462 kfree(reply_buf);
2463
2464 return ERR_PTR(ret);
2465}
2466
2467static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2468 u64 *snap_size, u64 *snap_features)
2469{
2470 __le64 snap_id;
2471 u8 order;
2472 int ret;
2473
2474 snap_id = rbd_dev->header.snapc->snaps[which];
2475 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2476 if (ret)
2477 return ERR_PTR(ret);
2478 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2479 if (ret)
2480 return ERR_PTR(ret);
2481
2482 return rbd_dev_v2_snap_name(rbd_dev, which);
2483}
2484
2485static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2486 u64 *snap_size, u64 *snap_features)
2487{
2488 if (rbd_dev->image_format == 1)
2489 return rbd_dev_v1_snap_info(rbd_dev, which,
2490 snap_size, snap_features);
2491 if (rbd_dev->image_format == 2)
2492 return rbd_dev_v2_snap_info(rbd_dev, which,
2493 snap_size, snap_features);
2494 return ERR_PTR(-EINVAL);
2495}
2496
Alex Elder117973f2012-08-31 17:29:55 -05002497static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2498{
2499 int ret;
2500 __u8 obj_order;
2501
2502 down_write(&rbd_dev->header_rwsem);
2503
2504 /* Grab old order first, to see if it changes */
2505
2506 obj_order = rbd_dev->header.obj_order,
2507 ret = rbd_dev_v2_image_size(rbd_dev);
2508 if (ret)
2509 goto out;
2510 if (rbd_dev->header.obj_order != obj_order) {
2511 ret = -EIO;
2512 goto out;
2513 }
2514 rbd_update_mapping_size(rbd_dev);
2515
2516 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2517 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2518 if (ret)
2519 goto out;
2520 ret = rbd_dev_snaps_update(rbd_dev);
2521 dout("rbd_dev_snaps_update returned %d\n", ret);
2522 if (ret)
2523 goto out;
2524 ret = rbd_dev_snaps_register(rbd_dev);
2525 dout("rbd_dev_snaps_register returned %d\n", ret);
2526out:
2527 up_write(&rbd_dev->header_rwsem);
2528
2529 return ret;
2530}
2531
Alex Elder9d475de2012-07-03 16:01:19 -05002532/*
Alex Elder35938152012-08-02 11:29:46 -05002533 * Scan the rbd device's current snapshot list and compare it to the
2534 * newly-received snapshot context. Remove any existing snapshots
2535 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2537 * And verify there are no changes to snapshots we already know
2538 * about.
2539 *
2540 * Assumes the snapshots in the snapshot context are sorted by
2541 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2542 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002543 */
Alex Elder304f6802012-08-31 17:29:52 -05002544static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002545{
Alex Elder35938152012-08-02 11:29:46 -05002546 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2547 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002548 struct list_head *head = &rbd_dev->snaps;
2549 struct list_head *links = head->next;
2550 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002551
Alex Elder9fcbb802012-08-23 23:48:49 -05002552 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002553 while (index < snap_count || links != head) {
2554 u64 snap_id;
2555 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002556 char *snap_name;
2557 u64 snap_size = 0;
2558 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002559
Alex Elder35938152012-08-02 11:29:46 -05002560 snap_id = index < snap_count ? snapc->snaps[index]
2561 : CEPH_NOSNAP;
2562 snap = links != head ? list_entry(links, struct rbd_snap, node)
2563 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002564 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002565
Alex Elder35938152012-08-02 11:29:46 -05002566 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2567 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002568
Alex Elder35938152012-08-02 11:29:46 -05002569 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002570
Alex Elder971f8392012-10-25 23:34:41 -05002571 if (rbd_dev->snap_id == snap->id)
Alex Elderf84344f2012-08-31 17:29:51 -05002572 rbd_dev->mapping.snap_exists = false;
Alex Elder41f38c22012-10-25 23:34:40 -05002573 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002574 dout("%ssnap id %llu has been removed\n",
Alex Elder971f8392012-10-25 23:34:41 -05002575 rbd_dev->snap_id == snap->id ? "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002576 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002577
Alex Elder35938152012-08-02 11:29:46 -05002578 /* Done with this list entry; advance */
2579
2580 links = next;
2581 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002582 }
Alex Elder35938152012-08-02 11:29:46 -05002583
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002584 snap_name = rbd_dev_snap_info(rbd_dev, index,
2585 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002586 if (IS_ERR(snap_name))
2587 return PTR_ERR(snap_name);
2588
Alex Elder9fcbb802012-08-23 23:48:49 -05002589 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2590 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002591 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2592 struct rbd_snap *new_snap;
2593
2594 /* We haven't seen this snapshot before */
2595
Alex Elderc8d18422012-07-10 20:30:11 -05002596 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002597 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002598 if (IS_ERR(new_snap)) {
2599 int err = PTR_ERR(new_snap);
2600
2601 dout(" failed to add dev, error %d\n", err);
2602
2603 return err;
2604 }
Alex Elder35938152012-08-02 11:29:46 -05002605
2606 /* New goes before existing, or at end of list */
2607
Alex Elder9fcbb802012-08-23 23:48:49 -05002608 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002609 if (snap)
2610 list_add_tail(&new_snap->node, &snap->node);
2611 else
Alex Elder523f3252012-08-30 00:16:37 -05002612 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002613 } else {
2614 /* Already have this one */
2615
Alex Elder9fcbb802012-08-23 23:48:49 -05002616 dout(" already present\n");
2617
Alex Eldercd892122012-07-03 16:01:19 -05002618 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002619 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002620 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002621
2622 /* Done with this list entry; advance */
2623
2624 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002625 }
Alex Elder35938152012-08-02 11:29:46 -05002626
2627 /* Advance to the next entry in the snapshot context */
2628
2629 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002630 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002631 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002632
2633 return 0;
2634}
2635
Alex Elder304f6802012-08-31 17:29:52 -05002636/*
2637 * Scan the list of snapshots and register the devices for any that
2638 * have not already been registered.
2639 */
2640static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2641{
2642 struct rbd_snap *snap;
2643 int ret = 0;
2644
2645 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002646 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2647 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002648
2649 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2650 if (!rbd_snap_registered(snap)) {
2651 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2652 if (ret < 0)
2653 break;
2654 }
2655 }
2656 dout("%s: returning %d\n", __func__, ret);
2657
2658 return ret;
2659}
2660
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002661static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2662{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002663 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002664 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002665
2666 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002667
Alex Eldercd789ab2012-08-30 00:16:38 -05002668 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002669 dev->bus = &rbd_bus_type;
2670 dev->type = &rbd_device_type;
2671 dev->parent = &rbd_root_dev;
2672 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002673 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002674 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002675
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002676 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002677
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002678 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002679}
2680
/* Undo rbd_bus_add_dev(): remove the rbd device from the rbd bus */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2685
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002686static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2687{
2688 int ret, rc;
2689
2690 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002691 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002692 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002693 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002694 if (rc < 0)
2695 return rc;
2696 }
2697 } while (ret == -ERANGE);
2698
2699 return ret;
2700}
2701
/* Highest device id handed out so far; ids start at 1 (0 = none yet) */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002703
2704/*
Alex Elder499afd52012-02-02 08:13:29 -06002705 * Get a unique rbd identifier for the given new rbd_dev, and add
2706 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002707 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() makes the first id handed out 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* The global device list is protected by rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002718
Alex Elder1ddbe942012-01-29 13:57:44 -06002719/*
Alex Elder499afd52012-02-02 08:13:29 -06002720 * Remove an rbd_dev from the global list, and record that its
2721 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002722 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	/* Ids start at 1, so 0 or negative here means a bookkeeping bug */
	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* Deliberately shadows the outer rbd_dev parameter */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
2769
Alex Eldera725f65e2012-02-02 08:13:30 -06002770/*
Alex Eldere28fff262012-02-02 08:13:30 -06002771 * Skips over white space at *buf, and updates *buf to point to the
2772 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002773 * the token (string of non-white space characters) found. Note
2774 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002775 */
/*
 * Advance *buf past any leading white space and return the length
 * of the token (maximal run of non-space characters) that follows.
 * *buf must be NUL-terminated; a return of 0 means no token remains.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Characters for which isspace() is nonzero in the "C" and
	 * "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, spaces);	/* skip leading white space */
	*buf = start;

	return strcspn(start, spaces);		/* length of the token */
}
2788
2789/*
2790 * Finds the next token in *buf, and if the provided token buffer is
2791 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002792 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2793 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002794 *
2795 * Returns the length of the token found (not including the '\0').
2796 * Return value will be 0 if no token is found, and it will be >=
2797 * token_size if the token would not fit.
2798 *
Alex Elder593a9e72012-02-07 12:03:37 -06002799 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002800 * found token. Note that this occurs even if the token buffer is
2801 * too small to hold it.
2802 */
/*
 * Find the next token in *buf and, if it fits (with its terminating
 * '\0') in the supplied buffer, copy it there.  Returns the token's
 * length; a result >= token_size means the token was too big and was
 * not copied.  *buf is advanced past the token in either case.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2818
2819/*
Alex Elderea3352f2012-07-09 21:04:23 -05002820 * Finds the next token in *buf, dynamically allocates a buffer big
2821 * enough to hold a copy of it, and copies the token into the new
2822 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2823 * that a duplicate buffer is created even for a zero-length token.
2824 *
2825 * Returns a pointer to the newly-allocated duplicate, or a null
2826 * pointer if memory for the duplicate was not available. If
2827 * the lenp argument is a non-null pointer, the length of the token
2828 * (not including the '\0') is returned in *lenp.
2829 *
2830 * If successful, the *buf pointer will be updated to point beyond
2831 * the end of the found token.
2832 *
2833 * Note: uses GFP_KERNEL for allocation.
2834 */
2835static inline char *dup_token(const char **buf, size_t *lenp)
2836{
2837 char *dup;
2838 size_t len;
2839
2840 len = next_token(buf);
2841 dup = kmalloc(len + 1, GFP_KERNEL);
2842 if (!dup)
2843 return NULL;
2844
2845 memcpy(dup, *buf, len);
2846 *(dup + len) = '\0';
2847 *buf += len;
2848
2849 if (lenp)
2850 *lenp = len;
2851
2852 return dup;
2853}
2854
2855/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002856 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2857 * rbd_md_name, and name fields of the given rbd_dev, based on the
2858 * list of monitor addresses and other options provided via
2859 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2860 * copy of the snapshot name to map if successful, or a
2861 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002862 *
2863 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002864 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	/* 1: monitor address list -- returned by reference, not copied */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* 2: option string, copied into the caller-supplied buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* From here on, failures are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	/* 3: pool name (dynamically allocated) */
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	/* 4: image name (dynamically allocated; length recorded too) */
	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	/* Caller owns the returned snapshot name */
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Unwind any fields we filled in; kfree(NULL) is a no-op */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2927
Alex Elder589d30e2012-07-10 20:30:11 -05002928/*
2929 * An rbd format 2 image has a unique identifier, distinct from the
2930 * name given to it by the user. Internally, that identifier is
2931 * what's used to specify the names of objects related to the image.
2932 *
2933 * A special "rbd id" object is used to map an rbd image name to its
2934 * id. If that object doesn't exist, then there is no v2 rbd image
2935 * with the supplied name.
2936 *
2937 * This function will record the given rbd_dev's image_id field if
2938 * it can be determined, and in that case will return 0. If any
2939 * errors occur a negative errno will be returned and the rbd_dev's
2940 * image_id field will be unchanged (and should be NULL).
2941 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * NOTE(review): the buffer is allocated with room for the
	 * __le32 length prefix plus RBD_IMAGE_ID_LEN_MAX bytes, but
	 * only RBD_IMAGE_ID_LEN_MAX is passed as the reply size below.
	 * Confirm whether the length prefix should be included.
	 */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		/* Propagate the error; image_id stays NULL as documented */
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2997
Alex Eldera30b71b2012-07-10 20:30:11 -05002998static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2999{
3000 int ret;
3001 size_t size;
3002
3003 /* Version 1 images have no id; empty string is used */
3004
3005 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
3006 if (!rbd_dev->image_id)
3007 return -ENOMEM;
3008 rbd_dev->image_id_len = 0;
3009
3010 /* Record the header object name for this rbd image. */
3011
3012 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
3013 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3014 if (!rbd_dev->header_name) {
3015 ret = -ENOMEM;
3016 goto out_err;
3017 }
3018 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
3019
3020 /* Populate rbd image metadata */
3021
3022 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3023 if (ret < 0)
3024 goto out_err;
3025 rbd_dev->image_format = 1;
3026
3027 dout("discovered version 1 image, header name is %s\n",
3028 rbd_dev->header_name);
3029
3030 return 0;
3031
3032out_err:
3033 kfree(rbd_dev->header_name);
3034 rbd_dev->header_name = NULL;
3035 kfree(rbd_dev->image_id);
3036 rbd_dev->image_id = NULL;
3037
3038 return ret;
3039}
3040
/*
 * Probe the device as a format 2 rbd image: derive the header object
 * name from the already-determined image id, then read the image's
 * size/order, object prefix, features, and snapshot context from the
 * header object.  Returns 0 on success or a negative errno,
 * unwinding allocations on failure.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get, and check, the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3102
3103/*
3104 * Probe for the existence of the header object for the given rbd
3105 * device. For format 2 images this includes determining the image
3106 * id.
3107 */
/*
 * Probe the image's header object and initialize format-specific
 * state.  Absence of the image id object marks a format 1 image;
 * otherwise the image is probed as format 2.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret = rbd_dev_image_id(rbd_dev);

	/*
	 * No id object (e.g. ENOENT) means this isn't a format 2
	 * image; treat it as format 1 instead.
	 */
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3127
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003128static ssize_t rbd_add(struct bus_type *bus,
3129 const char *buf,
3130 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003131{
Alex Eldercb8627c2012-07-09 21:04:23 -05003132 char *options;
3133 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06003134 const char *mon_addrs = NULL;
3135 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06003136 struct ceph_osd_client *osdc;
3137 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05003138 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003139
3140 if (!try_module_get(THIS_MODULE))
3141 return -ENODEV;
3142
Alex Elder27cc2592012-02-02 08:13:30 -06003143 options = kmalloc(count, GFP_KERNEL);
3144 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05003145 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05003146 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3147 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05003148 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003149
3150 /* static rbd_device initialization */
3151 spin_lock_init(&rbd_dev->lock);
3152 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003153 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08003154 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003155
Alex Eldera725f65e2012-02-02 08:13:30 -06003156 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05003157 snap_name = rbd_add_parse_args(rbd_dev, buf,
3158 &mon_addrs, &mon_addrs_size, options, count);
3159 if (IS_ERR(snap_name)) {
3160 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05003161 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05003162 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003163
Alex Elderf8c38922012-08-10 13:12:07 -07003164 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3165 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003166 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003167
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003168 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06003169 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003170 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
3171 if (rc < 0)
3172 goto err_out_client;
Alex Elder86992092012-10-25 23:34:41 -05003173 rbd_dev->pool_id = (u64) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003174
Alex Eldera30b71b2012-07-10 20:30:11 -05003175 rc = rbd_dev_probe(rbd_dev);
3176 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05003177 goto err_out_client;
Alex Elder05fd6f62012-08-29 17:11:07 -05003178
3179 /* no need to lock here, as rbd_dev is not registered yet */
3180 rc = rbd_dev_snaps_update(rbd_dev);
3181 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003182 goto err_out_probe;
Alex Elder05fd6f62012-08-29 17:11:07 -05003183
3184 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
3185 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003186 goto err_out_snaps;
Alex Elder05fd6f62012-08-29 17:11:07 -05003187
Alex Elder85ae8922012-07-26 23:37:14 -05003188 /* generate unique id: find highest unique id, add one */
3189 rbd_dev_id_get(rbd_dev);
3190
3191 /* Fill in the device name, now that we have its id. */
3192 BUILD_BUG_ON(DEV_NAME_LEN
3193 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3194 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3195
3196 /* Get our block major device number. */
3197
Alex Elder27cc2592012-02-02 08:13:30 -06003198 rc = register_blkdev(0, rbd_dev->name);
3199 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003200 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06003201 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003202
Alex Elder0f308a32012-08-29 17:11:07 -05003203 /* Set up the blkdev mapping. */
3204
3205 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003206 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003207 goto err_out_blkdev;
3208
Alex Elder0f308a32012-08-29 17:11:07 -05003209 rc = rbd_bus_add_dev(rbd_dev);
3210 if (rc)
3211 goto err_out_disk;
3212
Alex Elder32eec682012-02-08 16:11:14 -06003213 /*
3214 * At this point cleanup in the event of an error is the job
3215 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06003216 */
Alex Elder2ac4e752012-07-10 20:30:10 -05003217
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003218 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05003219 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003220 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05003221 if (rc)
3222 goto err_out_bus;
3223
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003224 rc = rbd_init_watch_dev(rbd_dev);
3225 if (rc)
3226 goto err_out_bus;
3227
Alex Elder3ee40012012-08-29 17:11:07 -05003228 /* Everything's ready. Announce the disk to the world. */
3229
3230 add_disk(rbd_dev->disk);
3231
3232 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3233 (unsigned long long) rbd_dev->mapping.size);
3234
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003235 return count;
3236
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003237err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003238 /* this will also clean up rest of rbd_dev stuff */
3239
3240 rbd_bus_del_dev(rbd_dev);
3241 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003242 return rc;
3243
Alex Elder0f308a32012-08-29 17:11:07 -05003244err_out_disk:
3245 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003246err_out_blkdev:
3247 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05003248err_out_id:
3249 rbd_dev_id_put(rbd_dev);
Alex Elder41f38c22012-10-25 23:34:40 -05003250err_out_snaps:
3251 rbd_remove_all_snaps(rbd_dev);
3252err_out_probe:
Alex Elder05fd6f62012-08-29 17:11:07 -05003253 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003254err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05003255 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003256 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05003257 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05003258err_out_args:
Alex Elder971f8392012-10-25 23:34:41 -05003259 kfree(rbd_dev->snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05003260 kfree(rbd_dev->image_name);
3261 kfree(rbd_dev->pool_name);
3262err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06003263 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05003264 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06003265
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003266 dout("Error adding device %s\n", buf);
3267 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003268
3269 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003270}
3271
Alex Elderde71a292012-07-03 16:01:19 -05003272static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003273{
3274 struct list_head *tmp;
3275 struct rbd_device *rbd_dev;
3276
Alex Eldere124a822012-01-29 13:57:44 -06003277 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003278 list_for_each(tmp, &rbd_dev_list) {
3279 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003280 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003281 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003282 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003283 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003284 }
Alex Eldere124a822012-01-29 13:57:44 -06003285 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003286 return NULL;
3287}
3288
/*
 * Device-model release callback: tear down an rbd device once its
 * last reference is dropped (triggered by rbd_bus_del_dev()).
 *
 * Teardown order matters: stop watch traffic first, then drop the
 * client, then dismantle the block device, and only then free the
 * header fields and the rbd_dev itself.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before anything else so no
	 * further notifies arrive for this device. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3323
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003324static ssize_t rbd_remove(struct bus_type *bus,
3325 const char *buf,
3326 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003327{
3328 struct rbd_device *rbd_dev = NULL;
3329 int target_id, rc;
3330 unsigned long ul;
3331 int ret = count;
3332
3333 rc = strict_strtoul(buf, 10, &ul);
3334 if (rc)
3335 return rc;
3336
3337 /* convert to int; abort if we lost anything in the conversion */
3338 target_id = (int) ul;
3339 if (target_id != ul)
3340 return -EINVAL;
3341
3342 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3343
3344 rbd_dev = __rbd_get_dev(target_id);
3345 if (!rbd_dev) {
3346 ret = -ENOENT;
3347 goto done;
3348 }
3349
Alex Elder41f38c22012-10-25 23:34:40 -05003350 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003351 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003352
3353done:
3354 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003355
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003356 return ret;
3357}
3358
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003359/*
3360 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003361 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003362 */
3363static int rbd_sysfs_init(void)
3364{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003365 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003366
Alex Elderfed4c142012-02-07 12:03:36 -06003367 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003368 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003369 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003370
Alex Elderfed4c142012-02-07 12:03:36 -06003371 ret = bus_register(&rbd_bus_type);
3372 if (ret < 0)
3373 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003374
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003375 return ret;
3376}
3377
/*
 * Undo rbd_sysfs_init(), in reverse registration order: the bus
 * goes away before the root device it hangs off.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3383
3384int __init rbd_init(void)
3385{
3386 int rc;
3387
3388 rc = rbd_sysfs_init();
3389 if (rc)
3390 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003391 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003392 return 0;
3393}
3394
/*
 * Module exit point: tear down the sysfs interface.  Per-device
 * cleanup happens earlier via rbd_remove()/rbd_dev_release().
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3399
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata, visible via modinfo. */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");