blob: c800047f58350e6072c04deb1b0948b427a560da [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Snapshot sysfs device names are "snap_<name>"; bound by NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

/* Mapping the base image (no snapshot) is represented by this name */
#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64
#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* name prefix for the image's data objects */
	u64 features;		/* feature bit mask (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of the per-object size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size, in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* snapshot names copied from on-disk header */
	u64 *snap_sizes;	/* one size entry per snapshot */

	u64 obj_version;	/* header object version, for refresh checks */
};

struct rbd_options {
	bool	read_only;	/* map the image read-only? */
};
118
119/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600120 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700121 */
122struct rbd_client {
123 struct ceph_client *client;
124 struct kref kref;
125 struct list_head node;
126};
127
128/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600129 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700130 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700131struct rbd_req_status {
132 int done;
133 int rc;
134 u64 bytes;
135};
136
137/*
138 * a collection of requests
139 */
140struct rbd_req_coll {
141 int total;
142 int num_done;
143 struct kref kref;
144 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700145};
146
Alex Elderf0f8cef2012-01-29 13:57:44 -0600147/*
148 * a single io request
149 */
150struct rbd_request {
151 struct request *rq; /* blk layer request */
152 struct bio *bio; /* cloned bio */
153 struct page **pages; /* list of used pages */
154 u64 len;
155 int coll_index;
156 struct rbd_req_coll *coll;
157};
158
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800159struct rbd_snap {
160 struct device dev;
161 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800162 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800163 struct list_head node;
164 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500165 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800166};
167
Alex Elderf84344f2012-08-31 17:29:51 -0500168struct rbd_mapping {
169 char *snap_name;
170 u64 snap_id;
Alex Elder99c1f082012-08-30 14:42:15 -0500171 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500172 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500173 bool snap_exists;
174 bool read_only;
175};
176
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_id;	/* v2 image id (not the name) */
	size_t			image_id_len;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;	/* name of the header object */
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;	/* header watch registration */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* what is currently mapped */

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
218
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700219static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600220
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700221static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600222static DEFINE_SPINLOCK(rbd_dev_list_lock);
223
Alex Elder432b8582012-01-29 13:57:44 -0600224static LIST_HEAD(rbd_client_list); /* clients */
225static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700226
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* Control interface: write to /sys/bus/rbd/{add,remove} to map/unmap */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
248
/* Release callback for rbd_root_dev: nothing to free, it is static */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices registered with sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
257
/*
 * rbd_assert(): BUG() with a readable message when the condition fails.
 * Compiled away to a no-op unless RBD_DEBUG is defined.
 *
 * NOTE(review): the expansion is a bare "if" rather than the usual
 * do { ... } while (0), so it is susceptible to the dangling-else
 * hazard if used unbraced before an "else" -- confirm before reuse.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800270
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700280
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);

/*
 * Block device open.  Rejects writable opens of a read-only mapping,
 * pins the device with a reference for the duration of the open, and
 * propagates the mapping's read-only state to the block device.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
296
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800297static int rbd_release(struct gendisk *disk, fmode_t mode)
298{
299 struct rbd_device *rbd_dev = disk->private_data;
300
301 rbd_put_dev(rbd_dev);
302
303 return 0;
304}
305
/* Block device operations: rbd only implements open and release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
311
312/*
313 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500314 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700315 */
Alex Elderf8c38922012-08-10 13:12:07 -0700316static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317{
318 struct rbd_client *rbdc;
319 int ret = -ENOMEM;
320
321 dout("rbd_client_create\n");
322 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
323 if (!rbdc)
324 goto out_opt;
325
326 kref_init(&rbdc->kref);
327 INIT_LIST_HEAD(&rbdc->node);
328
Alex Elderbc534d862012-01-29 13:57:44 -0600329 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
330
Alex Elder43ae4702012-07-03 16:01:18 -0500331 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600333 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500334 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700335
336 ret = ceph_open_session(rbdc->client);
337 if (ret < 0)
338 goto out_err;
339
Alex Elder432b8582012-01-29 13:57:44 -0600340 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700341 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600342 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700343
Alex Elderbc534d862012-01-29 13:57:44 -0600344 mutex_unlock(&ctl_mutex);
345
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700346 dout("rbd_client_create created %p\n", rbdc);
347 return rbdc;
348
349out_err:
350 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600351out_mutex:
352 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700353 kfree(rbdc);
354out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500355 if (ceph_opts)
356 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400357 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700358}
359
360/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700361 * Find a ceph client with specific addr and configuration. If
362 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700363 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700364static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700365{
366 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700367 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700368
Alex Elder43ae4702012-07-03 16:01:18 -0500369 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700370 return NULL;
371
Alex Elder1f7ba332012-08-10 13:12:07 -0700372 spin_lock(&rbd_client_list_lock);
373 list_for_each_entry(client_node, &rbd_client_list, node) {
374 if (!ceph_compare_options(ceph_opts, client_node->client)) {
375 kref_get(&client_node->kref);
376 found = true;
377 break;
378 }
379 }
380 spin_unlock(&rbd_client_list_lock);
381
382 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700383}
384
/*
 * mount options
 *
 * Tokens are grouped by argument type; the Opt_last_* markers
 * delimit the groups so parse_rbd_opts_token() can classify a
 * token by comparing against them.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
409
/*
 * Callback for ceph_parse_options(): handle one rbd-specific option
 * token, updating the struct rbd_options passed via @private.
 * Returns 0 on success or a negative errno for a bad option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify the token by the Opt_last_* group markers */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Only Boolean tokens are currently defined */
		rbd_assert(false);
		break;
	}
	return 0;
}
450
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client is set (and
 * rbd_dev->mapping.read_only reflects the parsed options); returns 0
 * or a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	/* Initialize all rbd options to the defaults */

	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, &rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	/* Record the parsed rbd options */

	rbd_dev->mapping.read_only = rbd_opts.read_only;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; it now owns the reference,
		 * so our copy of the options is no longer needed */
		ceph_destroy_options(ceph_opts);
	} else {
		rbdc = rbd_client_create(ceph_opts);	/* consumes ceph_opts */
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
489
/*
 * Destroy ceph client
 *
 * kref release callback; takes rbd_client_list_lock itself to unlink
 * the client from the shared-client list before destroying it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
507
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.  Clears rbd_dev->rbd_client so the stale pointer cannot be reused.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
517
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700518/*
519 * Destroy requests collection
520 */
521static void rbd_coll_release(struct kref *kref)
522{
523 struct rbd_req_coll *coll =
524 container_of(kref, struct rbd_req_coll, kref);
525
526 dout("rbd_coll_release %p\n", coll);
527 kfree(coll);
528}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700529
Alex Eldera30b71b2012-07-10 20:30:11 -0500530static bool rbd_image_format_valid(u32 image_format)
531{
532 return image_format == 1 || image_format == 2;
533}
534
/*
 * Sanity-check an on-disk (format 1) image header before translating
 * it into the in-memory form.  Returns false if any field would make
 * later processing unsafe.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
573
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700574/*
575 * Create a new header structure, translate header format from the on-disk
576 * header.
577 */
578static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500579 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580{
Alex Elderccece232012-07-10 20:30:10 -0500581 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500582 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500583 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500584 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585
Alex Elder6a523252012-07-19 17:12:59 -0500586 memset(header, 0, sizeof (*header));
587
Alex Elder103a1502012-08-02 11:29:45 -0500588 snap_count = le32_to_cpu(ondisk->snap_count);
589
Alex Elder58c17b02012-08-23 23:22:06 -0500590 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
591 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500592 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700593 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500594 memcpy(header->object_prefix, ondisk->object_prefix, len);
595 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600596
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500598 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
599
Alex Elder621901d2012-08-23 23:22:06 -0500600 /* Save a copy of the snapshot names */
601
Alex Elderf785cc12012-08-23 23:22:06 -0500602 if (snap_names_len > (u64) SIZE_MAX)
603 return -EIO;
604 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500606 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500607 /*
608 * Note that rbd_dev_v1_header_read() guarantees
609 * the ondisk buffer we're working with has
610 * snap_names_len bytes beyond the end of the
611 * snapshot id array, this memcpy() is safe.
612 */
613 memcpy(header->snap_names, &ondisk->snaps[snap_count],
614 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500615
Alex Elder621901d2012-08-23 23:22:06 -0500616 /* Record each snapshot's size */
617
Alex Elderd2bb24e2012-07-26 23:37:14 -0500618 size = snap_count * sizeof (*header->snap_sizes);
619 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700620 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500621 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500622 for (i = 0; i < snap_count; i++)
623 header->snap_sizes[i] =
624 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700625 } else {
Alex Elderccece232012-07-10 20:30:10 -0500626 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627 header->snap_names = NULL;
628 header->snap_sizes = NULL;
629 }
Alex Elder849b4262012-07-09 21:04:24 -0500630
Alex Elder34b13182012-07-13 20:35:12 -0500631 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 header->obj_order = ondisk->options.order;
633 header->crypt_type = ondisk->options.crypt_type;
634 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500635
Alex Elder621901d2012-08-23 23:22:06 -0500636 /* Allocate and fill in the snapshot context */
637
Alex Elderf84344f2012-08-31 17:29:51 -0500638 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500639 size = sizeof (struct ceph_snap_context);
640 size += snap_count * sizeof (header->snapc->snaps[0]);
641 header->snapc = kzalloc(size, GFP_KERNEL);
642 if (!header->snapc)
643 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700644
645 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500646 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700647 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500648 for (i = 0; i < snap_count; i++)
649 header->snapc->snaps[i] =
650 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651
652 return 0;
653
Alex Elder6a523252012-07-19 17:12:59 -0500654out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500655 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500656 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500658 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500659 kfree(header->object_prefix);
660 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500661
Alex Elder00f1f362012-02-07 12:03:36 -0600662 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663}
664
/*
 * Look up a snapshot by name and, if found, make it the current
 * mapping (id, size, features).  Returns 0 or -ENOENT.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->mapping.snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
682
/*
 * Set up the mapping for either the base image (snap_name is
 * RBD_SNAP_HEAD_NAME) or a named snapshot.  Snapshot mappings are
 * forced read-only.  On success rbd_dev->mapping.snap_name takes
 * ownership of @snap_name.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		rbd_dev->mapping.snap_exists = false;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->mapping.snap_name = snap_name;
done:
	return ret;
}
705
/*
 * Release everything an rbd_image_header owns; pointers are NULLed
 * so a double free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
717
/*
 * Build the object name ("<prefix>.<segment number in hex>") for the
 * segment containing the given image offset.  Returns a kmalloc'd
 * string the caller must free, or NULL on allocation/format failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	/* GFP_NOIO: may be called on the I/O path, avoid recursion */
	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700739
Alex Elder65ccfe22012-08-09 10:33:26 -0700740static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
741{
742 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700743
Alex Elder65ccfe22012-08-09 10:33:26 -0700744 return offset & (segment_size - 1);
745}
746
747static u64 rbd_segment_length(struct rbd_device *rbd_dev,
748 u64 offset, u64 length)
749{
750 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
751
752 offset &= segment_size - 1;
753
Alex Elderaafb2302012-09-06 16:00:54 -0500754 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700755 if (offset + length > segment_size)
756 length = segment_size - offset;
757
758 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700759}
760
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700761static int rbd_get_num_segments(struct rbd_image_header *header,
762 u64 ofs, u64 len)
763{
Alex Elderdf111be2012-08-09 10:33:26 -0700764 u64 start_seg;
765 u64 end_seg;
766
767 if (!len)
768 return 0;
769 if (len - 1 > U64_MAX - ofs)
770 return -ERANGE;
771
772 start_seg = ofs >> header->obj_order;
773 end_seg = (ofs + len - 1) >> header->obj_order;
774
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700775 return end_seg - start_seg + 1;
776}
777
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700778/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700779 * returns the size of an object in the image
780 */
781static u64 rbd_obj_bytes(struct rbd_image_header *header)
782{
783 return 1 << header->obj_order;
784}
785
786/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 * bio helpers
788 */
789
790static void bio_chain_put(struct bio *chain)
791{
792 struct bio *tmp;
793
794 while (chain) {
795 tmp = chain;
796 chain = chain->bi_next;
797 bio_put(tmp);
798 }
799}
800
801/*
802 * zeros a bio chain, starting at specific offset
803 */
804static void zero_bio_chain(struct bio *chain, int start_ofs)
805{
806 struct bio_vec *bv;
807 unsigned long flags;
808 void *buf;
809 int i;
810 int pos = 0;
811
812 while (chain) {
813 bio_for_each_segment(bv, chain, i) {
814 if (pos + bv->bv_len > start_ofs) {
815 int remainder = max(start_ofs - pos, 0);
816 buf = bvec_kmap_irq(bv, &flags);
817 memset(buf + remainder, 0,
818 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200819 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820 }
821 pos += bv->bv_len;
822 }
823
824 chain = chain->bi_next;
825 }
826}
827
828/*
829 * bio_chain_clone - clone a chain of bios up to a certain length.
830 * might return a bio_pair that will need to be released.
831 */
832static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
833 struct bio_pair **bp,
834 int len, gfp_t gfpmask)
835{
Alex Elder542582f2012-08-09 10:33:25 -0700836 struct bio *old_chain = *old;
837 struct bio *new_chain = NULL;
838 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700839 int total = 0;
840
841 if (*bp) {
842 bio_pair_release(*bp);
843 *bp = NULL;
844 }
845
846 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700847 struct bio *tmp;
848
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700849 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
850 if (!tmp)
851 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700852 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700853
854 if (total + old_chain->bi_size > len) {
855 struct bio_pair *bp;
856
857 /*
858 * this split can only happen with a single paged bio,
859 * split_bio will BUG_ON if this is not the case
860 */
861 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500862 "bi_size=%u\n",
863 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700864
865 /* split the bio. We'll release it either in the next
866 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600867 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700868 if (!bp)
869 goto err_out;
870
871 __bio_clone(tmp, &bp->bio1);
872
873 *next = &bp->bio2;
874 } else {
875 __bio_clone(tmp, old_chain);
876 *next = old_chain->bi_next;
877 }
878
879 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700880 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700881 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700882 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700883 else
884 new_chain = tmp;
885 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700886 old_chain = old_chain->bi_next;
887
888 total += tmp->bi_size;
889 }
890
Alex Elderaafb2302012-09-06 16:00:54 -0500891 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700893 *old = old_chain;
894
895 return new_chain;
896
897err_out:
898 dout("bio_chain_clone with err\n");
899 bio_chain_put(new_chain);
900 return NULL;
901}
902
903/*
904 * helpers for osd request op vectors.
905 */
Alex Elder57cfc102012-06-26 12:57:03 -0700906static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
907 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700908{
Alex Elder57cfc102012-06-26 12:57:03 -0700909 struct ceph_osd_req_op *ops;
910
911 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
912 if (!ops)
913 return NULL;
914
915 ops[0].op = opcode;
916
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700917 /*
918 * op extent offset and length will be set later on
919 * in calc_raw_layout()
920 */
Alex Elder57cfc102012-06-26 12:57:03 -0700921 ops[0].payload_len = payload_len;
922
923 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700924}
925
/*
 * Free an op vector previously allocated by rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
930
/*
 * Record completion of segment @index of a request collection and
 * complete, in order, any contiguous run of finished segments
 * starting at the collection's current completion point.
 *
 * With no collection (single-segment request) the whole block
 * request is completed directly.  Each completed slot drops one
 * collection reference (taken per-segment at submission time).
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* blk_end_request() takes the queue lock itself */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* find the contiguous prefix of done segments from num_done on */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	/* complete them in submission order; __blk_end_request() is the
	 * caller-holds-queue-lock variant */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
968
/*
 * Complete the collection slot recorded in @req (request, collection
 * and index were stashed in the rbd_request at submission time).
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
974
/*
 * Send ceph osd request
 *
 * Build and submit one OSD request against @object_name covering
 * [ofs, ofs + len).  Data travels via @bio or @pages (one of them).
 * If @rbd_cb is NULL the call is synchronous: we wait for completion,
 * optionally report the reassert version through @ver, and drop the
 * request reference here.  Otherwise @rbd_cb runs at completion and
 * owns that cleanup.  If @linger_req is non-NULL the request is
 * registered as lingering (used for watch) and returned through it.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still must complete our slot in the collection */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
		(unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	/* context handed to the completion callback via r_priv */
	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() leaves r_oid unterminated if
	 * object_name exactly fills it, and strlen() below would then
	 * overrun -- presumably callers bound name lengths; confirm.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* single-object layout: one stripe, object-sized */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and clean up ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1087
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests submitted by
 * rbd_do_op(): decode the reply, normalize short/missing-object
 * reads by zero-filling the bio chain, complete the collection
 * slot, and release the request resources.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* nonexistent object: a read returns all zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1127
/*
 * Completion callback for requests whose result needs no processing;
 * just drop the osd request reference taken at submission.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1132
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector sized for [ofs, ofs + inbound_size) to back
 * the request, submits it via rbd_do_request() with no callback (so
 * the call waits for completion), and for reads copies up to the
 * returned byte count back into @inbound.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* no callback and no coll: rbd_do_request() waits internally */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* on success ret is the byte count to copy out for reads */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1176
/*
 * Do an asynchronous ceph osd operation
 *
 * Issue the single-segment read or write described by the block
 * request @rq against the rbd object containing image offset @ofs.
 * The caller guarantees (asserted below) that [ofs, ofs + len) does
 * not cross an object boundary.  Completion is reported through
 * rbd_req_cb() into the given collection slot.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		/* writes always target the head and carry the data */
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		/* reads target the mapped snapshot, no snap context */
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;
		snapid = rbd_dev->mapping.snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1241
1242/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001243 * Request sync osd read
1244 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001245static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001246 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001247 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001248 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001249 char *buf,
1250 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001251{
Alex Elder913d2fd2012-06-26 12:57:03 -07001252 struct ceph_osd_req_op *ops;
1253 int ret;
1254
1255 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1256 if (!ops)
1257 return -ENOMEM;
1258
1259 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001260 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001261 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001262 ops, object_name, ofs, len, buf, NULL, ver);
1263 rbd_destroy_ops(ops);
1264
1265 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001266}
1267
/*
 * Acknowledge a watch notification on the header object, echoing
 * back the notify id and the header version we refreshed to.
 * Fire-and-forget: completion goes to rbd_simple_req_cb(), which
 * only drops the request reference.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/* notify_id is stored as-is, not byte-swapped here */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1297
/*
 * Watch event callback for the rbd header object: refresh the
 * device's view of the header, warn on failure, and acknowledge
 * the notification either way so the OSD stops resending it.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack with the refreshed header version */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1317
/*
 * Request sync osd watch
 *
 * Register a watch on the device's header object so header changes
 * (e.g. snapshot operations) trigger rbd_watch_cb().  Creates the
 * osdc event first, then issues a lingering WATCH op whose request
 * is kept in rbd_dev->watch_request.  On failure the event is
 * cancelled and the op vector freed.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	/* lingering request: passed back via &rbd_dev->watch_request */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1361
/*
 * Request sync osd unwatch
 *
 * Tear down the header-object watch set up by rbd_req_sync_watch():
 * issue a WATCH op with flag 0 (unregister) using the existing event
 * cookie, then cancel the osdc event regardless of the op's result.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == unregister the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1391
/*
 * Synchronous osd object method call
 *
 * Invoke @class_name.@method_name on @object_name via a CALL op,
 * sending @outbound (may be empty) as the method's input and, for
 * read-flagged calls, receiving up to @inbound_size bytes of result
 * into @inbound.  Returns the byte count from the sync op, or a
 * negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	/* class/method lengths travel as single bytes on the wire */
	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1444
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001445static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1446{
1447 struct rbd_req_coll *coll =
1448 kzalloc(sizeof(struct rbd_req_coll) +
1449 sizeof(struct rbd_req_status) * num_reqs,
1450 GFP_ATOMIC);
1451
1452 if (!coll)
1453 return NULL;
1454 coll->total = num_reqs;
1455 kref_init(&coll->kref);
1456 return coll;
1457}
1458
/*
 * block device queue callback
 *
 * Request-queue handler: pull block requests, split each into
 * per-object segments, and submit one async OSD op per segment,
 * tracked by an rbd_req_coll so the block request completes in
 * order as segments finish.  Entered with q->queue_lock held (it
 * is dropped around the submission work and retaken before the
 * next blk_fetch_request()).
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock across allocation/submission */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the lifetime of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* num_segs is 0 or a negative errno here */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* one collection ref per submitted segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (bio)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						 ofs, op_size,
						 bio, coll, cur_seg);
			else
				/* clone failed: fail just this segment */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the caller's initial collection reference */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1563
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* NOTE(review): assumes obj_order >= SECTOR_SHIFT (9) -- true for
	 * valid rbd images, so the shift below cannot be negative. */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	/* sectors_per_obj is a power of two, so masking gives the modulus */
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1609
1610static void rbd_free_disk(struct rbd_device *rbd_dev)
1611{
1612 struct gendisk *disk = rbd_dev->disk;
1613
1614 if (!disk)
1615 return;
1616
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001617 if (disk->flags & GENHD_FL_UP)
1618 del_gendisk(disk);
1619 if (disk->queue)
1620 blk_cleanup_queue(disk->queue);
1621 put_disk(disk);
1622}
1623
1624/*
Alex Elder4156d992012-08-02 11:29:46 -05001625 * Read the complete header for the given rbd device.
1626 *
1627 * Returns a pointer to a dynamically-allocated buffer containing
1628 * the complete and validated header. Caller can pass the address
1629 * of a variable that will be filled in with the version of the
1630 * header object at the time it was read.
1631 *
1632 * Returns a pointer-coded errno if a failure occurs.
1633 */
1634static struct rbd_image_header_ondisk *
1635rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1636{
1637 struct rbd_image_header_ondisk *ondisk = NULL;
1638 u32 snap_count = 0;
1639 u64 names_size = 0;
1640 u32 want_count;
1641 int ret;
1642
1643 /*
1644 * The complete header will include an array of its 64-bit
1645 * snapshot ids, followed by the names of those snapshots as
1646 * a contiguous block of NUL-terminated strings. Note that
1647 * the number of snapshots could change by the time we read
1648 * it in, in which case we re-read it.
1649 */
1650 do {
1651 size_t size;
1652
1653 kfree(ondisk);
1654
1655 size = sizeof (*ondisk);
1656 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1657 size += names_size;
1658 ondisk = kmalloc(size, GFP_KERNEL);
1659 if (!ondisk)
1660 return ERR_PTR(-ENOMEM);
1661
1662 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1663 rbd_dev->header_name,
1664 0, size,
1665 (char *) ondisk, version);
1666
1667 if (ret < 0)
1668 goto out_err;
1669 if (WARN_ON((size_t) ret < size)) {
1670 ret = -ENXIO;
1671 pr_warning("short header read for image %s"
1672 " (want %zd got %d)\n",
1673 rbd_dev->image_name, size, ret);
1674 goto out_err;
1675 }
1676 if (!rbd_dev_ondisk_valid(ondisk)) {
1677 ret = -ENXIO;
1678 pr_warning("invalid header for image %s\n",
1679 rbd_dev->image_name);
1680 goto out_err;
1681 }
1682
1683 names_size = le64_to_cpu(ondisk->snap_names_len);
1684 want_count = snap_count;
1685 snap_count = le32_to_cpu(ondisk->snap_count);
1686 } while (snap_count != want_count);
1687
1688 return ondisk;
1689
1690out_err:
1691 kfree(ondisk);
1692
1693 return ERR_PTR(ret);
1694}
1695
1696/*
1697 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001698 */
1699static int rbd_read_header(struct rbd_device *rbd_dev,
1700 struct rbd_image_header *header)
1701{
Alex Elder4156d992012-08-02 11:29:46 -05001702 struct rbd_image_header_ondisk *ondisk;
1703 u64 ver = 0;
1704 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001705
Alex Elder4156d992012-08-02 11:29:46 -05001706 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1707 if (IS_ERR(ondisk))
1708 return PTR_ERR(ondisk);
1709 ret = rbd_header_from_disk(header, ondisk);
1710 if (ret >= 0)
1711 header->obj_version = ver;
1712 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001713
Alex Elder4156d992012-08-02 11:29:46 -05001714 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001715}
1716
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001717static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1718{
1719 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001720 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001721
Alex Eldera0593292012-07-19 09:09:27 -05001722 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001723 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001724}
1725
Alex Elder94785542012-10-09 13:50:17 -07001726static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1727{
1728 sector_t size;
1729
1730 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
1731 return;
1732
1733 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1734 dout("setting size to %llu sectors", (unsigned long long) size);
1735 rbd_dev->mapping.size = (u64) size;
1736 set_capacity(rbd_dev->disk, size);
1737}
1738
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the v1 header and swaps the freshly-read snapshot data into
 * rbd_dev->header under the header rwsem.  Ownership of h.snapc,
 * h.snap_names and h.snap_sizes transfers to rbd_dev->header; the old
 * copies are freed here.  Returns 0 or a negative errno.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	/* NOTE(review): image_size was already stored above; this second
	 * assignment is redundant but harmless. */
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1782
Alex Elder117973f2012-08-31 17:29:55 -05001783static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001784{
1785 int ret;
1786
Alex Elder117973f2012-08-31 17:29:55 -05001787 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001788 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001789 if (rbd_dev->image_format == 1)
1790 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1791 else
1792 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001793 mutex_unlock(&ctl_mutex);
1794
1795 return ret;
1796}
1797
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001798static int rbd_init_disk(struct rbd_device *rbd_dev)
1799{
1800 struct gendisk *disk;
1801 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001802 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001803
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001804 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001805 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1806 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001807 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001808
Alex Elderf0f8cef2012-01-29 13:57:44 -06001809 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001810 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001811 disk->major = rbd_dev->major;
1812 disk->first_minor = 0;
1813 disk->fops = &rbd_bd_ops;
1814 disk->private_data = rbd_dev;
1815
1816 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001817 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1818 if (!q)
1819 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001820
Alex Elder593a9e72012-02-07 12:03:37 -06001821 /* We use the default size, but let's be explicit about it. */
1822 blk_queue_physical_block_size(q, SECTOR_SIZE);
1823
Josh Durgin029bcbd2011-07-22 11:35:23 -07001824 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001825 segment_size = rbd_obj_bytes(&rbd_dev->header);
1826 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1827 blk_queue_max_segment_size(q, segment_size);
1828 blk_queue_io_min(q, segment_size);
1829 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001830
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001831 blk_queue_merge_bvec(q, rbd_merge_bvec);
1832 disk->queue = q;
1833
1834 q->queuedata = rbd_dev;
1835
1836 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001837
Alex Elder12f02942012-08-29 17:11:07 -05001838 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1839
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001841out_disk:
1842 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001843
1844 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001845}
1846
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001847/*
1848 sysfs
1849*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001850
/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1855
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001856static ssize_t rbd_size_show(struct device *dev,
1857 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001858{
Alex Elder593a9e72012-02-07 12:03:37 -06001859 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001860 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001861
Josh Durgina51aa0c2011-12-05 10:35:04 -08001862 down_read(&rbd_dev->header_rwsem);
1863 size = get_capacity(rbd_dev->disk);
1864 up_read(&rbd_dev->header_rwsem);
1865
1866 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001867}
1868
Alex Elder34b13182012-07-13 20:35:12 -05001869/*
1870 * Note this shows the features for whatever's mapped, which is not
1871 * necessarily the base image.
1872 */
1873static ssize_t rbd_features_show(struct device *dev,
1874 struct device_attribute *attr, char *buf)
1875{
1876 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1877
1878 return sprintf(buf, "0x%016llx\n",
1879 (unsigned long long) rbd_dev->mapping.features);
1880}
1881
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001882static ssize_t rbd_major_show(struct device *dev,
1883 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001884{
Alex Elder593a9e72012-02-07 12:03:37 -06001885 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001886
1887 return sprintf(buf, "%d\n", rbd_dev->major);
1888}
1889
1890static ssize_t rbd_client_id_show(struct device *dev,
1891 struct device_attribute *attr, char *buf)
1892{
Alex Elder593a9e72012-02-07 12:03:37 -06001893 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001894
Alex Elder1dbb4392012-01-24 10:08:37 -06001895 return sprintf(buf, "client%lld\n",
1896 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897}
1898
1899static ssize_t rbd_pool_show(struct device *dev,
1900 struct device_attribute *attr, char *buf)
1901{
Alex Elder593a9e72012-02-07 12:03:37 -06001902 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903
1904 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1905}
1906
Alex Elder9bb2f332012-07-12 10:46:35 -05001907static ssize_t rbd_pool_id_show(struct device *dev,
1908 struct device_attribute *attr, char *buf)
1909{
1910 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1911
1912 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1913}
1914
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001915static ssize_t rbd_name_show(struct device *dev,
1916 struct device_attribute *attr, char *buf)
1917{
Alex Elder593a9e72012-02-07 12:03:37 -06001918 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001919
Alex Elder0bed54d2012-07-03 16:01:18 -05001920 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001921}
1922
Alex Elder589d30e2012-07-10 20:30:11 -05001923static ssize_t rbd_image_id_show(struct device *dev,
1924 struct device_attribute *attr, char *buf)
1925{
1926 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1927
1928 return sprintf(buf, "%s\n", rbd_dev->image_id);
1929}
1930
Alex Elder34b13182012-07-13 20:35:12 -05001931/*
1932 * Shows the name of the currently-mapped snapshot (or
1933 * RBD_SNAP_HEAD_NAME for the base image).
1934 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001935static ssize_t rbd_snap_show(struct device *dev,
1936 struct device_attribute *attr,
1937 char *buf)
1938{
Alex Elder593a9e72012-02-07 12:03:37 -06001939 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001940
Alex Elderf84344f2012-08-31 17:29:51 -05001941 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001942}
1943
1944static ssize_t rbd_image_refresh(struct device *dev,
1945 struct device_attribute *attr,
1946 const char *buf,
1947 size_t size)
1948{
Alex Elder593a9e72012-02-07 12:03:37 -06001949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001950 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001951
Alex Elder117973f2012-08-31 17:29:55 -05001952 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001953
1954 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001955}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001956
/* Read-only per-device attributes exposed under /sys/bus/rbd/devices/<id>/ */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
/* "refresh" is write-only: writing to it re-reads image metadata */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing to free here; rbd_device lifetime is managed elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1999};
2000
2001
2002/*
2003 sysfs - snapshots
2004*/
2005
2006static ssize_t rbd_snap_size_show(struct device *dev,
2007 struct device_attribute *attr,
2008 char *buf)
2009{
2010 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2011
Josh Durgin35915382011-12-05 18:25:13 -08002012 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002013}
2014
2015static ssize_t rbd_snap_id_show(struct device *dev,
2016 struct device_attribute *attr,
2017 char *buf)
2018{
2019 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2020
Josh Durgin35915382011-12-05 18:25:13 -08002021 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002022}
2023
Alex Elder34b13182012-07-13 20:35:12 -05002024static ssize_t rbd_snap_features_show(struct device *dev,
2025 struct device_attribute *attr,
2026 char *buf)
2027{
2028 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2029
2030 return sprintf(buf, "0x%016llx\n",
2031 (unsigned long long) snap->features);
2032}
2033
/* Read-only per-snapshot attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2048
2049static void rbd_snap_dev_release(struct device *dev)
2050{
2051 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2052 kfree(snap->name);
2053 kfree(snap);
2054}
2055
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* device_type for snapshot devices; release frees the rbd_snap */
static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2065
Alex Elder304f6802012-08-31 17:29:52 -05002066static bool rbd_snap_registered(struct rbd_snap *snap)
2067{
2068 bool ret = snap->dev.type == &rbd_snap_device_type;
2069 bool reg = device_is_registered(&snap->dev);
2070
2071 rbd_assert(!ret ^ reg);
2072
2073 return ret;
2074}
2075
Alex Elder14e70852012-07-19 09:09:27 -05002076static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002077{
2078 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002079 if (device_is_registered(&snap->dev))
2080 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002081}
2082
Alex Elder14e70852012-07-19 09:09:27 -05002083static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002084 struct device *parent)
2085{
2086 struct device *dev = &snap->dev;
2087 int ret;
2088
2089 dev->type = &rbd_snap_device_type;
2090 dev->parent = parent;
2091 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002092 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002093 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2094
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002095 ret = device_register(dev);
2096
2097 return ret;
2098}
2099
Alex Elder4e891e02012-07-10 20:30:10 -05002100static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002101 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002102 u64 snap_id, u64 snap_size,
2103 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002104{
Alex Elder4e891e02012-07-10 20:30:10 -05002105 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002106 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002107
2108 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002109 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002110 return ERR_PTR(-ENOMEM);
2111
2112 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002113 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002114 if (!snap->name)
2115 goto err;
2116
Alex Elderc8d18422012-07-10 20:30:11 -05002117 snap->id = snap_id;
2118 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002119 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002120
2121 return snap;
2122
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002123err:
2124 kfree(snap->name);
2125 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002126
2127 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002128}
2129
Alex Eldercd892122012-07-03 16:01:19 -05002130static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2131 u64 *snap_size, u64 *snap_features)
2132{
2133 char *snap_name;
2134
2135 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2136
2137 *snap_size = rbd_dev->header.snap_sizes[which];
2138 *snap_features = 0; /* No features for v1 */
2139
2140 /* Skip over names until we find the one we are looking for */
2141
2142 snap_name = rbd_dev->header.snap_names;
2143 while (which--)
2144 snap_name += strlen(snap_name) + 1;
2145
2146 return snap_name;
2147}
2148
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" class method reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2182
/* Fetch object order and size for the base image (CEPH_NOSNAP) */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2189
/*
 * Fetch the object name prefix for a format 2 image via the
 * "get_object_prefix" class method.  On success the decoded string is
 * stored in rbd_dev->header.object_prefix (caller-owned; freed with
 * the header).  Returns 0 or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		/* Don't leave a pointer-coded errno in the header */
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2227
/*
 * Fetch the feature bits for an image snapshot (or for the base image
 * when snap_id is CEPH_NOSNAP) via the "get_features" class method.
 * Returns -ENOTSUPP if the image requires features this driver does
 * not implement (incompat bits outside RBD_FEATURES_ALL).
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" class method reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2261
/* Fetch the feature bits for the base image (CEPH_NOSNAP) */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2267
Alex Elder6e14b1a2012-07-03 16:01:19 -05002268static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002269{
2270 size_t size;
2271 int ret;
2272 void *reply_buf;
2273 void *p;
2274 void *end;
2275 u64 seq;
2276 u32 snap_count;
2277 struct ceph_snap_context *snapc;
2278 u32 i;
2279
2280 /*
2281 * We'll need room for the seq value (maximum snapshot id),
2282 * snapshot count, and array of that many snapshot ids.
2283 * For now we have a fixed upper limit on the number we're
2284 * prepared to receive.
2285 */
2286 size = sizeof (__le64) + sizeof (__le32) +
2287 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2288 reply_buf = kzalloc(size, GFP_KERNEL);
2289 if (!reply_buf)
2290 return -ENOMEM;
2291
2292 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2293 "rbd", "get_snapcontext",
2294 NULL, 0,
2295 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002296 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002297 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2298 if (ret < 0)
2299 goto out;
2300
2301 ret = -ERANGE;
2302 p = reply_buf;
2303 end = (char *) reply_buf + size;
2304 ceph_decode_64_safe(&p, end, seq, out);
2305 ceph_decode_32_safe(&p, end, snap_count, out);
2306
2307 /*
2308 * Make sure the reported number of snapshot ids wouldn't go
2309 * beyond the end of our buffer. But before checking that,
2310 * make sure the computed size of the snapshot context we
2311 * allocate is representable in a size_t.
2312 */
2313 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2314 / sizeof (u64)) {
2315 ret = -EINVAL;
2316 goto out;
2317 }
2318 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2319 goto out;
2320
2321 size = sizeof (struct ceph_snap_context) +
2322 snap_count * sizeof (snapc->snaps[0]);
2323 snapc = kmalloc(size, GFP_KERNEL);
2324 if (!snapc) {
2325 ret = -ENOMEM;
2326 goto out;
2327 }
2328
2329 atomic_set(&snapc->nref, 1);
2330 snapc->seq = seq;
2331 snapc->num_snaps = snap_count;
2332 for (i = 0; i < snap_count; i++)
2333 snapc->snaps[i] = ceph_decode_64(&p);
2334
2335 rbd_dev->header.snapc = snapc;
2336
2337 dout(" snap context seq = %llu, snap_count = %u\n",
2338 (unsigned long long) seq, (unsigned int) snap_count);
2339
2340out:
2341 kfree(reply_buf);
2342
2343 return 0;
2344}
2345
/*
 * Fetch the name of the snapshot at index "which" in the snapshot
 * context via the "get_snapshot_name" class method.  Returns a
 * dynamically-allocated string the caller must free, or a
 * pointer-coded errno.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	size_t snap_name_len;
	char *snap_name;

	/* Reply is a length-prefixed (le32) string */
	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name_len = 0;
	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
				GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2392
2393static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2394 u64 *snap_size, u64 *snap_features)
2395{
2396 __le64 snap_id;
2397 u8 order;
2398 int ret;
2399
2400 snap_id = rbd_dev->header.snapc->snaps[which];
2401 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2402 if (ret)
2403 return ERR_PTR(ret);
2404 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2405 if (ret)
2406 return ERR_PTR(ret);
2407
2408 return rbd_dev_v2_snap_name(rbd_dev, which);
2409}
2410
2411static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2412 u64 *snap_size, u64 *snap_features)
2413{
2414 if (rbd_dev->image_format == 1)
2415 return rbd_dev_v1_snap_info(rbd_dev, which,
2416 snap_size, snap_features);
2417 if (rbd_dev->image_format == 2)
2418 return rbd_dev_v2_snap_info(rbd_dev, which,
2419 snap_size, snap_features);
2420 return ERR_PTR(-EINVAL);
2421}
2422
Alex Elder117973f2012-08-31 17:29:55 -05002423static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2424{
2425 int ret;
2426 __u8 obj_order;
2427
2428 down_write(&rbd_dev->header_rwsem);
2429
2430 /* Grab old order first, to see if it changes */
2431
2432 obj_order = rbd_dev->header.obj_order,
2433 ret = rbd_dev_v2_image_size(rbd_dev);
2434 if (ret)
2435 goto out;
2436 if (rbd_dev->header.obj_order != obj_order) {
2437 ret = -EIO;
2438 goto out;
2439 }
2440 rbd_update_mapping_size(rbd_dev);
2441
2442 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2443 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2444 if (ret)
2445 goto out;
2446 ret = rbd_dev_snaps_update(rbd_dev);
2447 dout("rbd_dev_snaps_update returned %d\n", ret);
2448 if (ret)
2449 goto out;
2450 ret = rbd_dev_snaps_register(rbd_dev);
2451 dout("rbd_dev_snaps_register returned %d\n", ret);
2452out:
2453 up_write(&rbd_dev->header_rwsem);
2454
2455 return ret;
2456}
2457
Alex Elder9d475de2012-07-03 16:01:19 -05002458/*
Alex Elder35938152012-08-02 11:29:46 -05002459 * Scan the rbd device's current snapshot list and compare it to the
2460 * newly-received snapshot context. Remove any existing snapshots
2461 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2463 * And verify there are no changes to snapshots we already know
2464 * about.
2465 *
2466 * Assumes the snapshots in the snapshot context are sorted by
2467 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2468 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002469 */
Alex Elder304f6802012-08-31 17:29:52 -05002470static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002471{
Alex Elder35938152012-08-02 11:29:46 -05002472 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2473 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002474 struct list_head *head = &rbd_dev->snaps;
2475 struct list_head *links = head->next;
2476 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002477
Alex Elder9fcbb802012-08-23 23:48:49 -05002478 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002479 while (index < snap_count || links != head) {
2480 u64 snap_id;
2481 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002482 char *snap_name;
2483 u64 snap_size = 0;
2484 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002485
Alex Elder35938152012-08-02 11:29:46 -05002486 snap_id = index < snap_count ? snapc->snaps[index]
2487 : CEPH_NOSNAP;
2488 snap = links != head ? list_entry(links, struct rbd_snap, node)
2489 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002490 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002491
Alex Elder35938152012-08-02 11:29:46 -05002492 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2493 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002494
Alex Elder35938152012-08-02 11:29:46 -05002495 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002496
Alex Elderf84344f2012-08-31 17:29:51 -05002497 if (rbd_dev->mapping.snap_id == snap->id)
2498 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002499 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002500 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002501 rbd_dev->mapping.snap_id == snap->id ?
2502 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002503 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002504
Alex Elder35938152012-08-02 11:29:46 -05002505 /* Done with this list entry; advance */
2506
2507 links = next;
2508 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002509 }
Alex Elder35938152012-08-02 11:29:46 -05002510
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002511 snap_name = rbd_dev_snap_info(rbd_dev, index,
2512 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002513 if (IS_ERR(snap_name))
2514 return PTR_ERR(snap_name);
2515
Alex Elder9fcbb802012-08-23 23:48:49 -05002516 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2517 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002518 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2519 struct rbd_snap *new_snap;
2520
2521 /* We haven't seen this snapshot before */
2522
Alex Elderc8d18422012-07-10 20:30:11 -05002523 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002524 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002525 if (IS_ERR(new_snap)) {
2526 int err = PTR_ERR(new_snap);
2527
2528 dout(" failed to add dev, error %d\n", err);
2529
2530 return err;
2531 }
Alex Elder35938152012-08-02 11:29:46 -05002532
2533 /* New goes before existing, or at end of list */
2534
Alex Elder9fcbb802012-08-23 23:48:49 -05002535 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002536 if (snap)
2537 list_add_tail(&new_snap->node, &snap->node);
2538 else
Alex Elder523f3252012-08-30 00:16:37 -05002539 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002540 } else {
2541 /* Already have this one */
2542
Alex Elder9fcbb802012-08-23 23:48:49 -05002543 dout(" already present\n");
2544
Alex Eldercd892122012-07-03 16:01:19 -05002545 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002546 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002547 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002548
2549 /* Done with this list entry; advance */
2550
2551 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002552 }
Alex Elder35938152012-08-02 11:29:46 -05002553
2554 /* Advance to the next entry in the snapshot context */
2555
2556 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002557 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002558 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002559
2560 return 0;
2561}
2562
Alex Elder304f6802012-08-31 17:29:52 -05002563/*
2564 * Scan the list of snapshots and register the devices for any that
2565 * have not already been registered.
2566 */
2567static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2568{
2569 struct rbd_snap *snap;
2570 int ret = 0;
2571
2572 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002573 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2574 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002575
2576 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2577 if (!rbd_snap_registered(snap)) {
2578 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2579 if (ret < 0)
2580 break;
2581 }
2582 }
2583 dout("%s: returning %d\n", __func__, ret);
2584
2585 return ret;
2586}
2587
/*
 * Initialize and register the rbd device's embedded struct device
 * on the rbd bus, named after its numeric device id.  Registration
 * is serialized under ctl_mutex.  Returns device_register()'s result.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* called when last reference drops */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2607
/*
 * Remove the rbd device from the rbd bus; final cleanup happens via
 * the device's release callback (rbd_dev_release).
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2612
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002613static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2614{
2615 int ret, rc;
2616
2617 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002618 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002619 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002620 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002621 if (rc < 0)
2622 return rc;
2623 }
2624 } while (ret == -ERANGE);
2625
2626 return ret;
2627}
2628
Alex Eldere2839302012-08-29 17:11:06 -05002629static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002630
2631/*
Alex Elder499afd52012-02-02 08:13:29 -06002632 * Get a unique rbd identifier for the given new rbd_dev, and add
2633 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002634 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment yields a unique id; first id handed out is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* rbd_dev_list is protected by rbd_dev_list_lock */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002645
Alex Elder1ddbe942012-01-29 13:57:44 -06002646/*
Alex Elder499afd52012-02-02 08:13:29 -06002647 * Remove an rbd_dev from the global list, and record that its
2648 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002649 */
Alex Eldere2839302012-08-29 17:11:06 -05002650static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002651{
Alex Elderd184f6b2012-01-29 13:57:44 -06002652 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002653 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002654 int max_id;
2655
Alex Elderaafb2302012-09-06 16:00:54 -05002656 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002657
Alex Eldere2839302012-08-29 17:11:06 -05002658 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2659 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002660 spin_lock(&rbd_dev_list_lock);
2661 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002662
2663 /*
2664 * If the id being "put" is not the current maximum, there
2665 * is nothing special we need to do.
2666 */
Alex Eldere2839302012-08-29 17:11:06 -05002667 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002668 spin_unlock(&rbd_dev_list_lock);
2669 return;
2670 }
2671
2672 /*
2673 * We need to update the current maximum id. Search the
2674 * list to find out what it is. We're more likely to find
2675 * the maximum at the end, so search the list backward.
2676 */
2677 max_id = 0;
2678 list_for_each_prev(tmp, &rbd_dev_list) {
2679 struct rbd_device *rbd_dev;
2680
2681 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07002682 if (rbd_dev->dev_id > max_id)
2683 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002684 }
Alex Elder499afd52012-02-02 08:13:29 -06002685 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002686
Alex Elder1ddbe942012-01-29 13:57:44 -06002687 /*
Alex Eldere2839302012-08-29 17:11:06 -05002688 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002689 * which case it now accurately reflects the new maximum.
2690 * Be careful not to overwrite the maximum value in that
2691 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002692 */
Alex Eldere2839302012-08-29 17:11:06 -05002693 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2694 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002695}
2696
Alex Eldera725f65e2012-02-02 08:13:30 -06002697/*
Alex Eldere28fff262012-02-02 08:13:30 -06002698 * Skips over white space at *buf, and updates *buf to point to the
2699 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002700 * the token (string of non-white space characters) found. Note
2701 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002702 */
/*
 * Advance *buf past any leading white space (the characters for
 * which isspace() is nonzero in the "C"/"POSIX" locales) and return
 * the length of the token that begins there.  *buf must be
 * NUL-terminated.
 */
static inline size_t next_token(const char **buf)
{
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, whitespace);	/* skip leading spaces */
	*buf = start;

	return strcspn(start, whitespace);		/* token length */
}
2715
2716/*
2717 * Finds the next token in *buf, and if the provided token buffer is
2718 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002719 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2720 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002721 *
2722 * Returns the length of the token found (not including the '\0').
2723 * Return value will be 0 if no token is found, and it will be >=
2724 * token_size if the token would not fit.
2725 *
Alex Elder593a9e72012-02-07 12:03:37 -06002726 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002727 * found token. Note that this occurs even if the token buffer is
2728 * too small to hold it.
2729 */
/*
 * Find the next token in *buf and, if it fits (including its
 * terminating '\0'), copy it into the supplied token buffer.  The
 * token length is returned regardless; a return >= token_size means
 * the token was too large and was not copied.  *buf is advanced past
 * the token either way.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const char *spaces = " \f\n\r\t\v";
	size_t len;

	/* Skip white space and measure the token (inlined next_token()) */
	*buf += strspn(*buf, spaces);
	len = strcspn(*buf, spaces);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2745
2746/*
Alex Elderea3352f2012-07-09 21:04:23 -05002747 * Finds the next token in *buf, dynamically allocates a buffer big
2748 * enough to hold a copy of it, and copies the token into the new
2749 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2750 * that a duplicate buffer is created even for a zero-length token.
2751 *
2752 * Returns a pointer to the newly-allocated duplicate, or a null
2753 * pointer if memory for the duplicate was not available. If
2754 * the lenp argument is a non-null pointer, the length of the token
2755 * (not including the '\0') is returned in *lenp.
2756 *
2757 * If successful, the *buf pointer will be updated to point beyond
2758 * the end of the found token.
2759 *
2760 * Note: uses GFP_KERNEL for allocation.
2761 */
2762static inline char *dup_token(const char **buf, size_t *lenp)
2763{
2764 char *dup;
2765 size_t len;
2766
2767 len = next_token(buf);
2768 dup = kmalloc(len + 1, GFP_KERNEL);
2769 if (!dup)
2770 return NULL;
2771
2772 memcpy(dup, *buf, len);
2773 *(dup + len) = '\0';
2774 *buf += len;
2775
2776 if (lenp)
2777 *lenp = len;
2778
2779 return dup;
2780}
2781
2782/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002783 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2784 * rbd_md_name, and name fields of the given rbd_dev, based on the
2785 * list of monitor addresses and other options provided via
2786 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2787 * copy of the snapshot name to map if successful, or a
2788 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002789 *
2790 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002791 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	/* Monitor addresses: returned as a pointer into buf, not copied */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Mount options: copied into the caller-supplied buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* From here on, failures are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	/* Return a caller-owned copy of the snapshot name */
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Undo any fields we populated (kfree(NULL) is a no-op) */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2854
Alex Elder589d30e2012-07-10 20:30:11 -05002855/*
2856 * An rbd format 2 image has a unique identifier, distinct from the
2857 * name given to it by the user. Internally, that identifier is
2858 * what's used to specify the names of objects related to the image.
2859 *
2860 * A special "rbd id" object is used to map an rbd image name to its
2861 * id. If that object doesn't exist, then there is no v2 rbd image
2862 * with the supplied name.
2863 *
2864 * This function will record the given rbd_dev's image_id field if
2865 * it can be determined, and in that case will return 0. If any
2866 * errors occur a negative errno will be returned and the rbd_dev's
2867 * image_id field will be unchanged (and should be NULL).
2868 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* Id object is named "<RBD_ID_PREFIX><image_name>" */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" class method on the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed id string into rbd_dev->image_id */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2924
/*
 * Finish probing a format 1 image: record an empty image id, build
 * the header object name ("<image_name><RBD_SUFFIX>"), and read the
 * on-disk header into rbd_dev->header.  Returns 0 or a negative
 * errno; on failure the fields set here are freed and cleared.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	/* kfree(NULL) is a no-op, so partial setup is safe to unwind */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
2967
/*
 * Finish probing a format 2 image: build the header object name
 * from the image id, then fetch the image's size/order, object
 * prefix, feature mask, and snapshot context from the OSDs.
 * Returns 0 or a negative errno; on failure the fields populated
 * here are freed and cleared.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* kfree(NULL) is a no-op, so partial setup is safe to unwind */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3029
3030/*
3031 * Probe for the existence of the header object for the given rbd
3032 * device. For format 2 images this includes determining the image
3033 * id.
3034 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	if (rbd_dev_image_id(rbd_dev))
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3054
/*
 * Handler for writes to /sys/bus/rbd/add.  Parses the add command
 * ("mon_addrs options pool image [snap]"), connects to the cluster,
 * probes the image, and brings up the block device and its sysfs
 * entries.  Returns "count" on success or a negative errno.
 *
 * Resources are acquired in order and unwound through the chained
 * error labels at the bottom, latest acquisition first.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_out_mem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	/*
	 * On success snap_name is a caller-owned allocation; it is
	 * handed to rbd_dev_set_mapping() below.
	 * NOTE(review): if we fail between here and a successful
	 * rbd_dev_set_mapping(), snap_name appears to leak (the
	 * error labels free mapping.snap_name, which is not yet
	 * set) — verify.
	 */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_out_mem;
	}

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_out_args;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_header;

	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
	if (rc)
		goto err_out_header;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	/*
	 * NOTE(review): "options" is not freed on this success path
	 * (it is freed on every error path) — looks like a leak
	 * unless rbd_get_client() took ownership; verify.
	 */
	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_header:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	/* kfree(NULL) is a no-op for anything not yet allocated */
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
3196
Alex Elderde71a292012-07-03 16:01:19 -05003197static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003198{
3199 struct list_head *tmp;
3200 struct rbd_device *rbd_dev;
3201
Alex Eldere124a822012-01-29 13:57:44 -06003202 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003203 list_for_each(tmp, &rbd_dev_list) {
3204 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003205 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003206 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003207 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003208 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003209 }
Alex Eldere124a822012-01-29 13:57:44 -06003210 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003211 return NULL;
3212}
3213
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003214static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003215{
Alex Elder593a9e72012-02-07 12:03:37 -06003216 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003217
Alex Elder1dbb4392012-01-24 10:08:37 -06003218 if (rbd_dev->watch_request) {
3219 struct ceph_client *client = rbd_dev->rbd_client->client;
3220
3221 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003222 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06003223 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003224 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05003225 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003226
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003227 rbd_put_client(rbd_dev);
3228
3229 /* clean up and free blkdev */
3230 rbd_free_disk(rbd_dev);
3231 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06003232
Alex Elder2ac4e752012-07-10 20:30:10 -05003233 /* release allocated disk header fields */
3234 rbd_header_free(&rbd_dev->header);
3235
Alex Elder32eec682012-02-08 16:11:14 -06003236 /* done with the id, and with the rbd_dev */
Alex Elderf84344f2012-08-31 17:29:51 -05003237 kfree(rbd_dev->mapping.snap_name);
Alex Elder589d30e2012-07-10 20:30:11 -05003238 kfree(rbd_dev->image_id);
Alex Elder0bed54d2012-07-03 16:01:18 -05003239 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05003240 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05003241 kfree(rbd_dev->image_name);
Alex Eldere2839302012-08-29 17:11:06 -05003242 rbd_dev_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003243 kfree(rbd_dev);
3244
3245 /* release module ref */
3246 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003247}
3248
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003249static ssize_t rbd_remove(struct bus_type *bus,
3250 const char *buf,
3251 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003252{
3253 struct rbd_device *rbd_dev = NULL;
3254 int target_id, rc;
3255 unsigned long ul;
3256 int ret = count;
3257
3258 rc = strict_strtoul(buf, 10, &ul);
3259 if (rc)
3260 return rc;
3261
3262 /* convert to int; abort if we lost anything in the conversion */
3263 target_id = (int) ul;
3264 if (target_id != ul)
3265 return -EINVAL;
3266
3267 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3268
3269 rbd_dev = __rbd_get_dev(target_id);
3270 if (!rbd_dev) {
3271 ret = -ENOENT;
3272 goto done;
3273 }
3274
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003275 __rbd_remove_all_snaps(rbd_dev);
3276 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003277
3278done:
3279 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003280
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003281 return ret;
3282}
3283
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003284/*
3285 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003286 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003287 */
3288static int rbd_sysfs_init(void)
3289{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003290 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003291
Alex Elderfed4c142012-02-07 12:03:36 -06003292 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003293 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003294 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003295
Alex Elderfed4c142012-02-07 12:03:36 -06003296 ret = bus_register(&rbd_bus_type);
3297 if (ret < 0)
3298 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003299
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003300 return ret;
3301}
3302
/*
 * Remove the sysfs control files, undoing rbd_sysfs_init() in
 * reverse order: bus first, then the root device it hangs off.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3308
3309int __init rbd_init(void)
3310{
3311 int rc;
3312
3313 rc = rbd_sysfs_init();
3314 if (rc)
3315 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003316 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003317 return 0;
3318}
3319
/*
 * Module exit: tear down the sysfs control interface.  Any mapped
 * devices hold a module reference, so this only runs once they are
 * all gone.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3324
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata. */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");