blob: a29c6d2a49ad6d80c0639bf710b73d9d69a14661 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064#define RBD_MAX_SNAP_NAME_LEN 32
Alex Elder35d489f2012-07-03 16:01:19 -050065#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070066#define RBD_MAX_OPT_LEN 1024
67
68#define RBD_SNAP_HEAD_NAME "-"
69
Alex Elder1e130192012-07-03 16:01:19 -050070#define RBD_IMAGE_ID_LEN_MAX 64
71#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050072
Alex Elderd8891402012-10-09 13:50:17 -070073/* Feature bits */
74
75#define RBD_FEATURE_LAYERING 1
76
77/* Features supported by this (client software) implementation. */
78
79#define RBD_FEATURES_ALL (0)
80
Alex Elder81a89792012-02-02 08:13:30 -060081/*
82 * An RBD device name will be "rbd#", where the "rbd" comes from
83 * RBD_DRV_NAME above, and # is a unique integer identifier.
84 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
85 * enough to hold all possible device names.
86 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070087#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060088#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070089
Alex Eldercc0538b2012-08-10 13:12:07 -070090#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070091
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for rados object names */
	u64 features;		/* feature bits (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of per-object size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size, in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* snapshot names, stored one after another */
	u64 *snap_sizes;	/* one size (bytes) per snapshot */

	u64 obj_version;	/* header version as read from the osd */
};
111
/* Options controlling how an rbd image is mapped (see rbd_opts_tokens) */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
115
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;	/* shared: one ref per rbd device using it */
	struct list_head node;	/* entry in rbd_client_list */
};
124
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* non-zero once this request has completed */
	int rc;			/* completion result */
	u64 bytes;		/* bytes transferred */
};
133
134/*
135 * a collection of requests
136 */
137struct rbd_req_coll {
138 int total;
139 int num_done;
140 struct kref kref;
141 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700142};
143
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
155
/* In-memory record of one snapshot of an rbd image */
struct rbd_snap {
	struct device dev;	/* sysfs device for this snapshot */
	const char *name;
	u64 size;		/* snapshot size, in bytes */
	struct list_head node;	/* entry in rbd_dev->snaps */
	u64 id;
	u64 features;
};
164
/* Describes what is currently mapped: the image head or one snapshot */
struct rbd_mapping {
	char *snap_name;	/* RBD_SNAP_HEAD_NAME when mapping the head */
	u64 snap_id;		/* CEPH_NOSNAP when mapping the head */
	u64 size;		/* size of mapped head/snapshot, bytes */
	u64 features;
	bool snap_exists;	/* false when mapping the head */
	bool read_only;		/* snapshot mappings are always read-only */
};
173
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_options rbd_opts;
	struct rbd_client *rbd_client;	/* shared ceph client connection */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	char *image_id;		/* v2 image id (NUL-terminated) */
	size_t image_id_len;
	char *image_name;
	size_t image_name_len;
	char *header_name;	/* rados object holding the image header */
	char *pool_name;
	int pool_id;

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
216
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700224
Alex Elder304f6802012-08-31 17:29:52 -0500225static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
226static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
227
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800228static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500229static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800230
Alex Elderf0f8cef2012-01-29 13:57:44 -0600231static ssize_t rbd_add(struct bus_type *bus, const char *buf,
232 size_t count);
233static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
234 size_t count);
235
/* sysfs control interface: /sys/bus/rbd/add and /sys/bus/rbd/remove */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
246
static void rbd_root_dev_release(struct device *dev)
{
	/*
	 * Nothing to free: rbd_root_dev is statically allocated, but
	 * the device core insists on a release callback.
	 */
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
255
#ifdef RBD_DEBUG
/*
 * Verbose assertion: reports the failed expression, function and line,
 * then BUG()s.  Compiles away to a no-op unless RBD_DEBUG is defined.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800268
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
273
/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700278
Alex Elder117973f2012-08-31 17:29:55 -0500279static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
280static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700281
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700282static int rbd_open(struct block_device *bdev, fmode_t mode)
283{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600284 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700285
Alex Elderf84344f2012-08-31 17:29:51 -0500286 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700287 return -EROFS;
288
Alex Elder340c7a22012-08-10 13:12:07 -0700289 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500290 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700291
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700292 return 0;
293}
294
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800295static int rbd_release(struct gendisk *disk, fmode_t mode)
296{
297 struct rbd_device *rbd_dev = disk->private_data;
298
299 rbd_put_dev(rbd_dev);
300
301 return 0;
302}
303
/* Block device operations: rbd implements only open and release. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
309
310/*
311 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500312 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700313 */
Alex Elderf8c38922012-08-10 13:12:07 -0700314static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700315{
316 struct rbd_client *rbdc;
317 int ret = -ENOMEM;
318
319 dout("rbd_client_create\n");
320 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
321 if (!rbdc)
322 goto out_opt;
323
324 kref_init(&rbdc->kref);
325 INIT_LIST_HEAD(&rbdc->node);
326
Alex Elderbc534d862012-01-29 13:57:44 -0600327 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
328
Alex Elder43ae4702012-07-03 16:01:18 -0500329 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600331 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500332 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333
334 ret = ceph_open_session(rbdc->client);
335 if (ret < 0)
336 goto out_err;
337
Alex Elder432b8582012-01-29 13:57:44 -0600338 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700339 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600340 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700341
Alex Elderbc534d862012-01-29 13:57:44 -0600342 mutex_unlock(&ctl_mutex);
343
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344 dout("rbd_client_create created %p\n", rbdc);
345 return rbdc;
346
347out_err:
348 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600349out_mutex:
350 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351 kfree(rbdc);
352out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500353 if (ceph_opts)
354 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400355 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356}
357
358/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700359 * Find a ceph client with specific addr and configuration. If
360 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700361 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700362static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700363{
364 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700365 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700366
Alex Elder43ae4702012-07-03 16:01:18 -0500367 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700368 return NULL;
369
Alex Elder1f7ba332012-08-10 13:12:07 -0700370 spin_lock(&rbd_client_list_lock);
371 list_for_each_entry(client_node, &rbd_client_list, node) {
372 if (!ceph_compare_options(ceph_opts, client_node->client)) {
373 kref_get(&client_node->kref);
374 found = true;
375 break;
376 }
377 }
378 spin_unlock(&rbd_client_list_lock);
379
380 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700381}
382
/*
 * mount options
 *
 * Token values are grouped by argument kind; the Opt_last_* markers
 * delimit the groups so parse_rbd_opts_token() can classify a token
 * by comparing against them.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
396
/* Mapping of option strings to tokens for match_token() */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
407
408static int parse_rbd_opts_token(char *c, void *private)
409{
Alex Elder43ae4702012-07-03 16:01:18 -0500410 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700411 substring_t argstr[MAX_OPT_ARGS];
412 int token, intval, ret;
413
Alex Elder43ae4702012-07-03 16:01:18 -0500414 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700415 if (token < 0)
416 return -EINVAL;
417
418 if (token < Opt_last_int) {
419 ret = match_int(&argstr[0], &intval);
420 if (ret < 0) {
421 pr_err("bad mount option arg (not int) "
422 "at '%s'\n", c);
423 return ret;
424 }
425 dout("got int token %d val %d\n", token, intval);
426 } else if (token > Opt_last_int && token < Opt_last_string) {
427 dout("got string token %d val %s\n", token,
428 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700429 } else if (token > Opt_last_string && token < Opt_last_bool) {
430 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700431 } else {
432 dout("got token %d\n", token);
433 }
434
435 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700436 case Opt_read_only:
437 rbd_opts->read_only = true;
438 break;
439 case Opt_read_write:
440 rbd_opts->read_only = false;
441 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700442 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500443 rbd_assert(false);
444 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700445 }
446 return 0;
447}
448
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * On success rbd_dev->rbd_client is set (with a reference held) and 0
 * is returned; otherwise a negative errno.  Ownership of the parsed
 * ceph options passes to rbd_client_find()/rbd_client_create() or is
 * released here when an existing client is reused.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* parse_rbd_opts_token() fills in *rbd_opts as a side effect */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; rbd_client_find() took a ref */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() consumes ceph_opts, even on failure */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
481
/*
 * Destroy ceph client
 *
 * kref release callback: unlink the client from rbd_client_list and
 * free it.  Takes rbd_client_list_lock itself, so the caller of
 * kref_put() must NOT hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
499
500/*
501 * Drop reference to ceph client node. If it's not referenced anymore, release
502 * it.
503 */
504static void rbd_put_client(struct rbd_device *rbd_dev)
505{
506 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
507 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700508}
509
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700510/*
511 * Destroy requests collection
512 */
513static void rbd_coll_release(struct kref *kref)
514{
515 struct rbd_req_coll *coll =
516 container_of(kref, struct rbd_req_coll, kref);
517
518 dout("rbd_coll_release %p\n", coll);
519 kfree(coll);
520}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700521
Alex Eldera30b71b2012-07-10 20:30:11 -0500522static bool rbd_image_format_valid(u32 image_format)
523{
524 return image_format == 1 || image_format == 2;
525}
526
Alex Elder8e94af82012-07-25 09:32:40 -0500527static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
528{
Alex Elder103a1502012-08-02 11:29:45 -0500529 size_t size;
530 u32 snap_count;
531
532 /* The header has to start with the magic rbd header text */
533 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
534 return false;
535
536 /*
537 * The size of a snapshot header has to fit in a size_t, and
538 * that limits the number of snapshots.
539 */
540 snap_count = le32_to_cpu(ondisk->snap_count);
541 size = SIZE_MAX - sizeof (struct ceph_snap_context);
542 if (snap_count > size / sizeof (__le64))
543 return false;
544
545 /*
546 * Not only that, but the size of the entire the snapshot
547 * header must also be representable in a size_t.
548 */
549 size -= snap_count * sizeof (__le64);
550 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
551 return false;
552
553 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500554}
555
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700556/*
557 * Create a new header structure, translate header format from the on-disk
558 * header.
559 */
560static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500561 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700562{
Alex Elderccece232012-07-10 20:30:10 -0500563 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500564 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500565 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500566 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700567
Alex Elder6a523252012-07-19 17:12:59 -0500568 memset(header, 0, sizeof (*header));
569
Alex Elder103a1502012-08-02 11:29:45 -0500570 snap_count = le32_to_cpu(ondisk->snap_count);
571
Alex Elder58c17b02012-08-23 23:22:06 -0500572 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
573 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500574 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500576 memcpy(header->object_prefix, ondisk->object_prefix, len);
577 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600578
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700579 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500580 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
581
Alex Elder621901d2012-08-23 23:22:06 -0500582 /* Save a copy of the snapshot names */
583
Alex Elderf785cc12012-08-23 23:22:06 -0500584 if (snap_names_len > (u64) SIZE_MAX)
585 return -EIO;
586 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500588 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500589 /*
590 * Note that rbd_dev_v1_header_read() guarantees
591 * the ondisk buffer we're working with has
592 * snap_names_len bytes beyond the end of the
593 * snapshot id array, this memcpy() is safe.
594 */
595 memcpy(header->snap_names, &ondisk->snaps[snap_count],
596 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500597
Alex Elder621901d2012-08-23 23:22:06 -0500598 /* Record each snapshot's size */
599
Alex Elderd2bb24e2012-07-26 23:37:14 -0500600 size = snap_count * sizeof (*header->snap_sizes);
601 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500603 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500604 for (i = 0; i < snap_count; i++)
605 header->snap_sizes[i] =
606 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700607 } else {
Alex Elderccece232012-07-10 20:30:10 -0500608 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609 header->snap_names = NULL;
610 header->snap_sizes = NULL;
611 }
Alex Elder849b4262012-07-09 21:04:24 -0500612
Alex Elder34b13182012-07-13 20:35:12 -0500613 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700614 header->obj_order = ondisk->options.order;
615 header->crypt_type = ondisk->options.crypt_type;
616 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500617
Alex Elder621901d2012-08-23 23:22:06 -0500618 /* Allocate and fill in the snapshot context */
619
Alex Elderf84344f2012-08-31 17:29:51 -0500620 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500621 size = sizeof (struct ceph_snap_context);
622 size += snap_count * sizeof (header->snapc->snaps[0]);
623 header->snapc = kzalloc(size, GFP_KERNEL);
624 if (!header->snapc)
625 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700626
627 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500628 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500630 for (i = 0; i < snap_count; i++)
631 header->snapc->snaps[i] =
632 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700633
634 return 0;
635
Alex Elder6a523252012-07-19 17:12:59 -0500636out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500637 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500638 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500640 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500641 kfree(header->object_prefix);
642 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500643
Alex Elder00f1f362012-02-07 12:03:36 -0600644 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700645}
646
Alex Elder8836b992012-08-30 14:42:15 -0500647static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700649
Alex Eldere86924a2012-07-10 20:30:11 -0500650 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600651
Alex Eldere86924a2012-07-10 20:30:11 -0500652 list_for_each_entry(snap, &rbd_dev->snaps, node) {
653 if (!strcmp(snap_name, snap->name)) {
654 rbd_dev->mapping.snap_id = snap->id;
655 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500656 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600657
Alex Eldere86924a2012-07-10 20:30:11 -0500658 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600659 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700660 }
Alex Eldere86924a2012-07-10 20:30:11 -0500661
Alex Elder00f1f362012-02-07 12:03:36 -0600662 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663}
664
Alex Elder5ed16172012-08-29 17:11:07 -0500665static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666{
Alex Elder78dc4472012-07-19 08:49:18 -0500667 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668
Alex Elder4e1105a2012-08-31 17:29:52 -0500669 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800670 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elderf84344f2012-08-31 17:29:51 -0500671 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500672 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500673 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Elderf84344f2012-08-31 17:29:51 -0500674 rbd_dev->mapping.snap_exists = false;
675 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
Alex Eldere86924a2012-07-10 20:30:11 -0500676 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500678 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700679 if (ret < 0)
680 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500681 rbd_dev->mapping.snap_exists = true;
682 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700683 }
Alex Elder4e1105a2012-08-31 17:29:52 -0500684 rbd_dev->mapping.snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700686 return ret;
687}
688
689static void rbd_header_free(struct rbd_image_header *header)
690{
Alex Elder849b4262012-07-09 21:04:24 -0500691 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500692 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700693 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500694 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500695 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500696 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800697 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500698 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699}
700
Alex Elder65ccfe22012-08-09 10:33:26 -0700701static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702{
Alex Elder65ccfe22012-08-09 10:33:26 -0700703 char *name;
704 u64 segment;
705 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700706
Alex Elder65ccfe22012-08-09 10:33:26 -0700707 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
708 if (!name)
709 return NULL;
710 segment = offset >> rbd_dev->header.obj_order;
711 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
712 rbd_dev->header.object_prefix, segment);
713 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
714 pr_err("error formatting segment name for #%llu (%d)\n",
715 segment, ret);
716 kfree(name);
717 name = NULL;
718 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700719
Alex Elder65ccfe22012-08-09 10:33:26 -0700720 return name;
721}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700722
Alex Elder65ccfe22012-08-09 10:33:26 -0700723static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
724{
725 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726
Alex Elder65ccfe22012-08-09 10:33:26 -0700727 return offset & (segment_size - 1);
728}
729
730static u64 rbd_segment_length(struct rbd_device *rbd_dev,
731 u64 offset, u64 length)
732{
733 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
734
735 offset &= segment_size - 1;
736
Alex Elderaafb2302012-09-06 16:00:54 -0500737 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700738 if (offset + length > segment_size)
739 length = segment_size - offset;
740
741 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700742}
743
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700744static int rbd_get_num_segments(struct rbd_image_header *header,
745 u64 ofs, u64 len)
746{
Alex Elderdf111be2012-08-09 10:33:26 -0700747 u64 start_seg;
748 u64 end_seg;
749
750 if (!len)
751 return 0;
752 if (len - 1 > U64_MAX - ofs)
753 return -ERANGE;
754
755 start_seg = ofs >> header->obj_order;
756 end_seg = (ofs + len - 1) >> header->obj_order;
757
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700758 return end_seg - start_seg + 1;
759}
760
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700761/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700762 * returns the size of an object in the image
763 */
764static u64 rbd_obj_bytes(struct rbd_image_header *header)
765{
766 return 1 << header->obj_order;
767}
768
769/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770 * bio helpers
771 */
772
773static void bio_chain_put(struct bio *chain)
774{
775 struct bio *tmp;
776
777 while (chain) {
778 tmp = chain;
779 chain = chain->bi_next;
780 bio_put(tmp);
781 }
782}
783
784/*
785 * zeros a bio chain, starting at specific offset
786 */
787static void zero_bio_chain(struct bio *chain, int start_ofs)
788{
789 struct bio_vec *bv;
790 unsigned long flags;
791 void *buf;
792 int i;
793 int pos = 0;
794
795 while (chain) {
796 bio_for_each_segment(bv, chain, i) {
797 if (pos + bv->bv_len > start_ofs) {
798 int remainder = max(start_ofs - pos, 0);
799 buf = bvec_kmap_irq(bv, &flags);
800 memset(buf + remainder, 0,
801 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200802 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803 }
804 pos += bv->bv_len;
805 }
806
807 chain = chain->bi_next;
808 }
809}
810
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * @old:     in/out - head of the chain to clone; advanced past the
 *           bios consumed by this call.
 * @next:    out - where the caller should resume (either the rest of
 *           the old chain, or the second half of a split bio).
 * @bp:      in/out - any previously returned bio_pair is released on
 *           entry; set to a new pair if a split occurs (NOTE(review):
 *           *bp is only assigned via the caller observing *next — the
 *           inner 'bp' here shadows the parameter; confirm the caller
 *           releases the pair via the pointer it sees in *next).
 * @len:     number of bytes to clone.
 * @gfpmask: allocation flags for the first bio; __GFP_WAIT is dropped
 *           for subsequent ones.
 *
 * Returns the new chain, or NULL on allocation/split failure (any
 * partially built chain is released).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;
	int total = 0;

	/* drop any pair left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			/* this bio crosses the requested length: split it */
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			/* clone only the first half; the second half is
			 * handed back to the caller via *next */
			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			/* whole bio fits: clone it and move on */
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		/* append the clone to the new chain */
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	/* the caller sized len from the chain, so it must match exactly */
	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
885
886/*
887 * helpers for osd request op vectors.
888 */
Alex Elder57cfc102012-06-26 12:57:03 -0700889static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
890 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700891{
Alex Elder57cfc102012-06-26 12:57:03 -0700892 struct ceph_osd_req_op *ops;
893
894 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
895 if (!ops)
896 return NULL;
897
898 ops[0].op = opcode;
899
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700900 /*
901 * op extent offset and length will be set later on
902 * in calc_raw_layout()
903 */
Alex Elder57cfc102012-06-26 12:57:03 -0700904 ops[0].payload_len = payload_len;
905
906 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700907}
908
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op) */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
913
/*
 * Record completion of one request in a collection and complete, in
 * order, every prefix of the collection that has finished.
 *
 * With no collection, the whole block request is completed directly.
 * Completion status for out-of-order finishers is parked in
 * coll->status[] until all earlier entries are done.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* no collection: complete the block request immediately */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes status[] updates and request completion */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* advance max over the contiguous run of finished entries */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	/* complete each newly finished entry in submission order,
	 * dropping its reference on the collection */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
951
/* Complete the collection entry associated with an rbd_request */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
957
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958/*
959 * Send ceph osd request
960 */
/*
 * Build and submit a ceph OSD request for @len bytes of @object_name
 * starting at @ofs, carrying data in either @bio or @pages.
 *
 * If @rbd_cb is NULL the call is synchronous: the request is waited
 * for and released here, and the version is returned via @ver.
 * Otherwise @rbd_cb runs on completion and owns the cleanup.
 * If @linger_req is non-NULL the request is marked lingering and
 * returned to the caller through it.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still report completion to the collection on failure */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
		(unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	/* NOTE(review): the request head is pinned to CEPH_NOSNAP while
	 * the snapid parameter is only passed to ceph_calc_raw_layout()
	 * below — confirm this is intentional */
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/* NOTE(review): strncpy() does not NUL-terminate if object_name
	 * fills r_oid; the strlen() below assumes termination — confirm
	 * object names are always shorter than sizeof(r_oid) */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one RADOS object per stripe: unit == object size, count == 1 */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and release the request */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
			le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1070
1071/*
1072 * Ceph osd op callback
1073 */
/*
 * Completion callback for asynchronous OSD requests submitted by
 * rbd_do_op().  Parses the reply, zero-fills short/missing reads,
 * completes the collection entry, and releases the request and its
 * per-request state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a nonexistent object: return zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the remainder */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1110
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1115
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001116/*
1117 * Do a synchronous ceph osd operation
1118 */
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector for the transfer, issues the request via
 * rbd_do_request() with no callback (so it waits), and for reads
 * copies up to @inbound_size result bytes into @inbound.  Returns
 * bytes transferred or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* no callback and no bio: rbd_do_request() waits for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* copy read data (ret holds the byte count) out of the pages */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1159
1160/*
1161 * Do an asynchronous ceph osd operation
1162 */
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues one read or write (chosen from the block request's data
 * direction) for a byte range that must already lie within a single
 * segment.  Completion is reported through @coll entry @coll_index
 * by rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* choose op, flags and payload size from the data direction */
	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		payload_len = seg_len;
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1221
1222/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001223 * Request sync osd read
1224 */
/*
 * Synchronously read @len bytes at @ofs from @object_name (at
 * snapshot @snapid) into @buf.  Returns bytes read or a negative
 * errno; the object version is returned via @ver.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1247
/*
 * Acknowledge a watch notification (osd notify-ack).
 *
 * Sent asynchronously (rbd_simple_req_cb just drops the request),
 * so the return value only reflects submission.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1277
/*
 * Watch callback: the header object changed (e.g. a resize or a
 * snapshot operation).  Refresh the device state and acknowledge
 * the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so the osd stops resending */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1297
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object so rbd_watch_cb() is
 * invoked on header changes.  Creates the osd event, then issues a
 * lingering WATCH op; on failure the event is cancelled.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* register (vs. unregister) */

	/* lingering request: the osd client keeps it alive for us */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1341
/*
 * Request sync osd unwatch
 *
 * Unregisters the watch set up by rbd_req_sync_watch() and cancels
 * the associated osd event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	/* the event is cancelled even if the unwatch op failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1371
/*
 * Synchronous osd object method call
 *
 * Invokes @class_name.@method_name on @object_name with @outbound as
 * the method's input; up to @inbound_size bytes of the method's
 * output are copied to @inbound.  Returns bytes returned or a
 * negative errno; object version via @ver.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1424
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001425static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1426{
1427 struct rbd_req_coll *coll =
1428 kzalloc(sizeof(struct rbd_req_coll) +
1429 sizeof(struct rbd_req_status) * num_reqs,
1430 GFP_ATOMIC);
1431
1432 if (!coll)
1433 return NULL;
1434 coll->total = num_reqs;
1435 kref_init(&coll->kref);
1436 return coll;
1437}
1438
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001439/*
1440 * block device queue callback
1441 */
/*
 * Request-queue callback: pull block requests off the queue, split
 * each into per-segment bio chains, and submit one async OSD op per
 * segment, tracking overall completion with an rbd_req_coll.
 *
 * Called with q->queue_lock held; the lock is dropped around the
 * submission work and reacquired before the next blk_fetch_request().
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock while we talk to the osd */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snapshot context for the life of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* num_segs is a negative errno here */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* one async OSD op per segment */
		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* record the failure but keep going so the
				 * remaining segments are accounted for */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}

			/* init OSD command: write or read */
			if (do_write)
				(void) rbd_do_op(rq, rbd_dev,
						snapc, CEPH_NOSNAP,
						ofs, op_size, bio,
						coll, cur_seg);
			else
				(void) rbd_do_op(rq, rbd_dev,
						NULL, rbd_dev->mapping.snap_id,
						ofs, op_size, bio,
						coll, cur_seg);
next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the submission reference taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1554
1555/*
1556 * a queue callback. Makes sure that we don't create a bio that spans across
1557 * multiple osd objects. One exception would be with a single page bios,
1558 * which we handle later at bio_chain_clone
1559 */
1560static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1561 struct bio_vec *bvec)
1562{
1563 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001564 unsigned int chunk_sectors;
1565 sector_t sector;
1566 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001567 int max;
1568
Alex Elder593a9e72012-02-07 12:03:37 -06001569 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1570 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1571 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1572
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001573 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001574 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001575 if (max < 0)
1576 max = 0; /* bio_add cannot handle a negative return */
1577 if (max <= bvec->bv_len && bio_sectors == 0)
1578 return bvec->bv_len;
1579 return max;
1580}
1581
1582static void rbd_free_disk(struct rbd_device *rbd_dev)
1583{
1584 struct gendisk *disk = rbd_dev->disk;
1585
1586 if (!disk)
1587 return;
1588
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001589 if (disk->flags & GENHD_FL_UP)
1590 del_gendisk(disk);
1591 if (disk->queue)
1592 blk_cleanup_queue(disk->queue);
1593 put_disk(disk);
1594}
1595
1596/*
Alex Elder4156d992012-08-02 11:29:46 -05001597 * Read the complete header for the given rbd device.
1598 *
1599 * Returns a pointer to a dynamically-allocated buffer containing
1600 * the complete and validated header. Caller can pass the address
1601 * of a variable that will be filled in with the version of the
1602 * header object at the time it was read.
1603 *
1604 * Returns a pointer-coded errno if a failure occurs.
1605 */
1606static struct rbd_image_header_ondisk *
1607rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1608{
1609 struct rbd_image_header_ondisk *ondisk = NULL;
1610 u32 snap_count = 0;
1611 u64 names_size = 0;
1612 u32 want_count;
1613 int ret;
1614
1615 /*
1616 * The complete header will include an array of its 64-bit
1617 * snapshot ids, followed by the names of those snapshots as
1618 * a contiguous block of NUL-terminated strings. Note that
1619 * the number of snapshots could change by the time we read
1620 * it in, in which case we re-read it.
1621 */
1622 do {
1623 size_t size;
1624
1625 kfree(ondisk);
1626
1627 size = sizeof (*ondisk);
1628 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1629 size += names_size;
1630 ondisk = kmalloc(size, GFP_KERNEL);
1631 if (!ondisk)
1632 return ERR_PTR(-ENOMEM);
1633
1634 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1635 rbd_dev->header_name,
1636 0, size,
1637 (char *) ondisk, version);
1638
1639 if (ret < 0)
1640 goto out_err;
1641 if (WARN_ON((size_t) ret < size)) {
1642 ret = -ENXIO;
1643 pr_warning("short header read for image %s"
1644 " (want %zd got %d)\n",
1645 rbd_dev->image_name, size, ret);
1646 goto out_err;
1647 }
1648 if (!rbd_dev_ondisk_valid(ondisk)) {
1649 ret = -ENXIO;
1650 pr_warning("invalid header for image %s\n",
1651 rbd_dev->image_name);
1652 goto out_err;
1653 }
1654
1655 names_size = le64_to_cpu(ondisk->snap_names_len);
1656 want_count = snap_count;
1657 snap_count = le32_to_cpu(ondisk->snap_count);
1658 } while (snap_count != want_count);
1659
1660 return ondisk;
1661
1662out_err:
1663 kfree(ondisk);
1664
1665 return ERR_PTR(ret);
1666}
1667
1668/*
1669 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001670 */
1671static int rbd_read_header(struct rbd_device *rbd_dev,
1672 struct rbd_image_header *header)
1673{
Alex Elder4156d992012-08-02 11:29:46 -05001674 struct rbd_image_header_ondisk *ondisk;
1675 u64 ver = 0;
1676 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001677
Alex Elder4156d992012-08-02 11:29:46 -05001678 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1679 if (IS_ERR(ondisk))
1680 return PTR_ERR(ondisk);
1681 ret = rbd_header_from_disk(header, ondisk);
1682 if (ret >= 0)
1683 header->obj_version = ver;
1684 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001685
Alex Elder4156d992012-08-02 11:29:46 -05001686 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001687}
1688
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001689static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1690{
1691 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001692 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001693
Alex Eldera0593292012-07-19 09:09:27 -05001694 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001695 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001696}
1697
Alex Elder94785542012-10-09 13:50:17 -07001698static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1699{
1700 sector_t size;
1701
1702 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
1703 return;
1704
1705 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1706 dout("setting size to %llu sectors", (unsigned long long) size);
1707 rbd_dev->mapping.size = (u64) size;
1708 set_capacity(rbd_dev->disk, size);
1709}
1710
/*
 * only read the first part of the ondisk header, without the snaps info
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	/* Fetch and parse a fresh copy of the format 1 on-disk header */
	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	/* image_size was already copied above; this repeat is redundant
	 * but harmless */
	rbd_dev->header.image_size = h.image_size;
	/* Take ownership of the newly-read snapshot data; the old data
	 * was freed above */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the in-memory snapshot list with the new context */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1754
Alex Elder117973f2012-08-31 17:29:55 -05001755static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001756{
1757 int ret;
1758
Alex Elder117973f2012-08-31 17:29:55 -05001759 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001760 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001761 if (rbd_dev->image_format == 1)
1762 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1763 else
1764 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001765 mutex_unlock(&ctl_mutex);
1766
1767 return ret;
1768}
1769
/*
 * Allocate and set up the gendisk and request queue for an rbd
 * device.  Returns 0 on success or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	/* Capacity reflects the currently-mapped image or snapshot */
	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1818
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001819/*
1820 sysfs
1821*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001822
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1827
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001828static ssize_t rbd_size_show(struct device *dev,
1829 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001830{
Alex Elder593a9e72012-02-07 12:03:37 -06001831 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001832 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001833
Josh Durgina51aa0c2011-12-05 10:35:04 -08001834 down_read(&rbd_dev->header_rwsem);
1835 size = get_capacity(rbd_dev->disk);
1836 up_read(&rbd_dev->header_rwsem);
1837
1838 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001839}
1840
Alex Elder34b13182012-07-13 20:35:12 -05001841/*
1842 * Note this shows the features for whatever's mapped, which is not
1843 * necessarily the base image.
1844 */
1845static ssize_t rbd_features_show(struct device *dev,
1846 struct device_attribute *attr, char *buf)
1847{
1848 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1849
1850 return sprintf(buf, "0x%016llx\n",
1851 (unsigned long long) rbd_dev->mapping.features);
1852}
1853
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001854static ssize_t rbd_major_show(struct device *dev,
1855 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001856{
Alex Elder593a9e72012-02-07 12:03:37 -06001857 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001858
1859 return sprintf(buf, "%d\n", rbd_dev->major);
1860}
1861
1862static ssize_t rbd_client_id_show(struct device *dev,
1863 struct device_attribute *attr, char *buf)
1864{
Alex Elder593a9e72012-02-07 12:03:37 -06001865 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001866
Alex Elder1dbb4392012-01-24 10:08:37 -06001867 return sprintf(buf, "client%lld\n",
1868 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001869}
1870
1871static ssize_t rbd_pool_show(struct device *dev,
1872 struct device_attribute *attr, char *buf)
1873{
Alex Elder593a9e72012-02-07 12:03:37 -06001874 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001875
1876 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1877}
1878
Alex Elder9bb2f332012-07-12 10:46:35 -05001879static ssize_t rbd_pool_id_show(struct device *dev,
1880 struct device_attribute *attr, char *buf)
1881{
1882 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1883
1884 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1885}
1886
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001887static ssize_t rbd_name_show(struct device *dev,
1888 struct device_attribute *attr, char *buf)
1889{
Alex Elder593a9e72012-02-07 12:03:37 -06001890 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001891
Alex Elder0bed54d2012-07-03 16:01:18 -05001892 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001893}
1894
Alex Elder589d30e2012-07-10 20:30:11 -05001895static ssize_t rbd_image_id_show(struct device *dev,
1896 struct device_attribute *attr, char *buf)
1897{
1898 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1899
1900 return sprintf(buf, "%s\n", rbd_dev->image_id);
1901}
1902
Alex Elder34b13182012-07-13 20:35:12 -05001903/*
1904 * Shows the name of the currently-mapped snapshot (or
1905 * RBD_SNAP_HEAD_NAME for the base image).
1906 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001907static ssize_t rbd_snap_show(struct device *dev,
1908 struct device_attribute *attr,
1909 char *buf)
1910{
Alex Elder593a9e72012-02-07 12:03:37 -06001911 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001912
Alex Elderf84344f2012-08-31 17:29:51 -05001913 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001914}
1915
1916static ssize_t rbd_image_refresh(struct device *dev,
1917 struct device_attribute *attr,
1918 const char *buf,
1919 size_t size)
1920{
Alex Elder593a9e72012-02-07 12:03:37 -06001921 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001922 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001923
Alex Elder117973f2012-08-31 17:29:55 -05001924 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001925
1926 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001927}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001928
/* Per-device sysfs attributes (all read-only except "refresh") */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939
/* Attributes exposed in each mapped device's sysfs directory */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list, as required by device_type.groups */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
1962
/*
 * Device release callback.  Intentionally empty: the struct device is
 * embedded in an rbd_device whose storage is presumably managed
 * elsewhere, not freed on last put -- NOTE(review): confirm.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
1966
/* Device type for mapped rbd images; attaches the attributes above */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1972
1973
1974/*
1975 sysfs - snapshots
1976*/
1977
1978static ssize_t rbd_snap_size_show(struct device *dev,
1979 struct device_attribute *attr,
1980 char *buf)
1981{
1982 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1983
Josh Durgin35915382011-12-05 18:25:13 -08001984 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001985}
1986
1987static ssize_t rbd_snap_id_show(struct device *dev,
1988 struct device_attribute *attr,
1989 char *buf)
1990{
1991 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1992
Josh Durgin35915382011-12-05 18:25:13 -08001993 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001994}
1995
Alex Elder34b13182012-07-13 20:35:12 -05001996static ssize_t rbd_snap_features_show(struct device *dev,
1997 struct device_attribute *attr,
1998 char *buf)
1999{
2000 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2001
2002 return sprintf(buf, "0x%016llx\n",
2003 (unsigned long long) snap->features);
2004}
2005
/* Per-snapshot sysfs attributes (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002009
/* Attributes exposed in each snapshot's sysfs directory */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2020
/* Release callback for a snapshot device: frees the rbd_snap itself. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}
2027
/* NULL-terminated group list, as required by device_type.groups */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Device type for snapshot devices hanging off a mapped image */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2037
Alex Elder304f6802012-08-31 17:29:52 -05002038static bool rbd_snap_registered(struct rbd_snap *snap)
2039{
2040 bool ret = snap->dev.type == &rbd_snap_device_type;
2041 bool reg = device_is_registered(&snap->dev);
2042
2043 rbd_assert(!ret ^ reg);
2044
2045 return ret;
2046}
2047
/*
 * Drop a snapshot from its device's snapshot list and, if it was
 * registered, unregister its sysfs device (which ultimately frees it
 * via rbd_snap_dev_release()).
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2054
Alex Elder14e70852012-07-19 09:09:27 -05002055static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002056 struct device *parent)
2057{
2058 struct device *dev = &snap->dev;
2059 int ret;
2060
2061 dev->type = &rbd_snap_device_type;
2062 dev->parent = parent;
2063 dev->release = rbd_snap_dev_release;
2064 dev_set_name(dev, "snap_%s", snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002065 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2066
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002067 ret = device_register(dev);
2068
2069 return ret;
2070}
2071
Alex Elder4e891e02012-07-10 20:30:10 -05002072static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002073 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002074 u64 snap_id, u64 snap_size,
2075 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002076{
Alex Elder4e891e02012-07-10 20:30:10 -05002077 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002078 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002079
2080 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002081 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002082 return ERR_PTR(-ENOMEM);
2083
2084 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002085 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002086 if (!snap->name)
2087 goto err;
2088
Alex Elderc8d18422012-07-10 20:30:11 -05002089 snap->id = snap_id;
2090 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002091 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002092
2093 return snap;
2094
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002095err:
2096 kfree(snap->name);
2097 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002098
2099 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002100}
2101
Alex Eldercd892122012-07-03 16:01:19 -05002102static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2103 u64 *snap_size, u64 *snap_features)
2104{
2105 char *snap_name;
2106
2107 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2108
2109 *snap_size = rbd_dev->header.snap_sizes[which];
2110 *snap_features = 0; /* No features for v1 */
2111
2112 /* Skip over names until we find the one we are looking for */
2113
2114 snap_name = rbd_dev->header.snap_names;
2115 while (which--)
2116 snap_name += strlen(snap_name) + 1;
2117
2118 return snap_name;
2119}
2120
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" class method reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2154
/* Get the size and object order of the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2161
/*
 * Fetch the object-name prefix for a format 2 image and store a
 * dynamically-allocated copy in rbd_dev->header.object_prefix.
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Reply is a length-prefixed encoded string; copy it out */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2199
/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image if snap_id is CEPH_NOSNAP).  Fails with -ENOTSUPP if the
 * image requires incompatible features this client does not know.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" class method reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse the image if it needs features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;	/* NOTE(review): kernel-internal errno;
					 * -EOPNOTSUPP may be preferable if this
					 * can reach userspace -- confirm */

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2233
/* Get the feature bits of the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2239
Alex Elder6e14b1a2012-07-03 16:01:19 -05002240static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002241{
2242 size_t size;
2243 int ret;
2244 void *reply_buf;
2245 void *p;
2246 void *end;
2247 u64 seq;
2248 u32 snap_count;
2249 struct ceph_snap_context *snapc;
2250 u32 i;
2251
2252 /*
2253 * We'll need room for the seq value (maximum snapshot id),
2254 * snapshot count, and array of that many snapshot ids.
2255 * For now we have a fixed upper limit on the number we're
2256 * prepared to receive.
2257 */
2258 size = sizeof (__le64) + sizeof (__le32) +
2259 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2260 reply_buf = kzalloc(size, GFP_KERNEL);
2261 if (!reply_buf)
2262 return -ENOMEM;
2263
2264 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2265 "rbd", "get_snapcontext",
2266 NULL, 0,
2267 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002268 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002269 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2270 if (ret < 0)
2271 goto out;
2272
2273 ret = -ERANGE;
2274 p = reply_buf;
2275 end = (char *) reply_buf + size;
2276 ceph_decode_64_safe(&p, end, seq, out);
2277 ceph_decode_32_safe(&p, end, snap_count, out);
2278
2279 /*
2280 * Make sure the reported number of snapshot ids wouldn't go
2281 * beyond the end of our buffer. But before checking that,
2282 * make sure the computed size of the snapshot context we
2283 * allocate is representable in a size_t.
2284 */
2285 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2286 / sizeof (u64)) {
2287 ret = -EINVAL;
2288 goto out;
2289 }
2290 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2291 goto out;
2292
2293 size = sizeof (struct ceph_snap_context) +
2294 snap_count * sizeof (snapc->snaps[0]);
2295 snapc = kmalloc(size, GFP_KERNEL);
2296 if (!snapc) {
2297 ret = -ENOMEM;
2298 goto out;
2299 }
2300
2301 atomic_set(&snapc->nref, 1);
2302 snapc->seq = seq;
2303 snapc->num_snaps = snap_count;
2304 for (i = 0; i < snap_count; i++)
2305 snapc->snaps[i] = ceph_decode_64(&p);
2306
2307 rbd_dev->header.snapc = snapc;
2308
2309 dout(" snap context seq = %llu, snap_count = %u\n",
2310 (unsigned long long) seq, (unsigned int) snap_count);
2311
2312out:
2313 kfree(reply_buf);
2314
2315 return 0;
2316}
2317
/*
 * Fetch the name of snapshot "which" (an index into the snapshot
 * context) for a format 2 image.  Returns a dynamically-allocated
 * string the caller must free, or a pointer-coded errno.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	size_t snap_name_len;	/* NOTE(review): filled in but never used */
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is a length-prefixed encoded string */
	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name_len = 0;
	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
						GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2364
2365static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2366 u64 *snap_size, u64 *snap_features)
2367{
2368 __le64 snap_id;
2369 u8 order;
2370 int ret;
2371
2372 snap_id = rbd_dev->header.snapc->snaps[which];
2373 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2374 if (ret)
2375 return ERR_PTR(ret);
2376 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2377 if (ret)
2378 return ERR_PTR(ret);
2379
2380 return rbd_dev_v2_snap_name(rbd_dev, which);
2381}
2382
2383static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2384 u64 *snap_size, u64 *snap_features)
2385{
2386 if (rbd_dev->image_format == 1)
2387 return rbd_dev_v1_snap_info(rbd_dev, which,
2388 snap_size, snap_features);
2389 if (rbd_dev->image_format == 2)
2390 return rbd_dev_v2_snap_info(rbd_dev, which,
2391 snap_size, snap_features);
2392 return ERR_PTR(-EINVAL);
2393}
2394
Alex Elder117973f2012-08-31 17:29:55 -05002395static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2396{
2397 int ret;
2398 __u8 obj_order;
2399
2400 down_write(&rbd_dev->header_rwsem);
2401
2402 /* Grab old order first, to see if it changes */
2403
2404 obj_order = rbd_dev->header.obj_order,
2405 ret = rbd_dev_v2_image_size(rbd_dev);
2406 if (ret)
2407 goto out;
2408 if (rbd_dev->header.obj_order != obj_order) {
2409 ret = -EIO;
2410 goto out;
2411 }
2412 rbd_update_mapping_size(rbd_dev);
2413
2414 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2415 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2416 if (ret)
2417 goto out;
2418 ret = rbd_dev_snaps_update(rbd_dev);
2419 dout("rbd_dev_snaps_update returned %d\n", ret);
2420 if (ret)
2421 goto out;
2422 ret = rbd_dev_snaps_register(rbd_dev);
2423 dout("rbd_dev_snaps_register returned %d\n", ret);
2424out:
2425 up_write(&rbd_dev->header_rwsem);
2426
2427 return ret;
2428}
2429
Alex Elder9d475de2012-07-03 16:01:19 -05002430/*
Alex Elder35938152012-08-02 11:29:46 -05002431 * Scan the rbd device's current snapshot list and compare it to the
2432 * newly-received snapshot context. Remove any existing snapshots
2433 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2435 * And verify there are no changes to snapshots we already know
2436 * about.
2437 *
2438 * Assumes the snapshots in the snapshot context are sorted by
2439 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2440 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002441 */
Alex Elder304f6802012-08-31 17:29:52 -05002442static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002443{
Alex Elder35938152012-08-02 11:29:46 -05002444 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2445 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002446 struct list_head *head = &rbd_dev->snaps;
2447 struct list_head *links = head->next;
2448 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002449
Alex Elder9fcbb802012-08-23 23:48:49 -05002450 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002451 while (index < snap_count || links != head) {
2452 u64 snap_id;
2453 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002454 char *snap_name;
2455 u64 snap_size = 0;
2456 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002457
Alex Elder35938152012-08-02 11:29:46 -05002458 snap_id = index < snap_count ? snapc->snaps[index]
2459 : CEPH_NOSNAP;
2460 snap = links != head ? list_entry(links, struct rbd_snap, node)
2461 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002462 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002463
Alex Elder35938152012-08-02 11:29:46 -05002464 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2465 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002466
Alex Elder35938152012-08-02 11:29:46 -05002467 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002468
Alex Elderf84344f2012-08-31 17:29:51 -05002469 if (rbd_dev->mapping.snap_id == snap->id)
2470 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002471 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002472 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002473 rbd_dev->mapping.snap_id == snap->id ?
2474 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002475 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002476
Alex Elder35938152012-08-02 11:29:46 -05002477 /* Done with this list entry; advance */
2478
2479 links = next;
2480 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002481 }
Alex Elder35938152012-08-02 11:29:46 -05002482
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002483 snap_name = rbd_dev_snap_info(rbd_dev, index,
2484 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002485 if (IS_ERR(snap_name))
2486 return PTR_ERR(snap_name);
2487
Alex Elder9fcbb802012-08-23 23:48:49 -05002488 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2489 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002490 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2491 struct rbd_snap *new_snap;
2492
2493 /* We haven't seen this snapshot before */
2494
Alex Elderc8d18422012-07-10 20:30:11 -05002495 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002496 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002497 if (IS_ERR(new_snap)) {
2498 int err = PTR_ERR(new_snap);
2499
2500 dout(" failed to add dev, error %d\n", err);
2501
2502 return err;
2503 }
Alex Elder35938152012-08-02 11:29:46 -05002504
2505 /* New goes before existing, or at end of list */
2506
Alex Elder9fcbb802012-08-23 23:48:49 -05002507 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002508 if (snap)
2509 list_add_tail(&new_snap->node, &snap->node);
2510 else
Alex Elder523f3252012-08-30 00:16:37 -05002511 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002512 } else {
2513 /* Already have this one */
2514
Alex Elder9fcbb802012-08-23 23:48:49 -05002515 dout(" already present\n");
2516
Alex Eldercd892122012-07-03 16:01:19 -05002517 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002518 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002519 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002520
2521 /* Done with this list entry; advance */
2522
2523 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002524 }
Alex Elder35938152012-08-02 11:29:46 -05002525
2526 /* Advance to the next entry in the snapshot context */
2527
2528 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002529 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002530 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002531
2532 return 0;
2533}
2534
Alex Elder304f6802012-08-31 17:29:52 -05002535/*
2536 * Scan the list of snapshots and register the devices for any that
2537 * have not already been registered.
2538 */
2539static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2540{
2541 struct rbd_snap *snap;
2542 int ret = 0;
2543
2544 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002545 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2546 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002547
2548 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2549 if (!rbd_snap_registered(snap)) {
2550 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2551 if (ret < 0)
2552 break;
2553 }
2554 }
2555 dout("%s: returning %d\n", __func__, ret);
2556
2557 return ret;
2558}
2559
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002560static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2561{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002562 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002563 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002564
2565 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002566
Alex Eldercd789ab2012-08-30 00:16:38 -05002567 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002568 dev->bus = &rbd_bus_type;
2569 dev->type = &rbd_device_type;
2570 dev->parent = &rbd_root_dev;
2571 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002572 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002573 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002574
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002575 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002576
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002577 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002578}
2579
/*
 * Remove an rbd device from the bus.  The remaining teardown is done
 * by the device's release callback (rbd_bus_add_dev() sets it to
 * rbd_dev_release), invoked when the last reference is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2584
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002585static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2586{
2587 int ret, rc;
2588
2589 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002590 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002591 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002592 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002593 if (rc < 0)
2594 return rc;
2595 }
2596 } while (ret == -ERANGE);
2597
2598 return ret;
2599}
2600
Alex Eldere2839302012-08-29 17:11:06 -05002601static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002602
2603/*
Alex Elder499afd52012-02-02 08:13:29 -06002604 * Get a unique rbd identifier for the given new rbd_dev, and add
2605 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002606 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Ids come from a monotonically increasing counter, so the
	 * first id handed out is 1; rbd_dev_id_put() relies on this
	 * counter tracking the current maximum. */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002617
Alex Elder1ddbe942012-01-29 13:57:44 -06002618/*
Alex Elder499afd52012-02-02 08:13:29 -06002619 * Remove an rbd_dev from the global list, and record that its
2620 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002621 */
Alex Eldere2839302012-08-29 17:11:06 -05002622static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002623{
Alex Elderd184f6b2012-01-29 13:57:44 -06002624 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002625 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002626 int max_id;
2627
Alex Elderaafb2302012-09-06 16:00:54 -05002628 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002629
Alex Eldere2839302012-08-29 17:11:06 -05002630 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2631 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002632 spin_lock(&rbd_dev_list_lock);
2633 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002634
2635 /*
2636 * If the id being "put" is not the current maximum, there
2637 * is nothing special we need to do.
2638 */
Alex Eldere2839302012-08-29 17:11:06 -05002639 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002640 spin_unlock(&rbd_dev_list_lock);
2641 return;
2642 }
2643
2644 /*
2645 * We need to update the current maximum id. Search the
2646 * list to find out what it is. We're more likely to find
2647 * the maximum at the end, so search the list backward.
2648 */
2649 max_id = 0;
2650 list_for_each_prev(tmp, &rbd_dev_list) {
2651 struct rbd_device *rbd_dev;
2652
2653 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07002654 if (rbd_dev->dev_id > max_id)
2655 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002656 }
Alex Elder499afd52012-02-02 08:13:29 -06002657 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002658
Alex Elder1ddbe942012-01-29 13:57:44 -06002659 /*
Alex Eldere2839302012-08-29 17:11:06 -05002660 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002661 * which case it now accurately reflects the new maximum.
2662 * Be careful not to overwrite the maximum value in that
2663 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002664 */
Alex Eldere2839302012-08-29 17:11:06 -05002665 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2666 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002667}
2668
Alex Eldera725f65e2012-02-02 08:13:30 -06002669/*
Alex Eldere28fff262012-02-02 08:13:30 -06002670 * Skips over white space at *buf, and updates *buf to point to the
2671 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002672 * the token (string of non-white space characters) found. Note
2673 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002674 */
/*
 * Advance *buf past any leading white space and return the length of
 * the token (run of non-white-space characters) that follows.  *buf
 * is left pointing at the first non-space character (or the
 * terminating '\0').  *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in "C"/"POSIX" */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, whitespace);	/* skip leading space */
	*buf = start;

	return strcspn(start, whitespace);	/* token length */
}
2687
2688/*
2689 * Finds the next token in *buf, and if the provided token buffer is
2690 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002691 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2692 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002693 *
2694 * Returns the length of the token found (not including the '\0').
2695 * Return value will be 0 if no token is found, and it will be >=
2696 * token_size if the token would not fit.
2697 *
Alex Elder593a9e72012-02-07 12:03:37 -06002698 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002699 * found token. Note that this occurs even if the token buffer is
2700 * too small to hold it.
2701 */
/*
 * Find the next token in *buf and, if the supplied buffer is big
 * enough, copy it there with '\0' termination.  Returns the token's
 * length (0 if none found; >= token_size if it would not fit, in
 * which case nothing is copied).  *buf always advances past the
 * token, even when it was too large to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2717
2718/*
Alex Elderea3352f2012-07-09 21:04:23 -05002719 * Finds the next token in *buf, dynamically allocates a buffer big
2720 * enough to hold a copy of it, and copies the token into the new
2721 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2722 * that a duplicate buffer is created even for a zero-length token.
2723 *
2724 * Returns a pointer to the newly-allocated duplicate, or a null
2725 * pointer if memory for the duplicate was not available. If
2726 * the lenp argument is a non-null pointer, the length of the token
2727 * (not including the '\0') is returned in *lenp.
2728 *
2729 * If successful, the *buf pointer will be updated to point beyond
2730 * the end of the found token.
2731 *
2732 * Note: uses GFP_KERNEL for allocation.
2733 */
2734static inline char *dup_token(const char **buf, size_t *lenp)
2735{
2736 char *dup;
2737 size_t len;
2738
2739 len = next_token(buf);
2740 dup = kmalloc(len + 1, GFP_KERNEL);
2741 if (!dup)
2742 return NULL;
2743
2744 memcpy(dup, *buf, len);
2745 *(dup + len) = '\0';
2746 *buf += len;
2747
2748 if (lenp)
2749 *lenp = len;
2750
2751 return dup;
2752}
2753
2754/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002755 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2756 * rbd_md_name, and name fields of the given rbd_dev, based on the
2757 * list of monitor addresses and other options provided via
2758 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2759 * copy of the snapshot name to map if successful, or a
2760 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002761 *
2762 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002763 */
Alex Elder3feeb8942012-08-31 17:29:52 -05002764static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2765 const char *buf,
2766 const char **mon_addrs,
2767 size_t *mon_addrs_size,
2768 char *options,
2769 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002770{
Alex Elderd22f76e2012-07-12 10:46:35 -05002771 size_t len;
Alex Elder3feeb8942012-08-31 17:29:52 -05002772 char *err_ptr = ERR_PTR(-EINVAL);
2773 char *snap_name;
Alex Eldere28fff262012-02-02 08:13:30 -06002774
2775 /* The first four tokens are required */
2776
Alex Elder7ef32142012-02-02 08:13:30 -06002777 len = next_token(&buf);
2778 if (!len)
Alex Elder3feeb8942012-08-31 17:29:52 -05002779 return err_ptr;
Alex Elder5214ecc2012-02-02 08:13:30 -06002780 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002781 *mon_addrs = buf;
2782
2783 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002784
Alex Eldere28fff262012-02-02 08:13:30 -06002785 len = copy_token(&buf, options, options_size);
2786 if (!len || len >= options_size)
Alex Elder3feeb8942012-08-31 17:29:52 -05002787 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002788
Alex Elder3feeb8942012-08-31 17:29:52 -05002789 err_ptr = ERR_PTR(-ENOMEM);
Alex Elderd22f76e2012-07-12 10:46:35 -05002790 rbd_dev->pool_name = dup_token(&buf, NULL);
2791 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002792 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002793
Alex Elder0bed54d2012-07-03 16:01:18 -05002794 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2795 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002796 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002797
Alex Elder3feeb8942012-08-31 17:29:52 -05002798 /* Snapshot name is optional */
2799 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05002800 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05002801 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2802 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elder849b4262012-07-09 21:04:24 -05002803 }
Alex Elder3feeb8942012-08-31 17:29:52 -05002804 snap_name = kmalloc(len + 1, GFP_KERNEL);
2805 if (!snap_name)
2806 goto out_err;
2807 memcpy(snap_name, buf, len);
2808 *(snap_name + len) = '\0';
Alex Eldere28fff262012-02-02 08:13:30 -06002809
Alex Elder3feeb8942012-08-31 17:29:52 -05002810dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2811
2812 return snap_name;
Alex Elderd22f76e2012-07-12 10:46:35 -05002813
2814out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002815 kfree(rbd_dev->image_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002816 rbd_dev->image_name = NULL;
2817 rbd_dev->image_name_len = 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002818 kfree(rbd_dev->pool_name);
2819 rbd_dev->pool_name = NULL;
2820
Alex Elder3feeb8942012-08-31 17:29:52 -05002821 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002822}
2823
Alex Elder589d30e2012-07-10 20:30:11 -05002824/*
2825 * An rbd format 2 image has a unique identifier, distinct from the
2826 * name given to it by the user. Internally, that identifier is
2827 * what's used to specify the names of objects related to the image.
2828 *
2829 * A special "rbd id" object is used to map an rbd image name to its
2830 * id. If that object doesn't exist, then there is no v2 rbd image
2831 * with the supplied name.
2832 *
2833 * This function will record the given rbd_dev's image_id field if
2834 * it can be determined, and in that case will return 0. If any
2835 * errors occur a negative errno will be returned and the rbd_dev's
2836 * image_id field will be unchanged (and should be NULL).
2837 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;	/* "<RBD_ID_PREFIX><image name>" */
	void *response;		/* buffer for the encoded id string */
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	/*
	 * NOTE(review): the buffer is sized sizeof(__le32) +
	 * RBD_IMAGE_ID_LEN_MAX, yet only RBD_IMAGE_ID_LEN_MAX bytes
	 * are offered to rbd_req_sync_exec() and used as the decode
	 * bound below -- presumably the length prefix is meant to fit
	 * within that bound; confirm against
	 * ceph_extract_encoded_string()'s contract.
	 */
	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed id string into a fresh allocation */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* keep the field NULL on error */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2893
Alex Eldera30b71b2012-07-10 20:30:11 -05002894static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2895{
2896 int ret;
2897 size_t size;
2898
2899 /* Version 1 images have no id; empty string is used */
2900
2901 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2902 if (!rbd_dev->image_id)
2903 return -ENOMEM;
2904 rbd_dev->image_id_len = 0;
2905
2906 /* Record the header object name for this rbd image. */
2907
2908 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2909 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2910 if (!rbd_dev->header_name) {
2911 ret = -ENOMEM;
2912 goto out_err;
2913 }
2914 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2915
2916 /* Populate rbd image metadata */
2917
2918 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2919 if (ret < 0)
2920 goto out_err;
2921 rbd_dev->image_format = 1;
2922
2923 dout("discovered version 1 image, header name is %s\n",
2924 rbd_dev->header_name);
2925
2926 return 0;
2927
2928out_err:
2929 kfree(rbd_dev->header_name);
2930 rbd_dev->header_name = NULL;
2931 kfree(rbd_dev->image_id);
2932 rbd_dev->image_id = NULL;
2933
2934 return ret;
2935}
2936
/*
 * Probe the given device as a format 2 rbd image: build the header
 * object name from the (caller-supplied) image id, then fetch the
 * image's size/order, object prefix, features, and snapshot context
 * from its header object.  Returns 0 on success or a negative errno;
 * on failure the fields filled in here are freed and reset to NULL.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
2998
2999/*
3000 * Probe for the existence of the header object for the given rbd
3001 * device. For format 2 images this includes determining the image
3002 * id.
3003 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to read the image's id object first.  Any failure
	 * (typically ENOENT) means there is no format 2 id for this
	 * name, so fall back to probing it as a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3023
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003024static ssize_t rbd_add(struct bus_type *bus,
3025 const char *buf,
3026 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003027{
Alex Eldercb8627c2012-07-09 21:04:23 -05003028 char *options;
3029 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06003030 const char *mon_addrs = NULL;
3031 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06003032 struct ceph_osd_client *osdc;
3033 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05003034 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003035
3036 if (!try_module_get(THIS_MODULE))
3037 return -ENODEV;
3038
Alex Elder27cc2592012-02-02 08:13:30 -06003039 options = kmalloc(count, GFP_KERNEL);
3040 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05003041 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05003042 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3043 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05003044 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003045
3046 /* static rbd_device initialization */
3047 spin_lock_init(&rbd_dev->lock);
3048 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003049 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08003050 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003051
Alex Eldera725f65e2012-02-02 08:13:30 -06003052 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05003053 snap_name = rbd_add_parse_args(rbd_dev, buf,
3054 &mon_addrs, &mon_addrs_size, options, count);
3055 if (IS_ERR(snap_name)) {
3056 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05003057 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05003058 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003059
Alex Elderf8c38922012-08-10 13:12:07 -07003060 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3061 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003062 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003063
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003064 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06003065 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003066 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
3067 if (rc < 0)
3068 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05003069 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003070
Alex Eldera30b71b2012-07-10 20:30:11 -05003071 rc = rbd_dev_probe(rbd_dev);
3072 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05003073 goto err_out_client;
Alex Elder05fd6f62012-08-29 17:11:07 -05003074
3075 /* no need to lock here, as rbd_dev is not registered yet */
3076 rc = rbd_dev_snaps_update(rbd_dev);
3077 if (rc)
3078 goto err_out_header;
3079
3080 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
3081 if (rc)
3082 goto err_out_header;
3083
Alex Elder85ae8922012-07-26 23:37:14 -05003084 /* generate unique id: find highest unique id, add one */
3085 rbd_dev_id_get(rbd_dev);
3086
3087 /* Fill in the device name, now that we have its id. */
3088 BUILD_BUG_ON(DEV_NAME_LEN
3089 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3090 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3091
3092 /* Get our block major device number. */
3093
Alex Elder27cc2592012-02-02 08:13:30 -06003094 rc = register_blkdev(0, rbd_dev->name);
3095 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003096 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06003097 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003098
Alex Elder0f308a32012-08-29 17:11:07 -05003099 /* Set up the blkdev mapping. */
3100
3101 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003102 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003103 goto err_out_blkdev;
3104
Alex Elder0f308a32012-08-29 17:11:07 -05003105 rc = rbd_bus_add_dev(rbd_dev);
3106 if (rc)
3107 goto err_out_disk;
3108
Alex Elder32eec682012-02-08 16:11:14 -06003109 /*
3110 * At this point cleanup in the event of an error is the job
3111 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06003112 */
Alex Elder2ac4e752012-07-10 20:30:10 -05003113
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003114 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05003115 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003116 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05003117 if (rc)
3118 goto err_out_bus;
3119
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003120 rc = rbd_init_watch_dev(rbd_dev);
3121 if (rc)
3122 goto err_out_bus;
3123
Alex Elder3ee40012012-08-29 17:11:07 -05003124 /* Everything's ready. Announce the disk to the world. */
3125
3126 add_disk(rbd_dev->disk);
3127
3128 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3129 (unsigned long long) rbd_dev->mapping.size);
3130
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003131 return count;
3132
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003133err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003134 /* this will also clean up rest of rbd_dev stuff */
3135
3136 rbd_bus_del_dev(rbd_dev);
3137 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003138 return rc;
3139
Alex Elder0f308a32012-08-29 17:11:07 -05003140err_out_disk:
3141 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003142err_out_blkdev:
3143 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05003144err_out_id:
3145 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05003146err_out_header:
3147 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003148err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05003149 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003150 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05003151 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05003152err_out_args:
3153 kfree(rbd_dev->mapping.snap_name);
3154 kfree(rbd_dev->image_name);
3155 kfree(rbd_dev->pool_name);
3156err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06003157 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05003158 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06003159
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003160 dout("Error adding device %s\n", buf);
3161 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003162
3163 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003164}
3165
Alex Elderde71a292012-07-03 16:01:19 -05003166static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003167{
3168 struct list_head *tmp;
3169 struct rbd_device *rbd_dev;
3170
Alex Eldere124a822012-01-29 13:57:44 -06003171 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003172 list_for_each(tmp, &rbd_dev_list) {
3173 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003174 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003175 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003176 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003177 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003178 }
Alex Eldere124a822012-01-29 13:57:44 -06003179 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003180 return NULL;
3181}
3182
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003183static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003184{
Alex Elder593a9e72012-02-07 12:03:37 -06003185 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003186
Alex Elder1dbb4392012-01-24 10:08:37 -06003187 if (rbd_dev->watch_request) {
3188 struct ceph_client *client = rbd_dev->rbd_client->client;
3189
3190 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003191 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06003192 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003193 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05003194 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003195
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003196 rbd_put_client(rbd_dev);
3197
3198 /* clean up and free blkdev */
3199 rbd_free_disk(rbd_dev);
3200 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06003201
Alex Elder2ac4e752012-07-10 20:30:10 -05003202 /* release allocated disk header fields */
3203 rbd_header_free(&rbd_dev->header);
3204
Alex Elder32eec682012-02-08 16:11:14 -06003205 /* done with the id, and with the rbd_dev */
Alex Elderf84344f2012-08-31 17:29:51 -05003206 kfree(rbd_dev->mapping.snap_name);
Alex Elder589d30e2012-07-10 20:30:11 -05003207 kfree(rbd_dev->image_id);
Alex Elder0bed54d2012-07-03 16:01:18 -05003208 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05003209 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05003210 kfree(rbd_dev->image_name);
Alex Eldere2839302012-08-29 17:11:06 -05003211 rbd_dev_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003212 kfree(rbd_dev);
3213
3214 /* release module ref */
3215 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003216}
3217
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003218static ssize_t rbd_remove(struct bus_type *bus,
3219 const char *buf,
3220 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003221{
3222 struct rbd_device *rbd_dev = NULL;
3223 int target_id, rc;
3224 unsigned long ul;
3225 int ret = count;
3226
3227 rc = strict_strtoul(buf, 10, &ul);
3228 if (rc)
3229 return rc;
3230
3231 /* convert to int; abort if we lost anything in the conversion */
3232 target_id = (int) ul;
3233 if (target_id != ul)
3234 return -EINVAL;
3235
3236 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3237
3238 rbd_dev = __rbd_get_dev(target_id);
3239 if (!rbd_dev) {
3240 ret = -ENOENT;
3241 goto done;
3242 }
3243
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003244 __rbd_remove_all_snaps(rbd_dev);
3245 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003246
3247done:
3248 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003249
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003250 return ret;
3251}
3252
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003253/*
3254 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003255 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003256 */
3257static int rbd_sysfs_init(void)
3258{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003259 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003260
Alex Elderfed4c142012-02-07 12:03:36 -06003261 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003262 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003263 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003264
Alex Elderfed4c142012-02-07 12:03:36 -06003265 ret = bus_register(&rbd_bus_type);
3266 if (ret < 0)
3267 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003268
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003269 return ret;
3270}
3271
/*
 * Undo rbd_sysfs_init(): unregister the bus and root device, in the
 * reverse of their registration order.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3277
3278int __init rbd_init(void)
3279{
3280 int rc;
3281
3282 rc = rbd_sysfs_init();
3283 if (rc)
3284 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003285 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003286 return 0;
3287}
3288
/*
 * Module exit point: tear down the sysfs bus interface.  Removing the
 * bus triggers release of any devices still attached to it.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3293
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");