blob: 128978c6a4e0f53bd8d5b7a36ea82fe733d3e037 [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder9e15b772012-10-30 19:40:33 -050073/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050075#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050076
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050078
Alex Elderd8891402012-10-09 13:50:17 -070079/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
Alex Elder81a89792012-02-02 08:13:30 -060087/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070093#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060094#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095
Alex Eldercc0538b2012-08-10 13:12:07 -070096#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070097
Yehuda Sadeh602adf42010-08-12 16:11:25 -070098/*
99 * block device image metadata (in-memory version)
100 */
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for per-segment object names */
	u64 features;		/* RBD_FEATURE_* bits (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot id array + seq */
	char *snap_names;	/* concatenated NUL-terminated snapshot names */
	u64 *snap_sizes;	/* per-snapshot image size, parallel to snapc */

	u64 obj_version;	/* header object version as seen by the OSD */
};
117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	size_t image_id_len;
	char *image_name;
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when mapping the image head */
	char *snap_name;

	struct kref kref;	/* shared between parent and child rbd_devs */
};
157
/* Options parsed from the map request (see rbd_opts_tokens) */
struct rbd_options {
	bool read_only;		/* defaults to RBD_READ_ONLY_DEFAULT */
};
161
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying libceph client */
	struct kref kref;		/* dropped via rbd_put_client() */
	struct list_head node;		/* entry on rbd_client_list */
};
170
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* byte count transferred */
};
179
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of sub-requests expected */
	int num_done;			/* sub-requests completed so far */
	struct kref kref;		/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per sub-request */
};
189
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* owning request collection */
};
201
/* One snapshot of an image, exposed via sysfs (dev) */
struct rbd_snap {
	struct device dev;	/* sysfs device for this snapshot */
	const char *name;
	u64 size;		/* image size at snapshot time */
	struct list_head node;	/* entry on rbd_dev->snaps */
	u64 id;			/* snapshot id */
	u64 features;		/* feature bits at snapshot time */
};
210
/* State describing what (head or snapshot) is currently mapped */
struct rbd_mapping {
	u64 size;	/* size of the mapped image/snapshot */
	u64 features;	/* feature bits of the mapped image/snapshot */
	bool read_only;	/* forced true when a snapshot is mapped */
};
216
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* possibly shared ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	bool exists;		/* mapped snapshot/head still exists on OSD */
	struct rbd_spec *spec;	/* identity of the mapped image */

	char *header_name;	/* name of the image's header object */

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	struct rbd_spec *parent_spec;	/* non-NULL for layered (child) images */
	u64 parent_overlap;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;	/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* protected by ctl_mutex */
};
259
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700260static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600261
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700262static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600263static DEFINE_SPINLOCK(rbd_dev_list_lock);
264
Alex Elder432b8582012-01-29 13:57:44 -0600265static LIST_HEAD(rbd_client_list); /* clients */
266static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267
Alex Elder304f6802012-08-31 17:29:52 -0500268static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
269static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
270
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500272static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800273
Alex Elderf0f8cef2012-01-29 13:57:44 -0600274static ssize_t rbd_add(struct bus_type *bus, const char *buf,
275 size_t count);
276static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
277 size_t count);
278
/* Bus-level attributes: writing "add"/"remove" maps/unmaps images */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
284
/* The "rbd" bus; rbd devices hang off it in sysfs */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
289
/* No-op release; rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}
293
/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
298
Alex Elderaafb2302012-09-06 16:00:54 -0500299#ifdef RBD_DEBUG
300#define rbd_assert(expr) \
301 if (unlikely(!(expr))) { \
302 printk(KERN_ERR "\nAssertion failure in %s() " \
303 "at line %d:\n\n" \
304 "\trbd_assert(%s);\n\n", \
305 __func__, __LINE__, #expr); \
306 BUG(); \
307 }
308#else /* !RBD_DEBUG */
309# define rbd_assert(expr) ((void) 0)
310#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800311
Alex Elder117973f2012-08-31 17:29:55 -0500312static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
313static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700314
/*
 * Block device open: refuse writable opens of a read-only mapping,
 * then take a sysfs device reference and bump the open count (both
 * serialized by ctl_mutex so rbd_remove can refuse busy devices).
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* dropped in rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
330
/*
 * Block device release: undo rbd_open() — drop the open count and
 * the sysfs device reference, under ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);	/* taken in rbd_open() */
	mutex_unlock(&ctl_mutex);

	return 0;
}
343
/* Block device operations for rbd disks */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
349
350/*
351 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500352 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700353 */
Alex Elderf8c38922012-08-10 13:12:07 -0700354static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700355{
356 struct rbd_client *rbdc;
357 int ret = -ENOMEM;
358
359 dout("rbd_client_create\n");
360 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
361 if (!rbdc)
362 goto out_opt;
363
364 kref_init(&rbdc->kref);
365 INIT_LIST_HEAD(&rbdc->node);
366
Alex Elderbc534d862012-01-29 13:57:44 -0600367 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
368
Alex Elder43ae4702012-07-03 16:01:18 -0500369 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700370 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600371 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500372 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700373
374 ret = ceph_open_session(rbdc->client);
375 if (ret < 0)
376 goto out_err;
377
Alex Elder432b8582012-01-29 13:57:44 -0600378 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700379 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600380 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700381
Alex Elderbc534d862012-01-29 13:57:44 -0600382 mutex_unlock(&ctl_mutex);
383
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700384 dout("rbd_client_create created %p\n", rbdc);
385 return rbdc;
386
387out_err:
388 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600389out_mutex:
390 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700391 kfree(rbdc);
392out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500393 if (ceph_opts)
394 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400395 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700396}
397
398/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700399 * Find a ceph client with specific addr and configuration. If
400 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700401 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700402static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700403{
404 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700405 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700406
Alex Elder43ae4702012-07-03 16:01:18 -0500407 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700408 return NULL;
409
Alex Elder1f7ba332012-08-10 13:12:07 -0700410 spin_lock(&rbd_client_list_lock);
411 list_for_each_entry(client_node, &rbd_client_list, node) {
412 if (!ceph_compare_options(ceph_opts, client_node->client)) {
413 kref_get(&client_node->kref);
414 found = true;
415 break;
416 }
417 }
418 spin_unlock(&rbd_client_list_lock);
419
420 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421}
422
423/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700424 * mount options
425 */
426enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700427 Opt_last_int,
428 /* int args above */
429 Opt_last_string,
430 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700431 Opt_read_only,
432 Opt_read_write,
433 /* Boolean args above */
434 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700435};
436
Alex Elder43ae4702012-07-03 16:01:18 -0500437static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700438 /* int args above */
439 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500440 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700441 {Opt_read_only, "ro"}, /* Alternate spelling */
442 {Opt_read_write, "read_write"},
443 {Opt_read_write, "rw"}, /* Alternate spelling */
444 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700445 {-1, NULL}
446};
447
/*
 * Callback for ceph_parse_options(): parse one rbd-specific mount
 * option token.  Only Boolean read-only/read-write options exist
 * today; the int/string arms are placeholders for future options.
 * Returns 0 on success, negative errno on a malformed option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Token ranges are delimited by the Opt_last_* markers */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this can't happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
488
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way: an existing
 * client makes them redundant, a new client takes ownership.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
505
/*
 * Destroy ceph client
 *
 * kref release callback: unlink the client from rbd_client_list,
 * then tear down the underlying ceph client and free the wrapper.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
523
524/*
525 * Drop reference to ceph client node. If it's not referenced anymore, release
526 * it.
527 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500528static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700529{
Alex Elderc53d5892012-10-25 23:34:42 -0500530 if (rbdc)
531 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700532}
533
/*
 * Destroy requests collection
 *
 * kref release callback for struct rbd_req_coll; the status array is
 * allocated inline, so a single kfree() frees everything.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545
Alex Eldera30b71b2012-07-10 20:30:11 -0500546static bool rbd_image_format_valid(u32 image_format)
547{
548 return image_format == 1 || image_format == 2;
549}
550
/*
 * Sanity-check a format-1 on-disk image header before trusting any
 * of its fields: magic text, sane object order, and snapshot counts
 * small enough that later size computations cannot overflow size_t.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
589
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590/*
591 * Create a new header structure, translate header format from the on-disk
592 * header.
593 */
594static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500595 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700596{
Alex Elderccece232012-07-10 20:30:10 -0500597 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500598 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500599 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500600 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700601
Alex Elder6a523252012-07-19 17:12:59 -0500602 memset(header, 0, sizeof (*header));
603
Alex Elder103a1502012-08-02 11:29:45 -0500604 snap_count = le32_to_cpu(ondisk->snap_count);
605
Alex Elder58c17b02012-08-23 23:22:06 -0500606 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
607 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500608 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500610 memcpy(header->object_prefix, ondisk->object_prefix, len);
611 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600612
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700613 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500614 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
615
Alex Elder621901d2012-08-23 23:22:06 -0500616 /* Save a copy of the snapshot names */
617
Alex Elderf785cc12012-08-23 23:22:06 -0500618 if (snap_names_len > (u64) SIZE_MAX)
619 return -EIO;
620 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700621 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500622 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500623 /*
624 * Note that rbd_dev_v1_header_read() guarantees
625 * the ondisk buffer we're working with has
626 * snap_names_len bytes beyond the end of the
627 * snapshot id array, this memcpy() is safe.
628 */
629 memcpy(header->snap_names, &ondisk->snaps[snap_count],
630 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500631
Alex Elder621901d2012-08-23 23:22:06 -0500632 /* Record each snapshot's size */
633
Alex Elderd2bb24e2012-07-26 23:37:14 -0500634 size = snap_count * sizeof (*header->snap_sizes);
635 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500637 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500638 for (i = 0; i < snap_count; i++)
639 header->snap_sizes[i] =
640 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700641 } else {
Alex Elderccece232012-07-10 20:30:10 -0500642 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643 header->snap_names = NULL;
644 header->snap_sizes = NULL;
645 }
Alex Elder849b4262012-07-09 21:04:24 -0500646
Alex Elder34b13182012-07-13 20:35:12 -0500647 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648 header->obj_order = ondisk->options.order;
649 header->crypt_type = ondisk->options.crypt_type;
650 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500651
Alex Elder621901d2012-08-23 23:22:06 -0500652 /* Allocate and fill in the snapshot context */
653
Alex Elderf84344f2012-08-31 17:29:51 -0500654 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500655 size = sizeof (struct ceph_snap_context);
656 size += snap_count * sizeof (header->snapc->snaps[0]);
657 header->snapc = kzalloc(size, GFP_KERNEL);
658 if (!header->snapc)
659 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700660
661 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500662 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500664 for (i = 0; i < snap_count; i++)
665 header->snapc->snaps[i] =
666 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667
668 return 0;
669
Alex Elder6a523252012-07-19 17:12:59 -0500670out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500671 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500672 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500674 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500675 kfree(header->object_prefix);
676 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500677
Alex Elder00f1f362012-02-07 12:03:36 -0600678 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700679}
680
Alex Elder9e15b772012-10-30 19:40:33 -0500681static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
682{
683 struct rbd_snap *snap;
684
685 if (snap_id == CEPH_NOSNAP)
686 return RBD_SNAP_HEAD_NAME;
687
688 list_for_each_entry(snap, &rbd_dev->snaps, node)
689 if (snap_id == snap->id)
690 return snap->name;
691
692 return NULL;
693}
694
Alex Elder8836b992012-08-30 14:42:15 -0500695static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700696{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697
Alex Eldere86924a2012-07-10 20:30:11 -0500698 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600699
Alex Eldere86924a2012-07-10 20:30:11 -0500700 list_for_each_entry(snap, &rbd_dev->snaps, node) {
701 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500702 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500703 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500704 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600705
Alex Eldere86924a2012-07-10 20:30:11 -0500706 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600707 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708 }
Alex Eldere86924a2012-07-10 20:30:11 -0500709
Alex Elder00f1f362012-02-07 12:03:36 -0600710 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700711}
712
/*
 * Set up the device's mapping state from its spec: mapping the
 * special head name maps the live image read-write; any other name
 * maps a snapshot, which is always read-only.  Marks the device as
 * existing on success.  Returns 0 or -ENOENT from snap_by_name().
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the image head: take size/features from header */
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->exists = true;
done:
	return ret;
}
733
/*
 * Release everything rbd_header_from_disk() allocated, NULLing each
 * pointer so a double free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* ref-counted, not kfree'd */
	header->snapc = NULL;
}
745
Alex Elder65ccfe22012-08-09 10:33:26 -0700746static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700747{
Alex Elder65ccfe22012-08-09 10:33:26 -0700748 char *name;
749 u64 segment;
750 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700751
Alex Elder2fd82b92012-11-09 15:05:54 -0600752 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700753 if (!name)
754 return NULL;
755 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600756 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700757 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600758 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700759 pr_err("error formatting segment name for #%llu (%d)\n",
760 segment, ret);
761 kfree(name);
762 name = NULL;
763 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700764
Alex Elder65ccfe22012-08-09 10:33:26 -0700765 return name;
766}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767
Alex Elder65ccfe22012-08-09 10:33:26 -0700768static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
769{
770 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700771
Alex Elder65ccfe22012-08-09 10:33:26 -0700772 return offset & (segment_size - 1);
773}
774
775static u64 rbd_segment_length(struct rbd_device *rbd_dev,
776 u64 offset, u64 length)
777{
778 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
779
780 offset &= segment_size - 1;
781
Alex Elderaafb2302012-09-06 16:00:54 -0500782 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700783 if (offset + length > segment_size)
784 length = segment_size - offset;
785
786 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787}
788
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700789static int rbd_get_num_segments(struct rbd_image_header *header,
790 u64 ofs, u64 len)
791{
Alex Elderdf111be2012-08-09 10:33:26 -0700792 u64 start_seg;
793 u64 end_seg;
794
795 if (!len)
796 return 0;
797 if (len - 1 > U64_MAX - ofs)
798 return -ERANGE;
799
800 start_seg = ofs >> header->obj_order;
801 end_seg = (ofs + len - 1) >> header->obj_order;
802
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700803 return end_seg - start_seg + 1;
804}
805
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700806/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700807 * returns the size of an object in the image
808 */
809static u64 rbd_obj_bytes(struct rbd_image_header *header)
810{
811 return 1 << header->obj_order;
812}
813
814/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700815 * bio helpers
816 */
817
818static void bio_chain_put(struct bio *chain)
819{
820 struct bio *tmp;
821
822 while (chain) {
823 tmp = chain;
824 chain = chain->bi_next;
825 bio_put(tmp);
826 }
827}
828
/*
 * Zero the data in a bio chain from byte offset start_ofs (measured
 * from the start of the chain) through the end of the chain.  Used to
 * zero-fill the unread tail of a short or failed read.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * Zero from start_ofs (or the segment
				 * start, whichever is later) to the end
				 * of this segment.  Map with IRQs off
				 * since pages may be in highmem.
				 */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
855
856/*
Alex Elderf7760da2012-10-20 22:17:27 -0500857 * Clone a portion of a bio, starting at the given byte offset
858 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859 */
Alex Elderf7760da2012-10-20 22:17:27 -0500860static struct bio *bio_clone_range(struct bio *bio_src,
861 unsigned int offset,
862 unsigned int len,
863 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700864{
Alex Elderf7760da2012-10-20 22:17:27 -0500865 struct bio_vec *bv;
866 unsigned int resid;
867 unsigned short idx;
868 unsigned int voff;
869 unsigned short end_idx;
870 unsigned short vcnt;
871 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700872
Alex Elderf7760da2012-10-20 22:17:27 -0500873 /* Handle the easy case for the caller */
874
875 if (!offset && len == bio_src->bi_size)
876 return bio_clone(bio_src, gfpmask);
877
878 if (WARN_ON_ONCE(!len))
879 return NULL;
880 if (WARN_ON_ONCE(len > bio_src->bi_size))
881 return NULL;
882 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
883 return NULL;
884
885 /* Find first affected segment... */
886
887 resid = offset;
888 __bio_for_each_segment(bv, bio_src, idx, 0) {
889 if (resid < bv->bv_len)
890 break;
891 resid -= bv->bv_len;
892 }
893 voff = resid;
894
895 /* ...and the last affected segment */
896
897 resid += len;
898 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
899 if (resid <= bv->bv_len)
900 break;
901 resid -= bv->bv_len;
902 }
903 vcnt = end_idx - idx + 1;
904
905 /* Build the clone */
906
907 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
908 if (!bio)
909 return NULL; /* ENOMEM */
910
911 bio->bi_bdev = bio_src->bi_bdev;
912 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
913 bio->bi_rw = bio_src->bi_rw;
914 bio->bi_flags |= 1 << BIO_CLONED;
915
916 /*
917 * Copy over our part of the bio_vec, then update the first
918 * and last (or only) entries.
919 */
920 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
921 vcnt * sizeof (struct bio_vec));
922 bio->bi_io_vec[0].bv_offset += voff;
923 if (vcnt > 1) {
924 bio->bi_io_vec[0].bv_len -= voff;
925 bio->bi_io_vec[vcnt - 1].bv_len = resid;
926 } else {
927 bio->bi_io_vec[0].bv_len = len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700928 }
929
Alex Elderf7760da2012-10-20 22:17:27 -0500930 bio->bi_vcnt = vcnt;
931 bio->bi_size = len;
932 bio->bi_idx = 0;
Alex Elder542582f2012-08-09 10:33:25 -0700933
Alex Elderf7760da2012-10-20 22:17:27 -0500934 return bio;
935}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700936
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* tail link of the chain being built */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone no more than the rest of this bio, or len */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio; advance to the next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Undo the partial chain; callers see only NULL */
	bio_chain_put(chain);

	return NULL;
}
997
998/*
999 * helpers for osd request op vectors.
1000 */
Alex Elder57cfc102012-06-26 12:57:03 -07001001static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
1002 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001003{
Alex Elder57cfc102012-06-26 12:57:03 -07001004 struct ceph_osd_req_op *ops;
1005
1006 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
1007 if (!ops)
1008 return NULL;
1009
1010 ops[0].op = opcode;
1011
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001012 /*
1013 * op extent offset and length will be set later on
1014 * in calc_raw_layout()
1015 */
Alex Elder57cfc102012-06-26 12:57:03 -07001016 ops[0].payload_len = payload_len;
1017
1018 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001019}
1020
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1025
/*
 * Record completion status for one request in a collection, then
 * complete (in index order) the maximal prefix of requests in the
 * collection that have now all finished.  With no collection, the
 * block request is completed directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection; finish the whole request */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes updates to the collection state */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Extend max over the contiguous run of completed entries */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		/* Complete this piece and drop its collection ref */
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1063
/*
 * Complete a single rbd request using the collection information
 * recorded in the request itself.
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1069
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001070/*
1071 * Send ceph osd request
1072 */
1073static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001074 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001075 struct ceph_snap_context *snapc,
1076 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001077 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001078 struct bio *bio,
1079 struct page **pages,
1080 int num_pages,
1081 int flags,
1082 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001083 struct rbd_req_coll *coll,
1084 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001085 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001086 struct ceph_msg *msg),
1087 struct ceph_osd_request **linger_req,
1088 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001089{
1090 struct ceph_osd_request *req;
1091 struct ceph_file_layout *layout;
1092 int ret;
1093 u64 bno;
1094 struct timespec mtime = CURRENT_TIME;
1095 struct rbd_request *req_data;
1096 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -06001097 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001098
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001099 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001100 if (!req_data) {
1101 if (coll)
1102 rbd_coll_end_req_index(rq, coll, coll_index,
1103 -ENOMEM, len);
1104 return -ENOMEM;
1105 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001106
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001107 if (coll) {
1108 req_data->coll = coll;
1109 req_data->coll_index = coll_index;
1110 }
1111
Alex Elderf7760da2012-10-20 22:17:27 -05001112 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1113 object_name, (unsigned long long) ofs,
1114 (unsigned long long) len, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001115
Alex Elder0ce1a792012-07-03 16:01:18 -05001116 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -06001117 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1118 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -07001119 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -07001120 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001121 goto done_pages;
1122 }
1123
1124 req->r_callback = rbd_cb;
1125
1126 req_data->rq = rq;
1127 req_data->bio = bio;
1128 req_data->pages = pages;
1129 req_data->len = len;
1130
1131 req->r_priv = req_data;
1132
1133 reqhead = req->r_request->front.iov_base;
1134 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1135
Alex Elderaded07e2012-07-03 16:01:18 -05001136 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001137 req->r_oid_len = strlen(req->r_oid);
1138
1139 layout = &req->r_file_layout;
1140 memset(layout, 0, sizeof(*layout));
1141 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1142 layout->fl_stripe_count = cpu_to_le32(1);
1143 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001144 layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
Sage Weil6cae3712012-09-24 21:02:47 -07001145 ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1146 req, ops);
1147 rbd_assert(ret == 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148
1149 ceph_osdc_build_request(req, ofs, &len,
1150 ops,
1151 snapc,
1152 &mtime,
1153 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001154
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001155 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001156 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001157 *linger_req = req;
1158 }
1159
Alex Elder1dbb4392012-01-24 10:08:37 -06001160 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161 if (ret < 0)
1162 goto done_err;
1163
1164 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001165 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001166 if (ver)
1167 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001168 dout("reassert_ver=%llu\n",
1169 (unsigned long long)
1170 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001171 ceph_osdc_put_request(req);
1172 }
1173 return ret;
1174
1175done_err:
1176 bio_chain_put(req_data->bio);
1177 ceph_osdc_put_request(req);
1178done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001179 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181 return ret;
1182}
1183
/*
 * Ceph osd op callback.
 *
 * Completion handler for asynchronous rbd requests: parses the osd
 * reply, zero-fills read data for holes (-ENOENT) or short reads,
 * completes the collection entry, and releases the request, its bio
 * chain reference, and the per-request bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading a nonexistent object: treat as all zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1223
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1228
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001229/*
1230 * Do a synchronous ceph osd operation
1231 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001232static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001233 struct ceph_snap_context *snapc,
1234 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001235 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001236 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001237 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001238 u64 ofs, u64 inbound_size,
1239 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001240 struct ceph_osd_request **linger_req,
1241 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001242{
1243 int ret;
1244 struct page **pages;
1245 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001246
Alex Elderaafb2302012-09-06 16:00:54 -05001247 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001248
Alex Elderf8d4de62012-07-03 16:01:19 -05001249 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001250 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001251 if (IS_ERR(pages))
1252 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001253
Alex Elder0ce1a792012-07-03 16:01:18 -05001254 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderf8d4de62012-07-03 16:01:19 -05001255 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001256 pages, num_pages,
1257 flags,
1258 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001259 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001260 NULL,
1261 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001262 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001263 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001264
Alex Elderf8d4de62012-07-03 16:01:19 -05001265 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1266 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001267
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001268done:
1269 ceph_release_page_vector(pages, num_pages);
1270 return ret;
1271}
1272
/*
 * Do an asynchronous ceph osd operation.
 *
 * Issues a single read or write (direction taken from rq) against the
 * object segment containing image offset ofs.  The caller must have
 * already split the bio chain so [ofs, ofs + len) lies entirely within
 * one segment.  Completion is reported through the collection entry.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		/* Writes go to HEAD and carry the data as payload */
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
		payload_len = seg_len;
	} else {
		/* Reads target the mapped snapshot; no snap context */
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		snapc = NULL;
		snapid = rbd_dev->spec->snap_id;
		payload_len = 0;
	}

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1337
1338/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001339 * Request sync osd read
1340 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001341static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001342 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001343 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001344 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001345 char *buf,
1346 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001347{
Alex Elder913d2fd2012-06-26 12:57:03 -07001348 struct ceph_osd_req_op *ops;
1349 int ret;
1350
1351 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1352 if (!ops)
1353 return -ENOMEM;
1354
1355 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001356 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001357 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001358 ops, object_name, ofs, len, buf, NULL, ver);
1359 rbd_destroy_ops(ops);
1360
1361 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001362}
1363
/*
 * Acknowledge receipt of a watch notification on the header object.
 * (Fire-and-forget: the reply is discarded by rbd_simple_req_cb.)
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	/* Echo back the version and id from the notification */
	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1393
/*
 * Callback invoked when a watched header object changes (e.g. a
 * snapshot was created or the image was resized): refresh the device
 * from the new header, then acknowledge the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_dev_refresh(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* Ack even if the refresh failed, so the osd doesn't time us out */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1413
/*
 * Request sync osd watch.
 *
 * Register a watch on the image header object so rbd_watch_cb() is
 * invoked when it changes.  Creates the osd event first, then issues
 * a lingering WATCH op; on failure the event is torn down again.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	/* flag=1 means register (vs. unregister) the watch */
	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1457
/*
 * Request sync osd unwatch.
 *
 * Unregister the watch on the header object (WATCH op with flag=0)
 * and cancel the associated osd event.  The event is cancelled even
 * if the unwatch op fails, so no further callbacks can fire.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	/* flag=0 with the original cookie unregisters the watch */
	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1487
/*
 * Synchronous osd object method call.
 *
 * Invoke class_name.method_name on the named object, sending
 * outbound/outbound_size as the method's input and copying up to
 * inbound_size bytes of its output into inbound.  Returns bytes
 * returned (for reads) or a negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1540
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001541static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1542{
1543 struct rbd_req_coll *coll =
1544 kzalloc(sizeof(struct rbd_req_coll) +
1545 sizeof(struct rbd_req_status) * num_reqs,
1546 GFP_ATOMIC);
1547
1548 if (!coll)
1549 return NULL;
1550 coll->total = num_reqs;
1551 kref_init(&coll->kref);
1552 return coll;
1553}
1554
/*
 * block device queue callback
 *
 * Pulls requests off the queue and issues one OSD operation per
 * object segment the request touches.  Called with q->queue_lock
 * held; the lock is dropped while each request is carved up and
 * re-taken before fetching the next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		bool do_write;
		unsigned int size;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;
		unsigned int bio_offset;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Drop the queue lock while building the osd requests */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been removed underneath us */
		if (!rbd_dev->exists) {
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Take a snapshot context reference for all the segments */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		bio = rq->bio;

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		/* One collection tracks completion of all the segments */
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* Issue one osd request per object segment */
		bio_offset = 0;
		do {
			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
			unsigned int chain_size;
			struct bio *bio_chain;

			BUG_ON(limit > (u64) UINT_MAX);
			chain_size = (unsigned int) limit;
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);

			/* Each in-flight segment holds a collection ref */
			kref_get(&coll->kref);

			/* Pass a cloned bio chain via an osd request */

			bio_chain = bio_chain_clone_range(&bio,
						&bio_offset, chain_size,
						GFP_ATOMIC);
			if (bio_chain)
				(void) rbd_do_op(rq, rbd_dev, snapc,
						ofs, chain_size,
						bio_chain, coll, cur_seg);
			else
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, chain_size);
			size -= chain_size;
			ofs += chain_size;

			cur_seg++;
		} while (size > 0);
		/* Drop the initial reference taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1664
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be merged into the
 * bio described by @bmd (possibly zero, except for an empty bio,
 * which must always be allowed to accept one page).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* obj_order is a power of two, so masking gives the offset */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1710
1711static void rbd_free_disk(struct rbd_device *rbd_dev)
1712{
1713 struct gendisk *disk = rbd_dev->disk;
1714
1715 if (!disk)
1716 return;
1717
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001718 if (disk->flags & GENHD_FL_UP)
1719 del_gendisk(disk);
1720 if (disk->queue)
1721 blk_cleanup_queue(disk->queue);
1722 put_disk(disk);
1723}
1724
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 *
 * The returned buffer is owned by the caller, who must kfree() it.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Free the previous (too-small) attempt, if any */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the header shrank under us: give up */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->spec->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->spec->image_name);
			goto out_err;
		}

		/* Retry if the snapshot count changed since we sized it */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1796
1797/*
1798 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799 */
1800static int rbd_read_header(struct rbd_device *rbd_dev,
1801 struct rbd_image_header *header)
1802{
Alex Elder4156d992012-08-02 11:29:46 -05001803 struct rbd_image_header_ondisk *ondisk;
1804 u64 ver = 0;
1805 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001806
Alex Elder4156d992012-08-02 11:29:46 -05001807 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1808 if (IS_ERR(ondisk))
1809 return PTR_ERR(ondisk);
1810 ret = rbd_header_from_disk(header, ondisk);
1811 if (ret >= 0)
1812 header->obj_version = ver;
1813 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001814
Alex Elder4156d992012-08-02 11:29:46 -05001815 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001816}
1817
/*
 * Remove every snapshot device from the rbd device's snapshot list,
 * unregistering each from sysfs.  Uses the _safe iterator because
 * rbd_remove_snap_dev() deletes entries as it goes.
 */
static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		rbd_remove_snap_dev(snap);
}
1826
Alex Elder94785542012-10-09 13:50:17 -07001827static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1828{
1829 sector_t size;
1830
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001831 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001832 return;
1833
1834 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1835 dout("setting size to %llu sectors", (unsigned long long) size);
1836 rbd_dev->mapping.size = (u64) size;
1837 set_capacity(rbd_dev->disk, size);
1838}
1839
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840/*
1841 * only read the first part of the ondisk header, without the snaps info
1842 */
Alex Elder117973f2012-08-31 17:29:55 -05001843static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001844{
1845 int ret;
1846 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001847
1848 ret = rbd_read_header(rbd_dev, &h);
1849 if (ret < 0)
1850 return ret;
1851
Josh Durgina51aa0c2011-12-05 10:35:04 -08001852 down_write(&rbd_dev->header_rwsem);
1853
Alex Elder94785542012-10-09 13:50:17 -07001854 /* Update image size, and check for resize of mapped image */
1855 rbd_dev->header.image_size = h.image_size;
1856 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001857
Alex Elder849b4262012-07-09 21:04:24 -05001858 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001860 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001861 /* osd requests may still refer to snapc */
1862 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001863
Alex Elderb8136232012-07-25 09:32:41 -05001864 if (hver)
1865 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001866 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001867 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001868 rbd_dev->header.snapc = h.snapc;
1869 rbd_dev->header.snap_names = h.snap_names;
1870 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001871 /* Free the extra copy of the object prefix */
1872 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1873 kfree(h.object_prefix);
1874
Alex Elder304f6802012-08-31 17:29:52 -05001875 ret = rbd_dev_snaps_update(rbd_dev);
1876 if (!ret)
1877 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001878
Josh Durginc6666012011-11-21 17:11:12 -08001879 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001880
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001881 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001882}
1883
Alex Elder117973f2012-08-31 17:29:55 -05001884static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001885{
1886 int ret;
1887
Alex Elder117973f2012-08-31 17:29:55 -05001888 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001889 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001890 if (rbd_dev->image_format == 1)
1891 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1892 else
1893 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001894 mutex_unlock(&ctl_mutex);
1895
1896 return ret;
1897}
1898
/*
 * Allocate and set up the gendisk and request queue for @rbd_dev,
 * sizing I/O limits to the rbd object size and setting the initial
 * device capacity from the mapping.  Returns 0 on success, -ENOMEM
 * if the disk or queue cannot be allocated.  The disk is not added
 * (add_disk) here.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep single bios within one rbd object */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1947
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001948/*
1949 sysfs
1950*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001951
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1956
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001957static ssize_t rbd_size_show(struct device *dev,
1958 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001959{
Alex Elder593a9e72012-02-07 12:03:37 -06001960 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001961 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001962
Josh Durgina51aa0c2011-12-05 10:35:04 -08001963 down_read(&rbd_dev->header_rwsem);
1964 size = get_capacity(rbd_dev->disk);
1965 up_read(&rbd_dev->header_rwsem);
1966
1967 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001968}
1969
Alex Elder34b13182012-07-13 20:35:12 -05001970/*
1971 * Note this shows the features for whatever's mapped, which is not
1972 * necessarily the base image.
1973 */
1974static ssize_t rbd_features_show(struct device *dev,
1975 struct device_attribute *attr, char *buf)
1976{
1977 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1978
1979 return sprintf(buf, "0x%016llx\n",
1980 (unsigned long long) rbd_dev->mapping.features);
1981}
1982
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001983static ssize_t rbd_major_show(struct device *dev,
1984 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001985{
Alex Elder593a9e72012-02-07 12:03:37 -06001986 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001987
1988 return sprintf(buf, "%d\n", rbd_dev->major);
1989}
1990
1991static ssize_t rbd_client_id_show(struct device *dev,
1992 struct device_attribute *attr, char *buf)
1993{
Alex Elder593a9e72012-02-07 12:03:37 -06001994 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001995
Alex Elder1dbb4392012-01-24 10:08:37 -06001996 return sprintf(buf, "client%lld\n",
1997 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001998}
1999
2000static ssize_t rbd_pool_show(struct device *dev,
2001 struct device_attribute *attr, char *buf)
2002{
Alex Elder593a9e72012-02-07 12:03:37 -06002003 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002004
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002005 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002006}
2007
Alex Elder9bb2f332012-07-12 10:46:35 -05002008static ssize_t rbd_pool_id_show(struct device *dev,
2009 struct device_attribute *attr, char *buf)
2010{
2011 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2012
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002013 return sprintf(buf, "%llu\n",
2014 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002015}
2016
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002017static ssize_t rbd_name_show(struct device *dev,
2018 struct device_attribute *attr, char *buf)
2019{
Alex Elder593a9e72012-02-07 12:03:37 -06002020 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002021
Alex Eldera92ffdf2012-10-30 19:40:33 -05002022 if (rbd_dev->spec->image_name)
2023 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2024
2025 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002026}
2027
Alex Elder589d30e2012-07-10 20:30:11 -05002028static ssize_t rbd_image_id_show(struct device *dev,
2029 struct device_attribute *attr, char *buf)
2030{
2031 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2032
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002033 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002034}
2035
Alex Elder34b13182012-07-13 20:35:12 -05002036/*
2037 * Shows the name of the currently-mapped snapshot (or
2038 * RBD_SNAP_HEAD_NAME for the base image).
2039 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002040static ssize_t rbd_snap_show(struct device *dev,
2041 struct device_attribute *attr,
2042 char *buf)
2043{
Alex Elder593a9e72012-02-07 12:03:37 -06002044 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002045
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002046 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002047}
2048
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 *
 * Output is several "key value" lines accumulated into @buf; the
 * count-< 0 checks are defensive, since sprintf() into a valid
 * buffer does not fail here.
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
2091
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002092static ssize_t rbd_image_refresh(struct device *dev,
2093 struct device_attribute *attr,
2094 const char *buf,
2095 size_t size)
2096{
Alex Elder593a9e72012-02-07 12:03:37 -06002097 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002098 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002099
Alex Elder117973f2012-08-31 17:29:55 -05002100 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002101
2102 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002103}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002104
/* Per-device sysfs attributes; all read-only except "refresh" */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing device-specific to free; rbd_dev lifetime handled elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2150
2151
2152/*
2153 sysfs - snapshots
2154*/
2155
2156static ssize_t rbd_snap_size_show(struct device *dev,
2157 struct device_attribute *attr,
2158 char *buf)
2159{
2160 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2161
Josh Durgin35915382011-12-05 18:25:13 -08002162 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002163}
2164
2165static ssize_t rbd_snap_id_show(struct device *dev,
2166 struct device_attribute *attr,
2167 char *buf)
2168{
2169 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2170
Josh Durgin35915382011-12-05 18:25:13 -08002171 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002172}
2173
Alex Elder34b13182012-07-13 20:35:12 -05002174static ssize_t rbd_snap_features_show(struct device *dev,
2175 struct device_attribute *attr,
2176 char *buf)
2177{
2178 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2179
2180 return sprintf(buf, "0x%016llx\n",
2181 (unsigned long long) snap->features);
2182}
2183
/* Per-snapshot sysfs attributes (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final release for a snapshot device: frees the rbd_snap itself */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2215
/* Take an additional reference on @spec; returns @spec for chaining. */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
2222
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on @spec (NULL is allowed and is a no-op). */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
2229
/*
 * Allocate a zeroed rbd_spec with a single reference held by the
 * caller.  Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/* exercises get/put; intentionally left in by the author */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2243
/* kref release callback: frees the spec and all its owned strings. */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2254
Alex Elderc53d5892012-10-25 23:34:42 -05002255struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2256 struct rbd_spec *spec)
2257{
2258 struct rbd_device *rbd_dev;
2259
2260 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2261 if (!rbd_dev)
2262 return NULL;
2263
2264 spin_lock_init(&rbd_dev->lock);
2265 INIT_LIST_HEAD(&rbd_dev->node);
2266 INIT_LIST_HEAD(&rbd_dev->snaps);
2267 init_rwsem(&rbd_dev->header_rwsem);
2268
2269 rbd_dev->spec = spec;
2270 rbd_dev->rbd_client = rbdc;
2271
2272 return rbd_dev;
2273}
2274
/*
 * Free an rbd_device, dropping its parent spec, client, and spec
 * references and its header object name.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2283
/*
 * Report whether a snapshot's device has been registered.  The
 * device type is assigned at registration time, so it can stand in
 * for the registered state; the assertion checks the two indicators
 * agree (!ret ^ reg is true exactly when ret == reg).
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2293
/*
 * Remove a snapshot from its device's snapshot list and, if it was
 * registered, unregister its sysfs device (which ultimately frees
 * the snap via rbd_snap_dev_release()).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2300
Alex Elder14e70852012-07-19 09:09:27 -05002301static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002302 struct device *parent)
2303{
2304 struct device *dev = &snap->dev;
2305 int ret;
2306
2307 dev->type = &rbd_snap_device_type;
2308 dev->parent = parent;
2309 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002310 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002311 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2312
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002313 ret = device_register(dev);
2314
2315 return ret;
2316}
2317
/*
 * Allocate and fill in a new rbd_snap with the given name, id,
 * size, and feature mask.  The name is duplicated, so the caller
 * retains ownership of @snap_name.  Returns the new snap, or a
 * pointer-coded -ENOMEM on allocation failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   const char *snap_name,
					   u64 snap_id, u64 snap_size,
					   u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	/* snap->name may be NULL here; kfree(NULL) is a no-op */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2347
Alex Eldercd892122012-07-03 16:01:19 -05002348static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2349 u64 *snap_size, u64 *snap_features)
2350{
2351 char *snap_name;
2352
2353 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2354
2355 *snap_size = rbd_dev->header.snap_sizes[which];
2356 *snap_features = 0; /* No features for v1 */
2357
2358 /* Skip over names until we find the one we are looking for */
2359
2360 snap_name = rbd_dev->header.snap_names;
2361 while (which--)
2362 snap_name += strlen(snap_name) + 1;
2363
2364 return snap_name;
2365}
2366
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Returns 0 on success or a negative errno from the OSD request.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" class method reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2400
/* Fetch the size and object order of the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2407
/*
 * Fetch the object name prefix for a format 2 image and record it
 * in rbd_dev->header.object_prefix as a dynamically-allocated
 * string.  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation.
	 * NOTE(review): GFP_NOIO here while the reply buffer above uses
	 * GFP_KERNEL — confirm whether the stricter flag is required. */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2445
/*
 * Get the feature bits for an image snapshot, or for the base
 * image if snap_id is CEPH_NOSNAP.  Fails with -ENXIO if the
 * image uses incompatible features this driver does not support.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" class method reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map images requiring features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2479
/* Fetch the feature bits for the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2485
/*
 * Fetch the parent (layering) information for a format 2 image:
 * parent pool id, image id, snapshot id, and overlap size.  On
 * success the result is recorded in rbd_dev->parent_spec and
 * rbd_dev->parent_overlap; a pool id of CEPH_NOPOOL means the
 * image has no parent (and is not an error).  Returns 0 on
 * success or a negative errno.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	size_t len = 0;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Worst-case size of the "get_parent" reply */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Decode failures below report -ERANGE (reply too short) */
	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	parent_spec->image_id_len = len;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op once ownership transferred */

	return ret;
}
2551
/*
 * Look up a format 2 image's name in the rbd directory object,
 * given its image id.  Returns a dynamically-allocated name, or
 * NULL on any failure (callers treat the name as optional).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the method argument: length-prefixed encoded image id */
	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
				(u32) rbd_dev->spec->image_id_len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* lookup failure is tolerated */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2601
2602/*
2603 * When a parent image gets probed, we only have the pool, image,
2604 * and snapshot ids but not the names of any of them. This call
2605 * is made later to fill in those names. It has to be done after
2606 * rbd_dev_snaps_update() has completed because some of the
2607 * information (in particular, snapshot name) is not available
2608 * until then.
2609 */
2610static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2611{
2612 struct ceph_osd_client *osdc;
2613 const char *name;
2614 void *reply_buf = NULL;
2615 int ret;
2616
2617 if (rbd_dev->spec->pool_name)
2618 return 0; /* Already have the names */
2619
2620 /* Look up the pool name */
2621
2622 osdc = &rbd_dev->rbd_client->client->osdc;
2623 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2624 if (!name)
2625 return -EIO; /* pool id too large (>= 2^31) */
2626
2627 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2628 if (!rbd_dev->spec->pool_name)
2629 return -ENOMEM;
2630
2631 /* Fetch the image name; tolerate failure here */
2632
2633 name = rbd_dev_image_name(rbd_dev);
2634 if (name) {
2635 rbd_dev->spec->image_name_len = strlen(name);
2636 rbd_dev->spec->image_name = (char *) name;
2637 } else {
2638 pr_warning(RBD_DRV_NAME "%d "
2639 "unable to get image name for image id %s\n",
2640 rbd_dev->major, rbd_dev->spec->image_id);
2641 }
2642
2643 /* Look up the snapshot name. */
2644
2645 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2646 if (!name) {
2647 ret = -EIO;
2648 goto out_err;
2649 }
2650 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2651 if(!rbd_dev->spec->snap_name)
2652 goto out_err;
2653
2654 return 0;
2655out_err:
2656 kfree(reply_buf);
2657 kfree(rbd_dev->spec->pool_name);
2658 rbd_dev->spec->pool_name = NULL;
2659
2660 return ret;
2661}
2662
Alex Elder6e14b1a2012-07-03 16:01:19 -05002663static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002664{
2665 size_t size;
2666 int ret;
2667 void *reply_buf;
2668 void *p;
2669 void *end;
2670 u64 seq;
2671 u32 snap_count;
2672 struct ceph_snap_context *snapc;
2673 u32 i;
2674
2675 /*
2676 * We'll need room for the seq value (maximum snapshot id),
2677 * snapshot count, and array of that many snapshot ids.
2678 * For now we have a fixed upper limit on the number we're
2679 * prepared to receive.
2680 */
2681 size = sizeof (__le64) + sizeof (__le32) +
2682 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2683 reply_buf = kzalloc(size, GFP_KERNEL);
2684 if (!reply_buf)
2685 return -ENOMEM;
2686
2687 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2688 "rbd", "get_snapcontext",
2689 NULL, 0,
2690 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002691 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002692 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2693 if (ret < 0)
2694 goto out;
2695
2696 ret = -ERANGE;
2697 p = reply_buf;
2698 end = (char *) reply_buf + size;
2699 ceph_decode_64_safe(&p, end, seq, out);
2700 ceph_decode_32_safe(&p, end, snap_count, out);
2701
2702 /*
2703 * Make sure the reported number of snapshot ids wouldn't go
2704 * beyond the end of our buffer. But before checking that,
2705 * make sure the computed size of the snapshot context we
2706 * allocate is representable in a size_t.
2707 */
2708 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2709 / sizeof (u64)) {
2710 ret = -EINVAL;
2711 goto out;
2712 }
2713 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2714 goto out;
2715
2716 size = sizeof (struct ceph_snap_context) +
2717 snap_count * sizeof (snapc->snaps[0]);
2718 snapc = kmalloc(size, GFP_KERNEL);
2719 if (!snapc) {
2720 ret = -ENOMEM;
2721 goto out;
2722 }
2723
2724 atomic_set(&snapc->nref, 1);
2725 snapc->seq = seq;
2726 snapc->num_snaps = snap_count;
2727 for (i = 0; i < snap_count; i++)
2728 snapc->snaps[i] = ceph_decode_64(&p);
2729
2730 rbd_dev->header.snapc = snapc;
2731
2732 dout(" snap context seq = %llu, snap_count = %u\n",
2733 (unsigned long long) seq, (unsigned int) snap_count);
2734
2735out:
2736 kfree(reply_buf);
2737
2738 return 0;
2739}
2740
/*
 * Fetch the name of the snapshot at index "which" in the current
 * snapshot context of a format 2 image.  Returns a dynamically-
 * allocated string, or an ERR_PTR-encoded errno.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	/* Caller owns the extracted string */
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2784
2785static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2786 u64 *snap_size, u64 *snap_features)
2787{
2788 __le64 snap_id;
2789 u8 order;
2790 int ret;
2791
2792 snap_id = rbd_dev->header.snapc->snaps[which];
2793 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2794 if (ret)
2795 return ERR_PTR(ret);
2796 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2797 if (ret)
2798 return ERR_PTR(ret);
2799
2800 return rbd_dev_v2_snap_name(rbd_dev, which);
2801}
2802
2803static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2804 u64 *snap_size, u64 *snap_features)
2805{
2806 if (rbd_dev->image_format == 1)
2807 return rbd_dev_v1_snap_info(rbd_dev, which,
2808 snap_size, snap_features);
2809 if (rbd_dev->image_format == 2)
2810 return rbd_dev_v2_snap_info(rbd_dev, which,
2811 snap_size, snap_features);
2812 return ERR_PTR(-EINVAL);
2813}
2814
Alex Elder117973f2012-08-31 17:29:55 -05002815static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2816{
2817 int ret;
2818 __u8 obj_order;
2819
2820 down_write(&rbd_dev->header_rwsem);
2821
2822 /* Grab old order first, to see if it changes */
2823
2824 obj_order = rbd_dev->header.obj_order,
2825 ret = rbd_dev_v2_image_size(rbd_dev);
2826 if (ret)
2827 goto out;
2828 if (rbd_dev->header.obj_order != obj_order) {
2829 ret = -EIO;
2830 goto out;
2831 }
2832 rbd_update_mapping_size(rbd_dev);
2833
2834 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2835 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2836 if (ret)
2837 goto out;
2838 ret = rbd_dev_snaps_update(rbd_dev);
2839 dout("rbd_dev_snaps_update returned %d\n", ret);
2840 if (ret)
2841 goto out;
2842 ret = rbd_dev_snaps_register(rbd_dev);
2843 dout("rbd_dev_snaps_register returned %d\n", ret);
2844out:
2845 up_write(&rbd_dev->header_rwsem);
2846
2847 return ret;
2848}
2849
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Classic sorted-merge: walk the context and the list in step */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		/* CEPH_NOSNAP / NULL mark exhaustion of either side */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If the mapped snapshot vanished, mark it gone */
			if (rbd_dev->spec->snap_id == snap->id)
				rbd_dev->exists = false;
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/* NOTE(review): prints snap_count where "index" looks
		 * intended — debug output only; confirm. */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			/* NOTE(review): stray "\n" inside the ternary arm
			 * yields a doubled newline — debug output only. */
			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* Known snapshots must not change under us */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2954
Alex Elder304f6802012-08-31 17:29:52 -05002955/*
2956 * Scan the list of snapshots and register the devices for any that
2957 * have not already been registered.
2958 */
2959static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2960{
2961 struct rbd_snap *snap;
2962 int ret = 0;
2963
2964 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002965 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2966 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002967
2968 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2969 if (!rbd_snap_registered(snap)) {
2970 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2971 if (ret < 0)
2972 break;
2973 }
2974 }
2975 dout("%s: returning %d\n", __func__, ret);
2976
2977 return ret;
2978}
2979
/*
 * Register the rbd device on the rbd bus so it appears in sysfs.
 * The sysfs name is the numeric device id.  Serialized against
 * other control operations by ctl_mutex.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* runs when the last ref drops */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2999
/*
 * Remove the rbd device from sysfs.  NOTE(review): this drops the
 * device reference; rbd_dev_release (set at registration time)
 * presumably performs the final teardown — confirm.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3004
/*
 * Establish a watch on the image header object.  A watch request
 * can fail with -ERANGE when our header version is stale; in that
 * case refresh the device and retry until the watch sticks or a
 * different error occurs.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			/* Header changed under us; refresh and retry */
			rc = rbd_dev_refresh(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
3020
/* Highest device id handed out so far; see rbd_dev_id_get/put() */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment makes ids unique without holding a lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003037
/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* NOTE(review): inner rbd_dev shadows the parameter;
		 * appears intentional but is easy to misread. */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3088
/*
 * Advance *buf past any leading white space and return the length
 * of the token (run of non-white-space characters) that follows.
 * *buf must be '\0'-terminated; a return of 0 means only white
 * space (possibly none) remained.
 */
static inline size_t next_token(const char **buf)
{
	/* The characters isspace() matches in the "C"/"POSIX" locales */
	const char *delims = " \f\n\r\t\v";

	*buf += strspn(*buf, delims);		/* skip to token start */

	return strcspn(*buf, delims);		/* token length */
}
3107
/*
 * Finds the next token in *buf and, if it fits, copies it into the
 * caller-supplied buffer with a terminating '\0'.  *buf must
 * itself be '\0'-terminated on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * A return of 0 means no token was found; a return >= token_size
 * means the token did not fit and was not copied.
 *
 * On return *buf points just past the end of the token, whether
 * or not the token was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3137
3138/*
Alex Elderea3352f2012-07-09 21:04:23 -05003139 * Finds the next token in *buf, dynamically allocates a buffer big
3140 * enough to hold a copy of it, and copies the token into the new
3141 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3142 * that a duplicate buffer is created even for a zero-length token.
3143 *
3144 * Returns a pointer to the newly-allocated duplicate, or a null
3145 * pointer if memory for the duplicate was not available. If
3146 * the lenp argument is a non-null pointer, the length of the token
3147 * (not including the '\0') is returned in *lenp.
3148 *
3149 * If successful, the *buf pointer will be updated to point beyond
3150 * the end of the found token.
3151 *
3152 * Note: uses GFP_KERNEL for allocation.
3153 */
3154static inline char *dup_token(const char **buf, size_t *lenp)
3155{
3156 char *dup;
3157 size_t len;
3158
3159 len = next_token(buf);
3160 dup = kmalloc(len + 1, GFP_KERNEL);
3161 if (!dup)
3162 return NULL;
3163
3164 memcpy(dup, *buf, len);
3165 *(dup + len) = '\0';
3166 *buf += len;
3167
3168 if (lenp)
3169 *lenp = len;
3170
3171 return dup;
3172}
3173
3174/*
Alex Elder859c31d2012-10-25 23:34:42 -05003175 * Parse the options provided for an "rbd add" (i.e., rbd image
3176 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3177 * and the data written is passed here via a NUL-terminated buffer.
3178 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003179 *
Alex Elder859c31d2012-10-25 23:34:42 -05003180 * The information extracted from these options is recorded in
3181 * the other parameters which return dynamically-allocated
3182 * structures:
3183 * ceph_opts
3184 * The address of a pointer that will refer to a ceph options
3185 * structure. Caller must release the returned pointer using
3186 * ceph_destroy_options() when it is no longer needed.
3187 * rbd_opts
3188 * Address of an rbd options pointer. Fully initialized by
3189 * this function; caller must release with kfree().
3190 * spec
3191 * Address of an rbd image specification pointer. Fully
3192 * initialized by this function based on parsed options.
3193 * Caller must release with rbd_spec_put().
3194 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003214 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;	/* points into buf; NOT NUL-terminated */
	size_t mon_addrs_size;	/* token length plus one */
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* ret is preset so an empty-token check can just "goto out_err" */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, &spec->image_name_len);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	memcpy(spec->snap_name, buf, len);
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() consumes the ceph-level options and
	 * calls parse_rbd_opts_token() for any token it does not
	 * recognize, which fills in rbd_opts.  The end pointer is
	 * one past the last monitor-address byte.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand all three allocations over to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:	/* note: out_mem falls through to the common cleanup here */
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3310
Alex Elder589d30e2012-07-10 20:30:11 -05003311/*
3312 * An rbd format 2 image has a unique identifier, distinct from the
3313 * name given to it by the user. Internally, that identifier is
3314 * what's used to specify the names of objects related to the image.
3315 *
3316 * A special "rbd id" object is used to map an rbd image name to its
3317 * id. If that object doesn't exist, then there is no v2 rbd image
3318 * with the supplied name.
3319 *
3320 * This function will record the given rbd_dev's image_id field if
3321 * it can be determined, and in that case will return 0. If any
3322 * errors occur a negative errno will be returned and the rbd_dev's
3323 * image_id field will be unchanged (and should be NULL).
3324 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" class method on the id object, read-only */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* keep the contract: image_id stays NULL on failure */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3388
Alex Eldera30b71b2012-07-10 20:30:11 -05003389static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3390{
3391 int ret;
3392 size_t size;
3393
3394 /* Version 1 images have no id; empty string is used */
3395
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003396 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3397 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003398 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003399 rbd_dev->spec->image_id_len = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05003400
3401 /* Record the header object name for this rbd image. */
3402
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003403 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003404 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3405 if (!rbd_dev->header_name) {
3406 ret = -ENOMEM;
3407 goto out_err;
3408 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003409 sprintf(rbd_dev->header_name, "%s%s",
3410 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003411
3412 /* Populate rbd image metadata */
3413
3414 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3415 if (ret < 0)
3416 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05003417
3418 /* Version 1 images have no parent (no layering) */
3419
3420 rbd_dev->parent_spec = NULL;
3421 rbd_dev->parent_overlap = 0;
3422
Alex Eldera30b71b2012-07-10 20:30:11 -05003423 rbd_dev->image_format = 1;
3424
3425 dout("discovered version 1 image, header name is %s\n",
3426 rbd_dev->header_name);
3427
3428 return 0;
3429
3430out_err:
3431 kfree(rbd_dev->header_name);
3432 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003433 kfree(rbd_dev->spec->image_id);
3434 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003435
3436 return ret;
3437}
3438
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/*
	 * Unwind everything this function may have set up, including
	 * the parent spec reference taken by rbd_dev_v2_parent_info().
	 */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3511
/*
 * Complete a device probe: update snapshots and the mapping, assign a
 * device id and block major, create the disk, register with sysfs and
 * set up watch.  On success the disk has been announced via add_disk().
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;	/* register_blkdev(0, ...) returns the major */

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3593
Alex Eldera30b71b2012-07-10 20:30:11 -05003594/*
3595 * Probe for the existence of the header object for the given rbd
3596 * device. For format 2 images this includes determining the image
3597 * id.
3598 */
3599static int rbd_dev_probe(struct rbd_device *rbd_dev)
3600{
3601 int ret;
3602
3603 /*
3604 * Get the id from the image id object. If it's not a
3605 * format 2 image, we'll get ENOENT back, and we'll assume
3606 * it's a format 1 image.
3607 */
3608 ret = rbd_dev_image_id(rbd_dev);
3609 if (ret)
3610 ret = rbd_dev_v1_probe(rbd_dev);
3611 else
3612 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003613 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003614 dout("probe failed, returning %d\n", ret);
3615
Alex Elder83a06262012-10-30 15:47:17 -05003616 return ret;
3617 }
3618
3619 ret = rbd_dev_probe_finish(rbd_dev);
3620 if (ret)
3621 rbd_header_free(&rbd_dev->header);
3622
Alex Eldera30b71b2012-07-10 20:30:11 -05003623 return ret;
3624}
3625
/*
 * Handle a write to /sys/bus/rbd/add: parse the mapping request,
 * connect to the cluster, create the rbd_device and probe the image.
 * Ownership of the parsed structures moves step by step into rbd_dev;
 * each transfer NULLs the local pointer so the error paths below only
 * free what this function still owns.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	/* destroys the embedded client ref and spec as well */
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3691
Alex Elderde71a292012-07-03 16:01:19 -05003692static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003693{
3694 struct list_head *tmp;
3695 struct rbd_device *rbd_dev;
3696
Alex Eldere124a822012-01-29 13:57:44 -06003697 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003698 list_for_each(tmp, &rbd_dev_list) {
3699 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003700 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06003701 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003702 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06003703 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003704 }
Alex Eldere124a822012-01-29 13:57:44 -06003705 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003706 return NULL;
3707}
3708
/*
 * Device-model release callback, invoked once the last reference to
 * the rbd device is dropped (after rbd_bus_del_dev()).  Tears down
 * watch state, the block device, header data, and the device itself,
 * then releases the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one was set up */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3738
/*
 * Handle a write to /sys/bus/rbd/remove.  The buffer holds the decimal
 * id of the device to remove.  Fails with -EBUSY if the device is
 * still open, -ENOENT if no device has that id.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;	/* success returns the bytes consumed */

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* ctl_mutex keeps the device from disappearing under us */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Refuse to remove a mapped device that is still open */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);	/* rest of teardown via rbd_dev_release() */

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3778
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003779/*
3780 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003781 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003782 */
3783static int rbd_sysfs_init(void)
3784{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003785 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003786
Alex Elderfed4c142012-02-07 12:03:36 -06003787 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003788 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003789 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003790
Alex Elderfed4c142012-02-07 12:03:36 -06003791 ret = bus_register(&rbd_bus_type);
3792 if (ret < 0)
3793 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003794
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003795 return ret;
3796}
3797
static void rbd_sysfs_cleanup(void)
{
	/* Tear down in the reverse order of rbd_sysfs_init() */
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3803
3804int __init rbd_init(void)
3805{
3806 int rc;
3807
3808 rc = rbd_sysfs_init();
3809 if (rc)
3810 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003811 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003812 return 0;
3813}
3814
/* Module exit point: remove the /sys/bus/rbd interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3819
3820module_init(rbd_init);
3821module_exit(rbd_exit);
3822
3823MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3824MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3825MODULE_DESCRIPTION("rados block device");
3826
3827/* following authorship retained from original osdblk.c */
3828MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3829
3830MODULE_LICENSE("GPL");