blob: 3b284d53a56644ab55876b0b2e60046ac2bad650 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
Alex Elder589d30e2012-07-10 20:30:11 -050069#define RBD_IMAGE_ID_LEN_MAX 64
70
Alex Elder81a89792012-02-02 08:13:30 -060071/*
72 * An RBD device name will be "rbd#", where the "rbd" comes from
73 * RBD_DRV_NAME above, and # is a unique integer identifier.
74 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
75 * enough to hold all possible device names.
76 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070077#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060078#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070079
Alex Eldercc0538b2012-08-10 13:12:07 -070080#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070081
Yehuda Sadeh602adf42010-08-12 16:11:25 -070082/*
83 * block device image metadata (in-memory version)
84 */
85struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -050086 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -050087 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -050088 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070089 __u8 obj_order;
90 __u8 crypt_type;
91 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Elderf84344f2012-08-31 17:29:51 -050093 /* The remaining fields need to be updated occasionally */
94 u64 image_size;
95 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070096 char *snap_names;
97 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070098
99 u64 obj_version;
100};
101
102struct rbd_options {
Alex Eldercc0538b2012-08-10 13:12:07 -0700103 bool read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700104};
105
106/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600107 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700108 */
109struct rbd_client {
110 struct ceph_client *client;
111 struct kref kref;
112 struct list_head node;
113};
114
115/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600116 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700117 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700118struct rbd_req_status {
119 int done;
120 int rc;
121 u64 bytes;
122};
123
124/*
125 * a collection of requests
126 */
127struct rbd_req_coll {
128 int total;
129 int num_done;
130 struct kref kref;
131 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700132};
133
Alex Elderf0f8cef2012-01-29 13:57:44 -0600134/*
135 * a single io request
136 */
137struct rbd_request {
138 struct request *rq; /* blk layer request */
139 struct bio *bio; /* cloned bio */
140 struct page **pages; /* list of used pages */
141 u64 len;
142 int coll_index;
143 struct rbd_req_coll *coll;
144};
145
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800146struct rbd_snap {
147 struct device dev;
148 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800149 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800150 struct list_head node;
151 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500152 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800153};
154
Alex Elderf84344f2012-08-31 17:29:51 -0500155struct rbd_mapping {
156 char *snap_name;
157 u64 snap_id;
Alex Elder99c1f082012-08-30 14:42:15 -0500158 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500159 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500160 bool snap_exists;
161 bool read_only;
162};
163
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700164/*
165 * a single device
166 */
167struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500168 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700169
170 int major; /* blkdev assigned major */
171 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700172
Alex Eldera30b71b2012-07-10 20:30:11 -0500173 u32 image_format; /* Either 1 or 2 */
Alex Elderf8c38922012-08-10 13:12:07 -0700174 struct rbd_options rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700175 struct rbd_client *rbd_client;
176
177 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
178
179 spinlock_t lock; /* queue lock */
180
181 struct rbd_image_header header;
Alex Elder589d30e2012-07-10 20:30:11 -0500182 char *image_id;
183 size_t image_id_len;
Alex Elder0bed54d2012-07-03 16:01:18 -0500184 char *image_name;
185 size_t image_name_len;
186 char *header_name;
Alex Elderd22f76e2012-07-12 10:46:35 -0500187 char *pool_name;
Alex Elder9bb2f332012-07-12 10:46:35 -0500188 int pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700189
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700190 struct ceph_osd_event *watch_event;
191 struct ceph_osd_request *watch_request;
192
Josh Durginc6666012011-11-21 17:11:12 -0800193 /* protects updating the header */
194 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500195
196 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700197
198 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800199
200 /* list of snapshots */
201 struct list_head snaps;
202
203 /* sysfs related */
204 struct device dev;
205};
206
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700207static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600208
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700209static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600210static DEFINE_SPINLOCK(rbd_dev_list_lock);
211
Alex Elder432b8582012-01-29 13:57:44 -0600212static LIST_HEAD(rbd_client_list); /* clients */
213static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700214
Alex Elder304f6802012-08-31 17:29:52 -0500215static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
216static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
217
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800218static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500219static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800220
Alex Elderf0f8cef2012-01-29 13:57:44 -0600221static ssize_t rbd_add(struct bus_type *bus, const char *buf,
222 size_t count);
223static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
224 size_t count);
225
226static struct bus_attribute rbd_bus_attrs[] = {
227 __ATTR(add, S_IWUSR, NULL, rbd_add),
228 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
229 __ATTR_NULL
230};
231
232static struct bus_type rbd_bus_type = {
233 .name = "rbd",
234 .bus_attrs = rbd_bus_attrs,
235};
236
/* Release hook for the root device; nothing to free (static object) */
static void rbd_root_dev_release(struct device *dev)
{
}
240
241static struct device rbd_root_dev = {
242 .init_name = "rbd",
243 .release = rbd_root_dev_release,
244};
245
#ifdef RBD_DEBUG
/*
 * Debug-only assertion: log the failed expression and BUG().
 * Wrapped in do { } while (0) so the macro behaves as a single
 * statement -- the previous bare "if" form could capture a
 * following "else" (dangling-else) and broke when used as the
 * body of an unbraced if/else.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
# define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800258
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800259static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
260{
261 return get_device(&rbd_dev->dev);
262}
263
264static void rbd_put_dev(struct rbd_device *rbd_dev)
265{
266 put_device(&rbd_dev->dev);
267}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700268
Alex Elder1fe5e992012-07-25 09:32:41 -0500269static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700270
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700271static int rbd_open(struct block_device *bdev, fmode_t mode)
272{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600273 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700274
Alex Elderf84344f2012-08-31 17:29:51 -0500275 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276 return -EROFS;
277
Alex Elder340c7a22012-08-10 13:12:07 -0700278 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500279 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700280
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281 return 0;
282}
283
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800284static int rbd_release(struct gendisk *disk, fmode_t mode)
285{
286 struct rbd_device *rbd_dev = disk->private_data;
287
288 rbd_put_dev(rbd_dev);
289
290 return 0;
291}
292
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700293static const struct block_device_operations rbd_bd_ops = {
294 .owner = THIS_MODULE,
295 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800296 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297};
298
299/*
300 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500301 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302 */
Alex Elderf8c38922012-08-10 13:12:07 -0700303static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700304{
305 struct rbd_client *rbdc;
306 int ret = -ENOMEM;
307
308 dout("rbd_client_create\n");
309 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
310 if (!rbdc)
311 goto out_opt;
312
313 kref_init(&rbdc->kref);
314 INIT_LIST_HEAD(&rbdc->node);
315
Alex Elderbc534d862012-01-29 13:57:44 -0600316 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
317
Alex Elder43ae4702012-07-03 16:01:18 -0500318 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700319 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600320 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500321 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322
323 ret = ceph_open_session(rbdc->client);
324 if (ret < 0)
325 goto out_err;
326
Alex Elder432b8582012-01-29 13:57:44 -0600327 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700328 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600329 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330
Alex Elderbc534d862012-01-29 13:57:44 -0600331 mutex_unlock(&ctl_mutex);
332
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 dout("rbd_client_create created %p\n", rbdc);
334 return rbdc;
335
336out_err:
337 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600338out_mutex:
339 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700340 kfree(rbdc);
341out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500342 if (ceph_opts)
343 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400344 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700345}
346
347/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700348 * Find a ceph client with specific addr and configuration. If
349 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700350 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700351static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700352{
353 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700354 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700355
Alex Elder43ae4702012-07-03 16:01:18 -0500356 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700357 return NULL;
358
Alex Elder1f7ba332012-08-10 13:12:07 -0700359 spin_lock(&rbd_client_list_lock);
360 list_for_each_entry(client_node, &rbd_client_list, node) {
361 if (!ceph_compare_options(ceph_opts, client_node->client)) {
362 kref_get(&client_node->kref);
363 found = true;
364 break;
365 }
366 }
367 spin_unlock(&rbd_client_list_lock);
368
369 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700370}
371
372/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700373 * mount options
374 */
375enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376 Opt_last_int,
377 /* int args above */
378 Opt_last_string,
379 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700380 Opt_read_only,
381 Opt_read_write,
382 /* Boolean args above */
383 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700384};
385
Alex Elder43ae4702012-07-03 16:01:18 -0500386static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700387 /* int args above */
388 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500389 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700390 {Opt_read_only, "ro"}, /* Alternate spelling */
391 {Opt_read_write, "read_write"},
392 {Opt_read_write, "rw"}, /* Alternate spelling */
393 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700394 {-1, NULL}
395};
396
397static int parse_rbd_opts_token(char *c, void *private)
398{
Alex Elder43ae4702012-07-03 16:01:18 -0500399 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700400 substring_t argstr[MAX_OPT_ARGS];
401 int token, intval, ret;
402
Alex Elder43ae4702012-07-03 16:01:18 -0500403 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700404 if (token < 0)
405 return -EINVAL;
406
407 if (token < Opt_last_int) {
408 ret = match_int(&argstr[0], &intval);
409 if (ret < 0) {
410 pr_err("bad mount option arg (not int) "
411 "at '%s'\n", c);
412 return ret;
413 }
414 dout("got int token %d val %d\n", token, intval);
415 } else if (token > Opt_last_int && token < Opt_last_string) {
416 dout("got string token %d val %s\n", token,
417 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700418 } else if (token > Opt_last_string && token < Opt_last_bool) {
419 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700420 } else {
421 dout("got token %d\n", token);
422 }
423
424 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700425 case Opt_read_only:
426 rbd_opts->read_only = true;
427 break;
428 case Opt_read_write:
429 rbd_opts->read_only = false;
430 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700431 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500432 rbd_assert(false);
433 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700434 }
435 return 0;
436}
437
438/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439 * Get a ceph client with specific addr and configuration, if one does
440 * not exist create it.
441 */
Alex Elderf8c38922012-08-10 13:12:07 -0700442static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
443 size_t mon_addr_len, char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700444{
Alex Elderf8c38922012-08-10 13:12:07 -0700445 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
Alex Elder43ae4702012-07-03 16:01:18 -0500446 struct ceph_options *ceph_opts;
Alex Elderf8c38922012-08-10 13:12:07 -0700447 struct rbd_client *rbdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700448
Alex Eldercc0538b2012-08-10 13:12:07 -0700449 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700450
Alex Elder43ae4702012-07-03 16:01:18 -0500451 ceph_opts = ceph_parse_options(options, mon_addr,
452 mon_addr + mon_addr_len,
453 parse_rbd_opts_token, rbd_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700454 if (IS_ERR(ceph_opts))
455 return PTR_ERR(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700456
Alex Elder1f7ba332012-08-10 13:12:07 -0700457 rbdc = rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700458 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600459 /* using an existing client */
Alex Elder43ae4702012-07-03 16:01:18 -0500460 ceph_destroy_options(ceph_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700461 } else {
462 rbdc = rbd_client_create(ceph_opts);
463 if (IS_ERR(rbdc))
464 return PTR_ERR(rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700465 }
Alex Elderf8c38922012-08-10 13:12:07 -0700466 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700467
Alex Elderf8c38922012-08-10 13:12:07 -0700468 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469}
470
471/*
472 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600473 *
Alex Elder432b8582012-01-29 13:57:44 -0600474 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700475 */
476static void rbd_client_release(struct kref *kref)
477{
478 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
479
480 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500481 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700482 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500483 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700484
485 ceph_destroy_client(rbdc->client);
486 kfree(rbdc);
487}
488
489/*
490 * Drop reference to ceph client node. If it's not referenced anymore, release
491 * it.
492 */
493static void rbd_put_client(struct rbd_device *rbd_dev)
494{
495 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
496 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700497}
498
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700499/*
500 * Destroy requests collection
501 */
502static void rbd_coll_release(struct kref *kref)
503{
504 struct rbd_req_coll *coll =
505 container_of(kref, struct rbd_req_coll, kref);
506
507 dout("rbd_coll_release %p\n", coll);
508 kfree(coll);
509}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700510
Alex Eldera30b71b2012-07-10 20:30:11 -0500511static bool rbd_image_format_valid(u32 image_format)
512{
513 return image_format == 1 || image_format == 2;
514}
515
Alex Elder8e94af82012-07-25 09:32:40 -0500516static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
517{
Alex Elder103a1502012-08-02 11:29:45 -0500518 size_t size;
519 u32 snap_count;
520
521 /* The header has to start with the magic rbd header text */
522 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
523 return false;
524
525 /*
526 * The size of a snapshot header has to fit in a size_t, and
527 * that limits the number of snapshots.
528 */
529 snap_count = le32_to_cpu(ondisk->snap_count);
530 size = SIZE_MAX - sizeof (struct ceph_snap_context);
531 if (snap_count > size / sizeof (__le64))
532 return false;
533
534 /*
535 * Not only that, but the size of the entire the snapshot
536 * header must also be representable in a size_t.
537 */
538 size -= snap_count * sizeof (__le64);
539 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
540 return false;
541
542 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500543}
544
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545/*
546 * Create a new header structure, translate header format from the on-disk
547 * header.
548 */
549static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500550 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700551{
Alex Elderccece232012-07-10 20:30:10 -0500552 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500553 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500554 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500555 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700556
Alex Elder6a523252012-07-19 17:12:59 -0500557 memset(header, 0, sizeof (*header));
558
Alex Elder103a1502012-08-02 11:29:45 -0500559 snap_count = le32_to_cpu(ondisk->snap_count);
560
Alex Elder58c17b02012-08-23 23:22:06 -0500561 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
562 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500563 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700564 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500565 memcpy(header->object_prefix, ondisk->object_prefix, len);
566 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600567
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700568 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500569 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
570
Alex Elder621901d2012-08-23 23:22:06 -0500571 /* Save a copy of the snapshot names */
572
Alex Elderf785cc12012-08-23 23:22:06 -0500573 if (snap_names_len > (u64) SIZE_MAX)
574 return -EIO;
575 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700576 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500577 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500578 /*
579 * Note that rbd_dev_v1_header_read() guarantees
580 * the ondisk buffer we're working with has
581 * snap_names_len bytes beyond the end of the
582 * snapshot id array, this memcpy() is safe.
583 */
584 memcpy(header->snap_names, &ondisk->snaps[snap_count],
585 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500586
Alex Elder621901d2012-08-23 23:22:06 -0500587 /* Record each snapshot's size */
588
Alex Elderd2bb24e2012-07-26 23:37:14 -0500589 size = snap_count * sizeof (*header->snap_sizes);
590 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700591 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500592 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500593 for (i = 0; i < snap_count; i++)
594 header->snap_sizes[i] =
595 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700596 } else {
Alex Elderccece232012-07-10 20:30:10 -0500597 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700598 header->snap_names = NULL;
599 header->snap_sizes = NULL;
600 }
Alex Elder849b4262012-07-09 21:04:24 -0500601
Alex Elder34b13182012-07-13 20:35:12 -0500602 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603 header->obj_order = ondisk->options.order;
604 header->crypt_type = ondisk->options.crypt_type;
605 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500606
Alex Elder621901d2012-08-23 23:22:06 -0500607 /* Allocate and fill in the snapshot context */
608
Alex Elderf84344f2012-08-31 17:29:51 -0500609 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500610 size = sizeof (struct ceph_snap_context);
611 size += snap_count * sizeof (header->snapc->snaps[0]);
612 header->snapc = kzalloc(size, GFP_KERNEL);
613 if (!header->snapc)
614 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700615
616 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500617 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700618 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500619 for (i = 0; i < snap_count; i++)
620 header->snapc->snaps[i] =
621 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622
623 return 0;
624
Alex Elder6a523252012-07-19 17:12:59 -0500625out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500626 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500627 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700628 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500629 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500630 kfree(header->object_prefix);
631 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500632
Alex Elder00f1f362012-02-07 12:03:36 -0600633 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700634}
635
Alex Elder8836b992012-08-30 14:42:15 -0500636static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700637{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700638
Alex Eldere86924a2012-07-10 20:30:11 -0500639 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600640
Alex Eldere86924a2012-07-10 20:30:11 -0500641 list_for_each_entry(snap, &rbd_dev->snaps, node) {
642 if (!strcmp(snap_name, snap->name)) {
643 rbd_dev->mapping.snap_id = snap->id;
644 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500645 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600646
Alex Eldere86924a2012-07-10 20:30:11 -0500647 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600648 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700649 }
Alex Eldere86924a2012-07-10 20:30:11 -0500650
Alex Elder00f1f362012-02-07 12:03:36 -0600651 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652}
653
Alex Elder5ed16172012-08-29 17:11:07 -0500654static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700655{
Alex Elder78dc4472012-07-19 08:49:18 -0500656 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657
Alex Elder4e1105a2012-08-31 17:29:52 -0500658 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800659 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elderf84344f2012-08-31 17:29:51 -0500660 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500661 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500662 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Elderf84344f2012-08-31 17:29:51 -0500663 rbd_dev->mapping.snap_exists = false;
664 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
Alex Eldere86924a2012-07-10 20:30:11 -0500665 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500667 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668 if (ret < 0)
669 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500670 rbd_dev->mapping.snap_exists = true;
671 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700672 }
Alex Elder4e1105a2012-08-31 17:29:52 -0500673 rbd_dev->mapping.snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700674done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700675 return ret;
676}
677
678static void rbd_header_free(struct rbd_image_header *header)
679{
Alex Elder849b4262012-07-09 21:04:24 -0500680 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500681 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700682 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500683 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500684 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500685 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800686 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500687 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700688}
689
Alex Elder65ccfe22012-08-09 10:33:26 -0700690static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700691{
Alex Elder65ccfe22012-08-09 10:33:26 -0700692 char *name;
693 u64 segment;
694 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700695
Alex Elder65ccfe22012-08-09 10:33:26 -0700696 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
697 if (!name)
698 return NULL;
699 segment = offset >> rbd_dev->header.obj_order;
700 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
701 rbd_dev->header.object_prefix, segment);
702 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
703 pr_err("error formatting segment name for #%llu (%d)\n",
704 segment, ret);
705 kfree(name);
706 name = NULL;
707 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708
Alex Elder65ccfe22012-08-09 10:33:26 -0700709 return name;
710}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700711
Alex Elder65ccfe22012-08-09 10:33:26 -0700712static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
713{
714 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700715
Alex Elder65ccfe22012-08-09 10:33:26 -0700716 return offset & (segment_size - 1);
717}
718
719static u64 rbd_segment_length(struct rbd_device *rbd_dev,
720 u64 offset, u64 length)
721{
722 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
723
724 offset &= segment_size - 1;
725
Alex Elderaafb2302012-09-06 16:00:54 -0500726 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700727 if (offset + length > segment_size)
728 length = segment_size - offset;
729
730 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700731}
732
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700733static int rbd_get_num_segments(struct rbd_image_header *header,
734 u64 ofs, u64 len)
735{
Alex Elderdf111be2012-08-09 10:33:26 -0700736 u64 start_seg;
737 u64 end_seg;
738
739 if (!len)
740 return 0;
741 if (len - 1 > U64_MAX - ofs)
742 return -ERANGE;
743
744 start_seg = ofs >> header->obj_order;
745 end_seg = (ofs + len - 1) >> header->obj_order;
746
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700747 return end_seg - start_seg + 1;
748}
749
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700750/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700751 * returns the size of an object in the image
752 */
753static u64 rbd_obj_bytes(struct rbd_image_header *header)
754{
755 return 1 << header->obj_order;
756}
757
758/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700759 * bio helpers
760 */
761
762static void bio_chain_put(struct bio *chain)
763{
764 struct bio *tmp;
765
766 while (chain) {
767 tmp = chain;
768 chain = chain->bi_next;
769 bio_put(tmp);
770 }
771}
772
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every bio_vec in every bio of the chain, tracking the running
 * byte position @pos; any bytes at or beyond @start_ofs are cleared.
 * Used to zero the unread tail of a short or failed read.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* first affected vec may be zeroed only
				 * from start_ofs onward; later vecs from
				 * their beginning (remainder == 0) */
				int remainder = max(start_ofs - pos, 0);
				/* kmap with IRQs off: pages may be highmem */
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
799
800/*
801 * bio_chain_clone - clone a chain of bios up to a certain length.
802 * might return a bio_pair that will need to be released.
803 */
804static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
805 struct bio_pair **bp,
806 int len, gfp_t gfpmask)
807{
Alex Elder542582f2012-08-09 10:33:25 -0700808 struct bio *old_chain = *old;
809 struct bio *new_chain = NULL;
810 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700811 int total = 0;
812
813 if (*bp) {
814 bio_pair_release(*bp);
815 *bp = NULL;
816 }
817
818 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700819 struct bio *tmp;
820
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700821 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
822 if (!tmp)
823 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700824 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700825
826 if (total + old_chain->bi_size > len) {
827 struct bio_pair *bp;
828
829 /*
830 * this split can only happen with a single paged bio,
831 * split_bio will BUG_ON if this is not the case
832 */
833 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500834 "bi_size=%u\n",
835 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700836
837 /* split the bio. We'll release it either in the next
838 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600839 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700840 if (!bp)
841 goto err_out;
842
843 __bio_clone(tmp, &bp->bio1);
844
845 *next = &bp->bio2;
846 } else {
847 __bio_clone(tmp, old_chain);
848 *next = old_chain->bi_next;
849 }
850
851 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700852 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700853 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700855 else
856 new_chain = tmp;
857 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700858 old_chain = old_chain->bi_next;
859
860 total += tmp->bi_size;
861 }
862
Alex Elderaafb2302012-09-06 16:00:54 -0500863 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700864
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700865 *old = old_chain;
866
867 return new_chain;
868
869err_out:
870 dout("bio_chain_clone with err\n");
871 bio_chain_put(new_chain);
872 return NULL;
873}
874
875/*
876 * helpers for osd request op vectors.
877 */
Alex Elder57cfc102012-06-26 12:57:03 -0700878static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
879 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700880{
Alex Elder57cfc102012-06-26 12:57:03 -0700881 struct ceph_osd_req_op *ops;
882
883 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
884 if (!ops)
885 return NULL;
886
887 ops[0].op = opcode;
888
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889 /*
890 * op extent offset and length will be set later on
891 * in calc_raw_layout()
892 */
Alex Elder57cfc102012-06-26 12:57:03 -0700893 ops[0].payload_len = payload_len;
894
895 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700896}
897
/* Free an op vector allocated by rbd_create_rw_ops(); NULL is a no-op. */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
902
/*
 * Record completion of one segment request (slot @index) of a request
 * collection, then complete, in order, every contiguous run of finished
 * slots starting at the first not-yet-reported one.  Segment results
 * must be reported to the block layer in order even though the OSD
 * replies may arrive out of order.
 *
 * With no collection (@coll == NULL) the whole request is completed
 * directly.  The queue lock serializes access to the coll state and to
 * __blk_end_request().
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* find the run of completed slots starting at num_done */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* drop the per-segment reference taken at submit time */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
940
941static void rbd_coll_end_req(struct rbd_request *req,
942 int ret, u64 len)
943{
944 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
945}
946
/*
 * Send ceph osd request
 *
 * Builds and submits one OSD request for @object_name covering
 * [@ofs, @ofs + @len).  Data is carried either by @bio or by
 * @pages/@num_pages.  If @rbd_cb is NULL the call is synchronous: it
 * waits for the reply and drops the request reference itself.  With a
 * callback, completion (including freeing req_data and putting the
 * bio chain) happens in the callback.  If @linger_req is non-NULL the
 * request is registered to linger (used for watch) and returned
 * through it.  @ver, if non-NULL, receives the reassert version on
 * the synchronous path.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* report the failure into the collection slot, if any */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/* NOTE(review): strncpy does not guarantee NUL-termination if
	 * object_name fills r_oid exactly -- presumably callers keep
	 * names shorter than r_oid; verify against name generation */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per "stripe": unit == object size, count == 1 */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and release the request */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1058
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous segment requests submitted by
 * rbd_do_op().  Decodes the reply, normalizes read results (a missing
 * object or a short read is reported as a full-length read of zeros),
 * reports the result into the request collection, and releases the
 * bio chain, the osd request and the per-request bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: reads return all zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the tail and report full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1098
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget requests such as notify acks.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1103
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector large enough for @inbound_size bytes at
 * @ofs, submits the request via rbd_do_request() with no callback
 * (i.e. waits for completion), and, for reads with a non-NULL
 * @inbound buffer, copies the returned data out of the pages.
 * Returns the byte count / status from the wait, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* on a successful read, ret is the number of bytes returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1147
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image byte range [@ofs, @ofs + @len) onto its containing
 * segment object, builds a single read/write op, and submits it with
 * rbd_req_cb as the completion callback.  The caller must have sized
 * the range so it does not cross a segment boundary (asserted below).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1200
1201/*
1202 * Request async osd write
1203 */
1204static int rbd_req_write(struct request *rq,
1205 struct rbd_device *rbd_dev,
1206 struct ceph_snap_context *snapc,
1207 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001208 struct bio *bio,
1209 struct rbd_req_coll *coll,
1210 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001211{
1212 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1213 CEPH_OSD_OP_WRITE,
1214 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001215 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001216}
1217
1218/*
1219 * Request async osd read
1220 */
1221static int rbd_req_read(struct request *rq,
1222 struct rbd_device *rbd_dev,
1223 u64 snapid,
1224 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001225 struct bio *bio,
1226 struct rbd_req_coll *coll,
1227 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001228{
1229 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001230 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231 CEPH_OSD_OP_READ,
1232 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001233 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234}
1235
1236/*
1237 * Request sync osd read
1238 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001239static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001240 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001241 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001242 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001243 char *buf,
1244 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001245{
Alex Elder913d2fd2012-06-26 12:57:03 -07001246 struct ceph_osd_req_op *ops;
1247 int ret;
1248
1249 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1250 if (!ops)
1251 return -ENOMEM;
1252
1253 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001254 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001255 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001256 ops, object_name, ofs, len, buf, NULL, ver);
1257 rbd_destroy_ops(ops);
1258
1259 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001260}
1261
/*
 * Acknowledge a watch notification on the header object.  Despite the
 * surrounding naming, this is submitted asynchronously: completion is
 * rbd_simple_req_cb(), which only drops the request reference.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	/* NOTE(review): ver is byte-swapped here but notify_id is
	 * stored raw, unlike the watch cookie in rbd_req_sync_watch();
	 * presumably notify_id arrives pre-encoded -- verify against
	 * the osd_client notify path */
	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			     rbd_dev->header_name, 0, 0, NULL,
			     NULL, 0,
			     CEPH_OSD_FLAG_READ,
			     ops,
			     NULL, 0,
			     rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1291
1292static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1293{
Alex Elder0ce1a792012-07-03 16:01:18 -05001294 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001295 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001296 int rc;
1297
Alex Elder0ce1a792012-07-03 16:01:18 -05001298 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001299 return;
1300
Alex Elderbd919d42012-07-13 20:35:11 -05001301 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1302 rbd_dev->header_name, (unsigned long long) notify_id,
1303 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001304 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001305 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001306 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001307 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001308
Alex Elder7f0a24d2012-07-25 09:32:40 -05001309 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001310}
1311
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object: creates an osd event whose
 * callback is rbd_watch_cb(), then issues a lingering WATCH request
 * (kept in rbd_dev->watch_request so it can be torn down later by
 * rbd_req_sync_unwatch()).  On failure the event is cancelled.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* flag == 1: register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1355
/*
 * Request sync osd unwatch
 *
 * Tears down the watch registered by rbd_req_sync_watch(): sends a
 * WATCH op with flag == 0 (unregister) using the same cookie, then
 * cancels the osd event regardless of the request's outcome.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* flag == 0: unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1385
/*
 * Synchronous osd object method call
 *
 * Invokes @class_name.@method_name on @object_name via a CALL op,
 * sending @outbound/@outbound_size as the method's input and, on a
 * read, copying up to @inbound_size result bytes into @inbound.
 * Returns the status from rbd_req_sync_op() (bytes returned on a
 * successful read) or a negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload. That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	/* class/method lengths are carried as single bytes on the wire */
	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      flags, ops,
			      object_name, 0, inbound_size, inbound,
			      NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1438
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001439static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1440{
1441 struct rbd_req_coll *coll =
1442 kzalloc(sizeof(struct rbd_req_coll) +
1443 sizeof(struct rbd_req_status) * num_reqs,
1444 GFP_ATOMIC);
1445
1446 if (!coll)
1447 return NULL;
1448 coll->total = num_reqs;
1449 kref_init(&coll->kref);
1450 return coll;
1451}
1452
/*
 * block device queue callback
 *
 * Drains the request queue: each filesystem request is validated,
 * split along segment boundaries, and submitted as one async OSD
 * request per segment, tracked by an rbd_req_coll so per-segment
 * completions are reported to the block layer in order.
 *
 * Entered with q->queue_lock held (standard request_fn contract, as
 * evidenced by the unlock/lock pairs around the blocking work below);
 * the lock is dropped while snapshot state is read and requests are
 * submitted, and re-taken before fetching the next request.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* snapshot state and snap context read under header lock */
		down_read(&rbd_dev->header_rwsem);

		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
		    !rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* num_segs is 0 (empty) or a negative errno */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* one collection ref per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the submitter's initial reference */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1572
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* sectors per rbd object */
	sector_t sector;		/* absolute starting sector of the bio */
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	/* Each object is (1 << obj_order) bytes; express that in sectors */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* Bytes left before this bio would cross an object boundary */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* An empty bio may always take one bvec; spanning is handled later */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1599
/*
 * Tear down the gendisk (and its request queue) for an rbd device.
 * Safe to call if the disk was never allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Remove the disk from view before destroying its queue */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1613
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;	/* snapshot count from the previous pass */
	u64 names_size = 0;	/* size of snapshot name block, previous pass */
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Free the previous (too small) attempt; kfree(NULL) is OK */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the header shrank mid-read; give up */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* Retry with the sizes the on-disk header now reports */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1685
/*
 * Re-read the on-disk (format v1) header and convert it into the
 * in-memory rbd_image_header representation.  On success the header's
 * object version is recorded in header->obj_version.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	/* The raw on-disk buffer is no longer needed either way */
	kfree(ondisk);

	return ret;
}
1706
/*
 * Remove every snapshot from the device's list, unregistering each
 * snapshot device.  Uses the _safe iterator because removal deletes
 * entries while walking the list.
 */
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		__rbd_remove_snap_dev(snap);
}
1715
/*
 * Re-read the image header and install the fresh copy in rbd_dev,
 * updating the device capacity and snapshot list to match.  Performed
 * under the header rwsem held for write.  If @hver is non-NULL it is
 * filled in with the new header object version.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? (only when the base image is mapped, not a snapshot) */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	/* Take ownership of the newly-read snapshot data */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile and (re)register snapshot devices for the new context */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1767
/*
 * Locked wrapper around __rbd_refresh_header(): serializes the refresh
 * against other control operations via ctl_mutex.
 */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1778
/*
 * Allocate and set up the gendisk and request queue for an rbd device
 * and record them in rbd_dev.  The disk is not added here (no
 * add_disk() call); capacity is set from the current mapping size.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios within a single object (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1827
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001828/*
1829 sysfs
1830*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001831
/* Map an embedded struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1836
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001837static ssize_t rbd_size_show(struct device *dev,
1838 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001839{
Alex Elder593a9e72012-02-07 12:03:37 -06001840 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001841 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001842
Josh Durgina51aa0c2011-12-05 10:35:04 -08001843 down_read(&rbd_dev->header_rwsem);
1844 size = get_capacity(rbd_dev->disk);
1845 up_read(&rbd_dev->header_rwsem);
1846
1847 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001848}
1849
Alex Elder34b13182012-07-13 20:35:12 -05001850/*
1851 * Note this shows the features for whatever's mapped, which is not
1852 * necessarily the base image.
1853 */
1854static ssize_t rbd_features_show(struct device *dev,
1855 struct device_attribute *attr, char *buf)
1856{
1857 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1858
1859 return sprintf(buf, "0x%016llx\n",
1860 (unsigned long long) rbd_dev->mapping.features);
1861}
1862
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001863static ssize_t rbd_major_show(struct device *dev,
1864 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001865{
Alex Elder593a9e72012-02-07 12:03:37 -06001866 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001867
1868 return sprintf(buf, "%d\n", rbd_dev->major);
1869}
1870
/* sysfs: report the ceph client id ("client<N>") used by this device. */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1879
/* sysfs: report the name of the pool holding this image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1887
/* sysfs: report the numeric id of the pool holding this image. */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->pool_id);
}
1895
/* sysfs: report the rbd image name. */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_name);
}
1903
/* sysfs: report the rbd image id. */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_id);
}
1911
/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
}
1924
1925static ssize_t rbd_image_refresh(struct device *dev,
1926 struct device_attribute *attr,
1927 const char *buf,
1928 size_t size)
1929{
Alex Elder593a9e72012-02-07 12:03:37 -06001930 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001931 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001932
Alex Elder1fe5e992012-07-25 09:32:41 -05001933 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001934
1935 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001936}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001937
/* Attributes exposed for each mapped rbd device (all read-only except
 * "refresh", which is a write-only trigger). */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* No extra cleanup needed; rbd_dev teardown is handled elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1981
1982
1983/*
1984 sysfs - snapshots
1985*/
1986
/* sysfs: report a snapshot's size in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
1995
/* sysfs: report a snapshot's id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2004
/* sysfs: report a snapshot's feature mask in hex. */
static ssize_t rbd_snap_features_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
2014
/* Read-only attributes exposed for each snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device release: frees the rbd_snap itself, so the snap must not be
 * touched once its last device reference is dropped. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2046
/*
 * A snapshot's device type is only assigned when the device is set up
 * for registration (see rbd_register_snap_dev()), so having the type
 * and being registered must always agree.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/* The two notions of "registered" must not disagree */
	rbd_assert(!ret ^ reg);

	return ret;
}
2056
/*
 * Unlink a snapshot from the device's list and unregister its device
 * if it was registered.  Note: unregistering may drop the last device
 * reference, whose release callback (rbd_snap_dev_release()) frees
 * the snap — callers must not use *snap afterwards.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2063
Alex Elder14e70852012-07-19 09:09:27 -05002064static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002065 struct device *parent)
2066{
2067 struct device *dev = &snap->dev;
2068 int ret;
2069
2070 dev->type = &rbd_snap_device_type;
2071 dev->parent = parent;
2072 dev->release = rbd_snap_dev_release;
2073 dev_set_name(dev, "snap_%s", snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002074 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2075
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002076 ret = device_register(dev);
2077
2078 return ret;
2079}
2080
/*
 * Allocate and initialize an rbd_snap for the given snapshot name,
 * id, size and features.  The snapshot device is NOT registered here
 * (that is done later via rbd_register_snap_dev()).  Returns the new
 * snapshot, or a pointer-coded -ENOMEM on allocation failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   const char *snap_name,
					   u64 snap_id, u64 snap_size,
					   u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	/* kfree(NULL) is a no-op if the name was never duplicated */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2110
Alex Eldercd892122012-07-03 16:01:19 -05002111static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2112 u64 *snap_size, u64 *snap_features)
2113{
2114 char *snap_name;
2115
2116 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2117
2118 *snap_size = rbd_dev->header.snap_sizes[which];
2119 *snap_features = 0; /* No features for v1 */
2120
2121 /* Skip over names until we find the one we are looking for */
2122
2123 snap_name = rbd_dev->header.snap_names;
2124 while (which--)
2125 snap_name += strlen(snap_name) + 1;
2126
2127 return snap_name;
2128}
2129
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002130/*
Alex Elder35938152012-08-02 11:29:46 -05002131 * Scan the rbd device's current snapshot list and compare it to the
2132 * newly-received snapshot context. Remove any existing snapshots
2133 * not present in the new snapshot context. Add a new snapshot for
2134 * any snaphots in the snapshot context not in the current list.
2135 * And verify there are no changes to snapshots we already know
2136 * about.
2137 *
2138 * Assumes the snapshots in the snapshot context are sorted by
2139 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2140 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002141 */
Alex Elder304f6802012-08-31 17:29:52 -05002142static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002143{
Alex Elder35938152012-08-02 11:29:46 -05002144 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2145 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002146 struct list_head *head = &rbd_dev->snaps;
2147 struct list_head *links = head->next;
2148 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002149
Alex Elder9fcbb802012-08-23 23:48:49 -05002150 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002151 while (index < snap_count || links != head) {
2152 u64 snap_id;
2153 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002154 char *snap_name;
2155 u64 snap_size = 0;
2156 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157
Alex Elder35938152012-08-02 11:29:46 -05002158 snap_id = index < snap_count ? snapc->snaps[index]
2159 : CEPH_NOSNAP;
2160 snap = links != head ? list_entry(links, struct rbd_snap, node)
2161 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002162 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002163
Alex Elder35938152012-08-02 11:29:46 -05002164 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2165 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002166
Alex Elder35938152012-08-02 11:29:46 -05002167 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002168
Alex Elderf84344f2012-08-31 17:29:51 -05002169 if (rbd_dev->mapping.snap_id == snap->id)
2170 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002171 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002172 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002173 rbd_dev->mapping.snap_id == snap->id ?
2174 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002175 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002176
Alex Elder35938152012-08-02 11:29:46 -05002177 /* Done with this list entry; advance */
2178
2179 links = next;
2180 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002181 }
Alex Elder35938152012-08-02 11:29:46 -05002182
Alex Eldercd892122012-07-03 16:01:19 -05002183 snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2184 &snap_size, &snap_features);
2185 if (IS_ERR(snap_name))
2186 return PTR_ERR(snap_name);
2187
Alex Elder9fcbb802012-08-23 23:48:49 -05002188 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2189 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002190 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2191 struct rbd_snap *new_snap;
2192
2193 /* We haven't seen this snapshot before */
2194
Alex Elderc8d18422012-07-10 20:30:11 -05002195 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002196 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002197 if (IS_ERR(new_snap)) {
2198 int err = PTR_ERR(new_snap);
2199
2200 dout(" failed to add dev, error %d\n", err);
2201
2202 return err;
2203 }
Alex Elder35938152012-08-02 11:29:46 -05002204
2205 /* New goes before existing, or at end of list */
2206
Alex Elder9fcbb802012-08-23 23:48:49 -05002207 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002208 if (snap)
2209 list_add_tail(&new_snap->node, &snap->node);
2210 else
Alex Elder523f3252012-08-30 00:16:37 -05002211 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002212 } else {
2213 /* Already have this one */
2214
Alex Elder9fcbb802012-08-23 23:48:49 -05002215 dout(" already present\n");
2216
Alex Eldercd892122012-07-03 16:01:19 -05002217 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002218 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002219 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002220
2221 /* Done with this list entry; advance */
2222
2223 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002224 }
Alex Elder35938152012-08-02 11:29:46 -05002225
2226 /* Advance to the next entry in the snapshot context */
2227
2228 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002229 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002230 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002231
2232 return 0;
2233}
2234
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* Parent device must be registered before its snapshot children */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			/* Stop at the first registration failure */
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
2259
/*
 * Register the rbd device on the rbd bus, named by its numeric dev_id.
 * Registration is serialized against other control-path operations
 * via ctl_mutex.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2279
/* Unregister the rbd device previously added by rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2284
/*
 * Set up the header-object watch for the device.  On -ERANGE the
 * header is refreshed and the watch request retried; any other result
 * (success or error) is returned to the caller.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2300
/* Highest device id handed out so far; ids are never reused upward */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return starts from 1, satisfying the minimum */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002317
Alex Elder1ddbe942012-01-29 13:57:44 -06002318/*
Alex Elder499afd52012-02-02 08:13:29 -06002319 * Remove an rbd_dev from the global list, and record that its
2320 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002321 */
Alex Eldere2839302012-08-29 17:11:06 -05002322static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002323{
Alex Elderd184f6b2012-01-29 13:57:44 -06002324 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002325 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002326 int max_id;
2327
Alex Elderaafb2302012-09-06 16:00:54 -05002328 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002329
Alex Eldere2839302012-08-29 17:11:06 -05002330 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2331 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002332 spin_lock(&rbd_dev_list_lock);
2333 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002334
2335 /*
2336 * If the id being "put" is not the current maximum, there
2337 * is nothing special we need to do.
2338 */
Alex Eldere2839302012-08-29 17:11:06 -05002339 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002340 spin_unlock(&rbd_dev_list_lock);
2341 return;
2342 }
2343
2344 /*
2345 * We need to update the current maximum id. Search the
2346 * list to find out what it is. We're more likely to find
2347 * the maximum at the end, so search the list backward.
2348 */
2349 max_id = 0;
2350 list_for_each_prev(tmp, &rbd_dev_list) {
2351 struct rbd_device *rbd_dev;
2352
2353 rbd_dev = list_entry(tmp, struct rbd_device, node);
2354 if (rbd_id > max_id)
2355 max_id = rbd_id;
2356 }
Alex Elder499afd52012-02-02 08:13:29 -06002357 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002358
Alex Elder1ddbe942012-01-29 13:57:44 -06002359 /*
Alex Eldere2839302012-08-29 17:11:06 -05002360 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002361 * which case it now accurately reflects the new maximum.
2362 * Be careful not to overwrite the maximum value in that
2363 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002364 */
Alex Eldere2839302012-08-29 17:11:06 -05002365 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2366 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002367}
2368
Alex Eldera725f65e2012-02-02 08:13:30 -06002369/*
Alex Eldere28fff262012-02-02 08:13:30 -06002370 * Skips over white space at *buf, and updates *buf to point to the
2371 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002372 * the token (string of non-white space characters) found. Note
2373 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002374 */
2375static inline size_t next_token(const char **buf)
2376{
2377 /*
2378 * These are the characters that produce nonzero for
2379 * isspace() in the "C" and "POSIX" locales.
2380 */
2381 const char *spaces = " \f\n\r\t\v";
2382
2383 *buf += strspn(*buf, spaces); /* Find start of token */
2384
2385 return strcspn(*buf, spaces); /* Return token length */
2386}
2387
2388/*
2389 * Finds the next token in *buf, and if the provided token buffer is
2390 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002391 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2392 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002393 *
2394 * Returns the length of the token found (not including the '\0').
2395 * Return value will be 0 if no token is found, and it will be >=
2396 * token_size if the token would not fit.
2397 *
Alex Elder593a9e72012-02-07 12:03:37 -06002398 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002399 * found token. Note that this occurs even if the token buffer is
2400 * too small to hold it.
2401 */
2402static inline size_t copy_token(const char **buf,
2403 char *token,
2404 size_t token_size)
2405{
2406 size_t len;
2407
2408 len = next_token(buf);
2409 if (len < token_size) {
2410 memcpy(token, *buf, len);
2411 *(token + len) = '\0';
2412 }
2413 *buf += len;
2414
2415 return len;
2416}
2417
2418/*
Alex Elderea3352f2012-07-09 21:04:23 -05002419 * Finds the next token in *buf, dynamically allocates a buffer big
2420 * enough to hold a copy of it, and copies the token into the new
2421 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2422 * that a duplicate buffer is created even for a zero-length token.
2423 *
2424 * Returns a pointer to the newly-allocated duplicate, or a null
2425 * pointer if memory for the duplicate was not available. If
2426 * the lenp argument is a non-null pointer, the length of the token
2427 * (not including the '\0') is returned in *lenp.
2428 *
2429 * If successful, the *buf pointer will be updated to point beyond
2430 * the end of the found token.
2431 *
2432 * Note: uses GFP_KERNEL for allocation.
2433 */
2434static inline char *dup_token(const char **buf, size_t *lenp)
2435{
2436 char *dup;
2437 size_t len;
2438
2439 len = next_token(buf);
2440 dup = kmalloc(len + 1, GFP_KERNEL);
2441 if (!dup)
2442 return NULL;
2443
2444 memcpy(dup, *buf, len);
2445 *(dup + len) = '\0';
2446 *buf += len;
2447
2448 if (lenp)
2449 *lenp = len;
2450
2451 return dup;
2452}
2453
2454/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002455 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2456 * rbd_md_name, and name fields of the given rbd_dev, based on the
2457 * list of monitor addresses and other options provided via
2458 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2459 * copy of the snapshot name to map if successful, or a
2460 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002461 *
2462 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002463 */
Alex Elder3feeb8942012-08-31 17:29:52 -05002464static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2465 const char *buf,
2466 const char **mon_addrs,
2467 size_t *mon_addrs_size,
2468 char *options,
2469 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002470{
Alex Elderd22f76e2012-07-12 10:46:35 -05002471 size_t len;
Alex Elder3feeb8942012-08-31 17:29:52 -05002472 char *err_ptr = ERR_PTR(-EINVAL);
2473 char *snap_name;
Alex Eldere28fff262012-02-02 08:13:30 -06002474
2475 /* The first four tokens are required */
2476
Alex Elder7ef32142012-02-02 08:13:30 -06002477 len = next_token(&buf);
2478 if (!len)
Alex Elder3feeb8942012-08-31 17:29:52 -05002479 return err_ptr;
Alex Elder5214ecc2012-02-02 08:13:30 -06002480 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002481 *mon_addrs = buf;
2482
2483 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002484
Alex Eldere28fff262012-02-02 08:13:30 -06002485 len = copy_token(&buf, options, options_size);
2486 if (!len || len >= options_size)
Alex Elder3feeb8942012-08-31 17:29:52 -05002487 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002488
Alex Elder3feeb8942012-08-31 17:29:52 -05002489 err_ptr = ERR_PTR(-ENOMEM);
Alex Elderd22f76e2012-07-12 10:46:35 -05002490 rbd_dev->pool_name = dup_token(&buf, NULL);
2491 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002492 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002493
Alex Elder0bed54d2012-07-03 16:01:18 -05002494 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2495 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002496 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002497
Alex Elder3feeb8942012-08-31 17:29:52 -05002498 /* Snapshot name is optional */
2499 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05002500 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05002501 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2502 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elder849b4262012-07-09 21:04:24 -05002503 }
Alex Elder3feeb8942012-08-31 17:29:52 -05002504 snap_name = kmalloc(len + 1, GFP_KERNEL);
2505 if (!snap_name)
2506 goto out_err;
2507 memcpy(snap_name, buf, len);
2508 *(snap_name + len) = '\0';
Alex Eldere28fff262012-02-02 08:13:30 -06002509
Alex Elder3feeb8942012-08-31 17:29:52 -05002510dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2511
2512 return snap_name;
Alex Elderd22f76e2012-07-12 10:46:35 -05002513
2514out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002515 kfree(rbd_dev->image_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002516 rbd_dev->image_name = NULL;
2517 rbd_dev->image_name_len = 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002518 kfree(rbd_dev->pool_name);
2519 rbd_dev->pool_name = NULL;
2520
Alex Elder3feeb8942012-08-31 17:29:52 -05002521 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002522}
2523
Alex Elder589d30e2012-07-10 20:30:11 -05002524/*
2525 * An rbd format 2 image has a unique identifier, distinct from the
2526 * name given to it by the user. Internally, that identifier is
2527 * what's used to specify the names of objects related to the image.
2528 *
2529 * A special "rbd id" object is used to map an rbd image name to its
2530 * id. If that object doesn't exist, then there is no v2 rbd image
2531 * with the supplied name.
2532 *
2533 * This function will record the given rbd_dev's image_id field if
2534 * it can be determined, and in that case will return 0. If any
2535 * errors occur a negative errno will be returned and the rbd_dev's
2536 * image_id field will be unchanged (and should be NULL).
2537 */
2538static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2539{
2540 int ret;
2541 size_t size;
2542 char *object_name;
2543 void *response;
2544 void *p;
2545
2546 /*
2547 * First, see if the format 2 image id file exists, and if
2548 * so, get the image's persistent id from it.
2549 */
2550 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2551 object_name = kmalloc(size, GFP_NOIO);
2552 if (!object_name)
2553 return -ENOMEM;
2554 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2555 dout("rbd id object name is %s\n", object_name);
2556
2557 /* Response will be an encoded string, which includes a length */
2558
2559 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2560 response = kzalloc(size, GFP_NOIO);
2561 if (!response) {
2562 ret = -ENOMEM;
2563 goto out;
2564 }
2565
2566 ret = rbd_req_sync_exec(rbd_dev, object_name,
2567 "rbd", "get_id",
2568 NULL, 0,
2569 response, RBD_IMAGE_ID_LEN_MAX,
2570 CEPH_OSD_FLAG_READ, NULL);
2571 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2572 if (ret < 0)
2573 goto out;
2574
2575 p = response;
2576 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2577 p + RBD_IMAGE_ID_LEN_MAX,
2578 &rbd_dev->image_id_len,
2579 GFP_NOIO);
2580 if (IS_ERR(rbd_dev->image_id)) {
2581 ret = PTR_ERR(rbd_dev->image_id);
2582 rbd_dev->image_id = NULL;
2583 } else {
2584 dout("image_id is %s\n", rbd_dev->image_id);
2585 }
2586out:
2587 kfree(response);
2588 kfree(object_name);
2589
2590 return ret;
2591}
2592
Alex Eldera30b71b2012-07-10 20:30:11 -05002593static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2594{
2595 int ret;
2596 size_t size;
2597
2598 /* Version 1 images have no id; empty string is used */
2599
2600 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2601 if (!rbd_dev->image_id)
2602 return -ENOMEM;
2603 rbd_dev->image_id_len = 0;
2604
2605 /* Record the header object name for this rbd image. */
2606
2607 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2608 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2609 if (!rbd_dev->header_name) {
2610 ret = -ENOMEM;
2611 goto out_err;
2612 }
2613 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2614
2615 /* Populate rbd image metadata */
2616
2617 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2618 if (ret < 0)
2619 goto out_err;
2620 rbd_dev->image_format = 1;
2621
2622 dout("discovered version 1 image, header name is %s\n",
2623 rbd_dev->header_name);
2624
2625 return 0;
2626
2627out_err:
2628 kfree(rbd_dev->header_name);
2629 rbd_dev->header_name = NULL;
2630 kfree(rbd_dev->image_id);
2631 rbd_dev->image_id = NULL;
2632
2633 return ret;
2634}
2635
/*
 * Probe the device as a format 2 image: record the header object
 * name (derived from the image id) and mark the image format.
 * Always returns -ENOTSUPP (after -ENOMEM checks): format 2 images
 * are recognized here but not yet supported by this driver.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);
	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	/* Deliberate: v2 support is incomplete, so fail the probe */
	return -ENOTSUPP;
}
2657
2658/*
2659 * Probe for the existence of the header object for the given rbd
2660 * device. For format 2 images this includes determining the image
2661 * id.
2662 */
2663static int rbd_dev_probe(struct rbd_device *rbd_dev)
2664{
2665 int ret;
2666
2667 /*
2668 * Get the id from the image id object. If it's not a
2669 * format 2 image, we'll get ENOENT back, and we'll assume
2670 * it's a format 1 image.
2671 */
2672 ret = rbd_dev_image_id(rbd_dev);
2673 if (ret)
2674 ret = rbd_dev_v1_probe(rbd_dev);
2675 else
2676 ret = rbd_dev_v2_probe(rbd_dev);
2677 if (ret)
2678 dout("probe failed, returning %d\n", ret);
2679
2680 return ret;
2681}
2682
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002683static ssize_t rbd_add(struct bus_type *bus,
2684 const char *buf,
2685 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002686{
Alex Eldercb8627c2012-07-09 21:04:23 -05002687 char *options;
2688 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002689 const char *mon_addrs = NULL;
2690 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002691 struct ceph_osd_client *osdc;
2692 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002693 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002694
2695 if (!try_module_get(THIS_MODULE))
2696 return -ENODEV;
2697
Alex Elder27cc2592012-02-02 08:13:30 -06002698 options = kmalloc(count, GFP_KERNEL);
2699 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05002700 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002701 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2702 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05002703 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002704
2705 /* static rbd_device initialization */
2706 spin_lock_init(&rbd_dev->lock);
2707 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002708 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002709 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002710
Alex Eldera725f65e2012-02-02 08:13:30 -06002711 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002712 snap_name = rbd_add_parse_args(rbd_dev, buf,
2713 &mon_addrs, &mon_addrs_size, options, count);
2714 if (IS_ERR(snap_name)) {
2715 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05002716 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05002717 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002718
Alex Elderf8c38922012-08-10 13:12:07 -07002719 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2720 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002721 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002722
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002723 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002724 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002725 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2726 if (rc < 0)
2727 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002728 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002729
Alex Eldera30b71b2012-07-10 20:30:11 -05002730 rc = rbd_dev_probe(rbd_dev);
2731 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05002732 goto err_out_client;
Alex Eldera30b71b2012-07-10 20:30:11 -05002733 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder05fd6f62012-08-29 17:11:07 -05002734
2735 /* no need to lock here, as rbd_dev is not registered yet */
2736 rc = rbd_dev_snaps_update(rbd_dev);
2737 if (rc)
2738 goto err_out_header;
2739
2740 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2741 if (rc)
2742 goto err_out_header;
2743
Alex Elder85ae8922012-07-26 23:37:14 -05002744 /* generate unique id: find highest unique id, add one */
2745 rbd_dev_id_get(rbd_dev);
2746
2747 /* Fill in the device name, now that we have its id. */
2748 BUILD_BUG_ON(DEV_NAME_LEN
2749 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2750 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2751
2752 /* Get our block major device number. */
2753
Alex Elder27cc2592012-02-02 08:13:30 -06002754 rc = register_blkdev(0, rbd_dev->name);
2755 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002756 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06002757 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002758
Alex Elder0f308a32012-08-29 17:11:07 -05002759 /* Set up the blkdev mapping. */
2760
2761 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002762 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002763 goto err_out_blkdev;
2764
Alex Elder0f308a32012-08-29 17:11:07 -05002765 rc = rbd_bus_add_dev(rbd_dev);
2766 if (rc)
2767 goto err_out_disk;
2768
Alex Elder32eec682012-02-08 16:11:14 -06002769 /*
2770 * At this point cleanup in the event of an error is the job
2771 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06002772 */
Alex Elder2ac4e752012-07-10 20:30:10 -05002773
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002774 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05002775 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002776 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002777 if (rc)
2778 goto err_out_bus;
2779
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002780 rc = rbd_init_watch_dev(rbd_dev);
2781 if (rc)
2782 goto err_out_bus;
2783
Alex Elder3ee40012012-08-29 17:11:07 -05002784 /* Everything's ready. Announce the disk to the world. */
2785
2786 add_disk(rbd_dev->disk);
2787
2788 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2789 (unsigned long long) rbd_dev->mapping.size);
2790
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002791 return count;
2792
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002793err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002794 /* this will also clean up rest of rbd_dev stuff */
2795
2796 rbd_bus_del_dev(rbd_dev);
2797 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002798 return rc;
2799
Alex Elder0f308a32012-08-29 17:11:07 -05002800err_out_disk:
2801 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002802err_out_blkdev:
2803 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05002804err_out_id:
2805 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05002806err_out_header:
2807 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002808err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05002809 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002810 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05002811 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05002812err_out_args:
2813 kfree(rbd_dev->mapping.snap_name);
2814 kfree(rbd_dev->image_name);
2815 kfree(rbd_dev->pool_name);
2816err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06002817 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002818 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002819
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002820 dout("Error adding device %s\n", buf);
2821 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002822
2823 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002824}
2825
Alex Elderde71a292012-07-03 16:01:19 -05002826static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002827{
2828 struct list_head *tmp;
2829 struct rbd_device *rbd_dev;
2830
Alex Eldere124a822012-01-29 13:57:44 -06002831 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002832 list_for_each(tmp, &rbd_dev_list) {
2833 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002834 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002835 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002836 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002837 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002838 }
Alex Eldere124a822012-01-29 13:57:44 -06002839 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002840 return NULL;
2841}
2842
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002843static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002844{
Alex Elder593a9e72012-02-07 12:03:37 -06002845 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002846
Alex Elder1dbb4392012-01-24 10:08:37 -06002847 if (rbd_dev->watch_request) {
2848 struct ceph_client *client = rbd_dev->rbd_client->client;
2849
2850 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002851 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002852 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002853 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05002854 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002855
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002856 rbd_put_client(rbd_dev);
2857
2858 /* clean up and free blkdev */
2859 rbd_free_disk(rbd_dev);
2860 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002861
Alex Elder2ac4e752012-07-10 20:30:10 -05002862 /* release allocated disk header fields */
2863 rbd_header_free(&rbd_dev->header);
2864
Alex Elder32eec682012-02-08 16:11:14 -06002865 /* done with the id, and with the rbd_dev */
Alex Elderf84344f2012-08-31 17:29:51 -05002866 kfree(rbd_dev->mapping.snap_name);
Alex Elder589d30e2012-07-10 20:30:11 -05002867 kfree(rbd_dev->image_id);
Alex Elder0bed54d2012-07-03 16:01:18 -05002868 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002869 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002870 kfree(rbd_dev->image_name);
Alex Eldere2839302012-08-29 17:11:06 -05002871 rbd_dev_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002872 kfree(rbd_dev);
2873
2874 /* release module ref */
2875 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002876}
2877
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002878static ssize_t rbd_remove(struct bus_type *bus,
2879 const char *buf,
2880 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002881{
2882 struct rbd_device *rbd_dev = NULL;
2883 int target_id, rc;
2884 unsigned long ul;
2885 int ret = count;
2886
2887 rc = strict_strtoul(buf, 10, &ul);
2888 if (rc)
2889 return rc;
2890
2891 /* convert to int; abort if we lost anything in the conversion */
2892 target_id = (int) ul;
2893 if (target_id != ul)
2894 return -EINVAL;
2895
2896 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2897
2898 rbd_dev = __rbd_get_dev(target_id);
2899 if (!rbd_dev) {
2900 ret = -ENOENT;
2901 goto done;
2902 }
2903
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002904 __rbd_remove_all_snaps(rbd_dev);
2905 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002906
2907done:
2908 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05002909
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002910 return ret;
2911}
2912
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002913/*
2914 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002915 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002916 */
2917static int rbd_sysfs_init(void)
2918{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002919 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002920
Alex Elderfed4c142012-02-07 12:03:36 -06002921 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002922 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002923 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002924
Alex Elderfed4c142012-02-07 12:03:36 -06002925 ret = bus_register(&rbd_bus_type);
2926 if (ret < 0)
2927 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002928
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002929 return ret;
2930}
2931
/* Remove the sysfs control files, in reverse order of their creation */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2937
2938int __init rbd_init(void)
2939{
2940 int rc;
2941
2942 rc = rbd_sysfs_init();
2943 if (rc)
2944 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002945 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002946 return 0;
2947}
2948
/* Module exit: remove the sysfs control files set up by rbd_init() */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2953
2954module_init(rbd_init);
2955module_exit(rbd_exit);
2956
2957MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2958MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2959MODULE_DESCRIPTION("rados block device");
2960
2961/* following authorship retained from original osdblk.c */
2962MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2963
2964MODULE_LICENSE("GPL");